datachain 0.8.1__tar.gz → 0.8.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.8.1/src/datachain.egg-info → datachain-0.8.3}/PKG-INFO +84 -2
- {datachain-0.8.1 → datachain-0.8.3}/README.rst +82 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/quick-start.md +6 -6
- {datachain-0.8.1 → datachain-0.8.3}/pyproject.toml +1 -1
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/cache.py +4 -2
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/catalog/catalog.py +100 -54
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/catalog/datasource.py +4 -6
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/azure.py +21 -1
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/fsspec.py +35 -9
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/gcs.py +16 -7
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/local.py +4 -4
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/s3.py +10 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/dataset.py +1 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/dc.py +15 -3
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/listing.py +18 -3
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/listing.py +1 -5
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/node.py +27 -1
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/session.py +1 -1
- {datachain-0.8.1 → datachain-0.8.3/src/datachain.egg-info}/PKG-INFO +84 -2
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain.egg-info/requires.txt +1 -1
- datachain-0.8.3/tests/func/fake-service-account-credentials.json +9 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_catalog.py +150 -12
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_datachain.py +6 -2
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_pull.py +1 -0
- datachain-0.8.3/tests/unit/test_client_gcs.py +14 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_client_s3.py +6 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_listing.py +1 -1
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_session.py +5 -0
- datachain-0.8.1/tests/unit/test_client_gcs.py +0 -17
- {datachain-0.8.1 → datachain-0.8.3}/.cruft.json +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.gitattributes +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.github/codecov.yaml +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.github/dependabot.yml +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.github/workflows/release.yml +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.github/workflows/tests.yml +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.gitignore +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/.pre-commit-config.yaml +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/LICENSE +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/assets/datachain.svg +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/contributing.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/examples.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/index.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/overrides/main.html +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/references/datachain.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/references/datatype.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/references/file.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/references/index.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/references/sql.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/references/torch.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/references/udf.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/docs/tutorials.md +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/multimodal/wds.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/mkdocs.yml +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/noxfile.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/setup.cfg +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/__main__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/asyn.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/cli.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/cli_utils.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/hf.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/config.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/error.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/array.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/base.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/conditional.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/func.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/numeric.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/path.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/random.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/string.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/window.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/job.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/clip.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/diff.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/file.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/hf.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/image.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/settings.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/tar.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/text.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/udf.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/utils.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/bbox.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/pose.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/segment.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/progress.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/py.typed +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/batch.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/dataset.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/metrics.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/params.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/queue.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/schema.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/udf.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/utils.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/remote/studio.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/types.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/utils.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/studio.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/telemetry.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain/utils.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/conftest.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/data.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/examples/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/examples/test_examples.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/examples/wds_data.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_client.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_datasets.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_listing.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_ls.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_metrics.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_pytorch.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_query.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_session.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_toolkit.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/scripts/feature_class.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/test_atomicity.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/test_cli_e2e.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/test_cli_studio.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/test_query_e2e.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/test_telemetry.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_asyn.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_cache.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_catalog.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_client.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_config.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_dataset.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_func.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_metastore.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_query.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_query_params.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_serializer.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_utils.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.8.1 → datachain-0.8.3}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.8.1
|
|
3
|
+
Version: 0.8.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -72,7 +72,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
|
72
72
|
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
73
73
|
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
74
74
|
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
75
|
-
Requires-Dist: pytest-servers[all]>=0.5.
|
|
75
|
+
Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
|
|
76
76
|
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
77
77
|
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
78
78
|
Requires-Dist: virtualenv; extra == "tests"
|
|
@@ -145,6 +145,88 @@ Getting Started
|
|
|
145
145
|
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
|
|
146
146
|
to get started with `DataChain` and learn more.
|
|
147
147
|
|
|
148
|
+
.. code:: bash
|
|
149
|
+
|
|
150
|
+
pip install datachain
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
Example: download subset of files based on metadata
|
|
154
|
+
---------------------------------------------------
|
|
155
|
+
|
|
156
|
+
Sometimes users only need to download a specific subset of files from cloud storage,
|
|
157
|
+
rather than the entire dataset.
|
|
158
|
+
For example, you could use a JSON file's metadata to download just cat images with
|
|
159
|
+
high confidence scores.
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
.. code:: py
|
|
163
|
+
|
|
164
|
+
from datachain import Column, DataChain
|
|
165
|
+
|
|
166
|
+
meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
|
|
167
|
+
images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
|
|
168
|
+
|
|
169
|
+
images_id = images.map(id=lambda file: file.path.split('.')[-2])
|
|
170
|
+
annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
171
|
+
|
|
172
|
+
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
173
|
+
& (Column("meta.inference.class_") == "cat"))
|
|
174
|
+
likely_cats.export_files("high-confidence-cats/", signal="file")
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
Example: LLM based text-file evaluation
|
|
178
|
+
---------------------------------------
|
|
179
|
+
|
|
180
|
+
In this example, we evaluate chatbot conversations stored in text files
|
|
181
|
+
using LLM based evaluation.
|
|
182
|
+
|
|
183
|
+
.. code:: shell
|
|
184
|
+
|
|
185
|
+
$ pip install mistralai # Requires version >=1.0.0
|
|
186
|
+
$ export MISTRAL_API_KEY=_your_key_
|
|
187
|
+
|
|
188
|
+
Python code:
|
|
189
|
+
|
|
190
|
+
.. code:: py
|
|
191
|
+
|
|
192
|
+
from mistralai import Mistral
|
|
193
|
+
from datachain import File, DataChain, Column
|
|
194
|
+
|
|
195
|
+
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
196
|
+
|
|
197
|
+
def eval_dialogue(file: File) -> bool:
|
|
198
|
+
client = Mistral()
|
|
199
|
+
response = client.chat.complete(
|
|
200
|
+
model="open-mixtral-8x22b",
|
|
201
|
+
messages=[{"role": "system", "content": PROMPT},
|
|
202
|
+
{"role": "user", "content": file.read()}])
|
|
203
|
+
result = response.choices[0].message.content
|
|
204
|
+
return result.lower().startswith("success")
|
|
205
|
+
|
|
206
|
+
chain = (
|
|
207
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
208
|
+
.settings(parallel=4, cache=True)
|
|
209
|
+
.map(is_success=eval_dialogue)
|
|
210
|
+
.save("mistral_files")
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
successful_chain = chain.filter(Column("is_success") == True)
|
|
214
|
+
successful_chain.export_files("./output_mistral")
|
|
215
|
+
|
|
216
|
+
print(f"{successful_chain.count()} files were exported")
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:
|
|
221
|
+
|
|
222
|
+
.. code:: shell
|
|
223
|
+
|
|
224
|
+
$ ls output_mistral/datachain-demo/chatbot-KiT/
|
|
225
|
+
1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
|
|
226
|
+
$ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
|
|
227
|
+
31
|
|
228
|
+
|
|
229
|
+
|
|
148
230
|
Key Features
|
|
149
231
|
============
|
|
150
232
|
|
|
@@ -42,6 +42,88 @@ Getting Started
|
|
|
42
42
|
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
|
|
43
43
|
to get started with `DataChain` and learn more.
|
|
44
44
|
|
|
45
|
+
.. code:: bash
|
|
46
|
+
|
|
47
|
+
pip install datachain
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
Example: download subset of files based on metadata
|
|
51
|
+
---------------------------------------------------
|
|
52
|
+
|
|
53
|
+
Sometimes users only need to download a specific subset of files from cloud storage,
|
|
54
|
+
rather than the entire dataset.
|
|
55
|
+
For example, you could use a JSON file's metadata to download just cat images with
|
|
56
|
+
high confidence scores.
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
.. code:: py
|
|
60
|
+
|
|
61
|
+
from datachain import Column, DataChain
|
|
62
|
+
|
|
63
|
+
meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
|
|
64
|
+
images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
|
|
65
|
+
|
|
66
|
+
images_id = images.map(id=lambda file: file.path.split('.')[-2])
|
|
67
|
+
annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
68
|
+
|
|
69
|
+
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
70
|
+
& (Column("meta.inference.class_") == "cat"))
|
|
71
|
+
likely_cats.export_files("high-confidence-cats/", signal="file")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
Example: LLM based text-file evaluation
|
|
75
|
+
---------------------------------------
|
|
76
|
+
|
|
77
|
+
In this example, we evaluate chatbot conversations stored in text files
|
|
78
|
+
using LLM based evaluation.
|
|
79
|
+
|
|
80
|
+
.. code:: shell
|
|
81
|
+
|
|
82
|
+
$ pip install mistralai # Requires version >=1.0.0
|
|
83
|
+
$ export MISTRAL_API_KEY=_your_key_
|
|
84
|
+
|
|
85
|
+
Python code:
|
|
86
|
+
|
|
87
|
+
.. code:: py
|
|
88
|
+
|
|
89
|
+
from mistralai import Mistral
|
|
90
|
+
from datachain import File, DataChain, Column
|
|
91
|
+
|
|
92
|
+
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
93
|
+
|
|
94
|
+
def eval_dialogue(file: File) -> bool:
|
|
95
|
+
client = Mistral()
|
|
96
|
+
response = client.chat.complete(
|
|
97
|
+
model="open-mixtral-8x22b",
|
|
98
|
+
messages=[{"role": "system", "content": PROMPT},
|
|
99
|
+
{"role": "user", "content": file.read()}])
|
|
100
|
+
result = response.choices[0].message.content
|
|
101
|
+
return result.lower().startswith("success")
|
|
102
|
+
|
|
103
|
+
chain = (
|
|
104
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
105
|
+
.settings(parallel=4, cache=True)
|
|
106
|
+
.map(is_success=eval_dialogue)
|
|
107
|
+
.save("mistral_files")
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
successful_chain = chain.filter(Column("is_success") == True)
|
|
111
|
+
successful_chain.export_files("./output_mistral")
|
|
112
|
+
|
|
113
|
+
print(f"{successful_chain.count()} files were exported")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:
|
|
118
|
+
|
|
119
|
+
.. code:: shell
|
|
120
|
+
|
|
121
|
+
$ ls output_mistral/datachain-demo/chatbot-KiT/
|
|
122
|
+
1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
|
|
123
|
+
$ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
|
|
124
|
+
31
|
|
125
|
+
|
|
126
|
+
|
|
45
127
|
Key Features
|
|
46
128
|
============
|
|
47
129
|
|
|
@@ -39,8 +39,8 @@ using JSON metadata:
|
|
|
39
39
|
``` py
|
|
40
40
|
from datachain import Column, DataChain
|
|
41
41
|
|
|
42
|
-
meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
|
|
43
|
-
images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")
|
|
42
|
+
meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
|
|
43
|
+
images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
|
|
44
44
|
|
|
45
45
|
images_id = images.map(id=lambda file: file.path.split('.')[-2])
|
|
46
46
|
annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
@@ -78,7 +78,7 @@ def is_positive_dialogue_ending(file) -> bool:
|
|
|
78
78
|
|
|
79
79
|
chain = (
|
|
80
80
|
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
|
|
81
|
-
object_name="file", type="text")
|
|
81
|
+
object_name="file", type="text", anon=True)
|
|
82
82
|
.settings(parallel=8, cache=True)
|
|
83
83
|
.map(is_positive=is_positive_dialogue_ending)
|
|
84
84
|
.save("file_response")
|
|
@@ -132,7 +132,7 @@ def eval_dialogue(file: File) -> bool:
|
|
|
132
132
|
return result.lower().startswith("success")
|
|
133
133
|
|
|
134
134
|
chain = (
|
|
135
|
-
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
|
|
135
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
136
136
|
.map(is_success=eval_dialogue)
|
|
137
137
|
.save("mistral_files")
|
|
138
138
|
)
|
|
@@ -177,7 +177,7 @@ def eval_dialog(file: File) -> ChatCompletionResponse:
|
|
|
177
177
|
{"role": "user", "content": file.read()}])
|
|
178
178
|
|
|
179
179
|
chain = (
|
|
180
|
-
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
|
|
180
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
181
181
|
.settings(parallel=4, cache=True)
|
|
182
182
|
.map(response=eval_dialog)
|
|
183
183
|
.map(status=lambda response: response.choices[0].message.content.lower()[:7])
|
|
@@ -273,7 +273,7 @@ from datachain import C, DataChain
|
|
|
273
273
|
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
274
274
|
|
|
275
275
|
chain = (
|
|
276
|
-
DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
|
|
276
|
+
DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
|
|
277
277
|
.map(label=lambda name: name.split(".")[0], params=["file.name"])
|
|
278
278
|
.select("file", "label").to_pytorch(
|
|
279
279
|
transform=processor.image_processor,
|
|
@@ -61,14 +61,16 @@ class DataChainCache:
|
|
|
61
61
|
tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
|
|
62
62
|
size = file.size
|
|
63
63
|
if size < 0:
|
|
64
|
-
size = await client.get_size(from_path)
|
|
64
|
+
size = await client.get_size(from_path, version_id=file.version)
|
|
65
65
|
cb = callback or TqdmCallback(
|
|
66
66
|
tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True},
|
|
67
67
|
tqdm_cls=Tqdm,
|
|
68
68
|
size=size,
|
|
69
69
|
)
|
|
70
70
|
try:
|
|
71
|
-
await client.get_file(from_path, tmp_info, callback=cb)
|
|
71
|
+
await client.get_file(
|
|
72
|
+
from_path, tmp_info, callback=cb, version_id=file.version
|
|
73
|
+
)
|
|
72
74
|
finally:
|
|
73
75
|
if not callback:
|
|
74
76
|
cb.close()
|
|
@@ -240,7 +240,8 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
240
240
|
class NodeGroup:
|
|
241
241
|
"""Class for a group of nodes from the same source"""
|
|
242
242
|
|
|
243
|
-
listing: "Listing"
|
|
243
|
+
listing: Optional["Listing"]
|
|
244
|
+
client: "Client"
|
|
244
245
|
sources: list[DataSource]
|
|
245
246
|
|
|
246
247
|
# The source path within the bucket
|
|
@@ -268,9 +269,7 @@ class NodeGroup:
|
|
|
268
269
|
Download this node group to cache.
|
|
269
270
|
"""
|
|
270
271
|
if self.sources:
|
|
271
|
-
self.listing.client.fetch_nodes(
|
|
272
|
-
self.iternodes(recursive), shared_progress_bar=pbar
|
|
273
|
-
)
|
|
272
|
+
self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)
|
|
274
273
|
|
|
275
274
|
|
|
276
275
|
def check_output_dataset_file(
|
|
@@ -375,7 +374,7 @@ def collect_nodes_for_cp(
|
|
|
375
374
|
|
|
376
375
|
# Collect all sources to process
|
|
377
376
|
for node_group in node_groups:
|
|
378
|
-
listing: Listing = node_group.listing
|
|
377
|
+
listing: Optional[Listing] = node_group.listing
|
|
379
378
|
valid_sources: list[DataSource] = []
|
|
380
379
|
for dsrc in node_group.sources:
|
|
381
380
|
if dsrc.is_single_object():
|
|
@@ -383,6 +382,7 @@ def collect_nodes_for_cp(
|
|
|
383
382
|
total_files += 1
|
|
384
383
|
valid_sources.append(dsrc)
|
|
385
384
|
else:
|
|
385
|
+
assert listing
|
|
386
386
|
node = dsrc.node
|
|
387
387
|
if not recursive:
|
|
388
388
|
print(f"{node.full_path} is a directory (not copied).")
|
|
@@ -433,37 +433,51 @@ def instantiate_node_groups(
|
|
|
433
433
|
)
|
|
434
434
|
|
|
435
435
|
output_dir = output
|
|
436
|
+
output_file = None
|
|
436
437
|
if copy_to_filename:
|
|
437
438
|
output_dir = os.path.dirname(output)
|
|
438
439
|
if not output_dir:
|
|
439
440
|
output_dir = "."
|
|
441
|
+
output_file = os.path.basename(output)
|
|
440
442
|
|
|
441
443
|
# Instantiate these nodes
|
|
442
444
|
for node_group in node_groups:
|
|
443
445
|
if not node_group.sources:
|
|
444
446
|
continue
|
|
445
|
-
listing: Listing = node_group.listing
|
|
447
|
+
listing: Optional[Listing] = node_group.listing
|
|
446
448
|
source_path: str = node_group.source_path
|
|
447
449
|
|
|
448
450
|
copy_dir_contents = always_copy_dir_contents or source_path.endswith("/")
|
|
449
|
-
|
|
450
|
-
node_group.sources
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
listing.
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
451
|
+
if not listing:
|
|
452
|
+
source = node_group.sources[0]
|
|
453
|
+
client = source.client
|
|
454
|
+
node = NodeWithPath(source.node, [output_file or source.node.path])
|
|
455
|
+
instantiated_nodes = [node]
|
|
456
|
+
if not virtual_only:
|
|
457
|
+
node.instantiate(
|
|
458
|
+
client, output_dir, instantiate_progress_bar, force=force
|
|
459
|
+
)
|
|
460
|
+
else:
|
|
461
|
+
instantiated_nodes = listing.collect_nodes_to_instantiate(
|
|
462
|
+
node_group.sources,
|
|
463
|
+
copy_to_filename,
|
|
464
|
+
recursive,
|
|
465
|
+
copy_dir_contents,
|
|
466
|
+
source_path,
|
|
467
|
+
node_group.is_edatachain,
|
|
468
|
+
node_group.is_dataset,
|
|
465
469
|
)
|
|
470
|
+
if not virtual_only:
|
|
471
|
+
listing.instantiate_nodes(
|
|
472
|
+
instantiated_nodes,
|
|
473
|
+
output_dir,
|
|
474
|
+
total_files,
|
|
475
|
+
force=force,
|
|
476
|
+
shared_progress_bar=instantiate_progress_bar,
|
|
477
|
+
)
|
|
478
|
+
|
|
466
479
|
node_group.instantiated_nodes = instantiated_nodes
|
|
480
|
+
|
|
467
481
|
if instantiate_progress_bar:
|
|
468
482
|
instantiate_progress_bar.close()
|
|
469
483
|
|
|
@@ -592,7 +606,7 @@ class Catalog:
|
|
|
592
606
|
client_config=None,
|
|
593
607
|
object_name="file",
|
|
594
608
|
skip_indexing=False,
|
|
595
|
-
) -> tuple["Listing", str]:
|
|
609
|
+
) -> tuple[Optional["Listing"], "Client", str]:
|
|
596
610
|
from datachain.lib.dc import DataChain
|
|
597
611
|
from datachain.listing import Listing
|
|
598
612
|
|
|
@@ -603,16 +617,19 @@ class Catalog:
|
|
|
603
617
|
list_ds_name, list_uri, list_path, _ = get_listing(
|
|
604
618
|
source, self.session, update=update
|
|
605
619
|
)
|
|
620
|
+
lst = None
|
|
621
|
+
client = Client.get_client(list_uri, self.cache, **self.client_config)
|
|
622
|
+
|
|
623
|
+
if list_ds_name:
|
|
624
|
+
lst = Listing(
|
|
625
|
+
self.metastore.clone(),
|
|
626
|
+
self.warehouse.clone(),
|
|
627
|
+
client,
|
|
628
|
+
dataset_name=list_ds_name,
|
|
629
|
+
object_name=object_name,
|
|
630
|
+
)
|
|
606
631
|
|
|
607
|
-
lst = Listing(
|
|
608
|
-
self.metastore.clone(),
|
|
609
|
-
self.warehouse.clone(),
|
|
610
|
-
Client.get_client(list_uri, self.cache, **self.client_config),
|
|
611
|
-
dataset_name=list_ds_name,
|
|
612
|
-
object_name=object_name,
|
|
613
|
-
)
|
|
614
|
-
|
|
615
|
-
return lst, list_path
|
|
632
|
+
return lst, client, list_path
|
|
616
633
|
|
|
617
634
|
def _remove_dataset_rows_and_warehouse_info(
|
|
618
635
|
self, dataset: DatasetRecord, version: int, **kwargs
|
|
@@ -635,13 +652,13 @@ class Catalog:
|
|
|
635
652
|
) -> Optional[list["DataSource"]]:
|
|
636
653
|
enlisted_sources = []
|
|
637
654
|
for src in sources: # Opt: parallel
|
|
638
|
-
listing, file_path = self.enlist_source(
|
|
655
|
+
listing, client, file_path = self.enlist_source(
|
|
639
656
|
src,
|
|
640
657
|
update,
|
|
641
658
|
client_config=client_config or self.client_config,
|
|
642
659
|
skip_indexing=skip_indexing,
|
|
643
660
|
)
|
|
644
|
-
enlisted_sources.append((listing, file_path))
|
|
661
|
+
enlisted_sources.append((listing, client, file_path))
|
|
645
662
|
|
|
646
663
|
if only_index:
|
|
647
664
|
# sometimes we don't really need listing result (e.g on indexing process)
|
|
@@ -649,10 +666,16 @@ class Catalog:
|
|
|
649
666
|
return None
|
|
650
667
|
|
|
651
668
|
dsrc_all: list[DataSource] = []
|
|
652
|
-
for listing, file_path in enlisted_sources:
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
669
|
+
for listing, client, file_path in enlisted_sources:
|
|
670
|
+
if not listing:
|
|
671
|
+
nodes = [Node.from_file(client.get_file_info(file_path))]
|
|
672
|
+
dir_only = False
|
|
673
|
+
else:
|
|
674
|
+
nodes = listing.expand_path(file_path)
|
|
675
|
+
dir_only = file_path.endswith("/")
|
|
676
|
+
dsrc_all.extend(
|
|
677
|
+
DataSource(listing, client, node, dir_only) for node in nodes
|
|
678
|
+
)
|
|
656
679
|
return dsrc_all
|
|
657
680
|
|
|
658
681
|
def enlist_sources_grouped(
|
|
@@ -667,7 +690,7 @@ class Catalog:
|
|
|
667
690
|
|
|
668
691
|
def _row_to_node(d: dict[str, Any]) -> Node:
|
|
669
692
|
del d["file__source"]
|
|
670
|
-
return Node.
|
|
693
|
+
return Node.from_row(d)
|
|
671
694
|
|
|
672
695
|
enlisted_sources: list[tuple[bool, bool, Any]] = []
|
|
673
696
|
client_config = client_config or self.client_config
|
|
@@ -677,7 +700,7 @@ class Catalog:
|
|
|
677
700
|
edatachain_data = parse_edatachain_file(src)
|
|
678
701
|
indexed_sources = []
|
|
679
702
|
for ds in edatachain_data:
|
|
680
|
-
listing, source_path = self.enlist_source(
|
|
703
|
+
listing, _, source_path = self.enlist_source(
|
|
681
704
|
ds["data-source"]["uri"],
|
|
682
705
|
update,
|
|
683
706
|
client_config=client_config,
|
|
@@ -701,6 +724,7 @@ class Catalog:
|
|
|
701
724
|
client = self.get_client(source, **client_config)
|
|
702
725
|
uri = client.uri
|
|
703
726
|
dataset_name, _, _, _ = get_listing(uri, self.session)
|
|
727
|
+
assert dataset_name
|
|
704
728
|
listing = Listing(
|
|
705
729
|
self.metastore.clone(),
|
|
706
730
|
self.warehouse.clone(),
|
|
@@ -713,6 +737,7 @@ class Catalog:
|
|
|
713
737
|
indexed_sources.append(
|
|
714
738
|
(
|
|
715
739
|
listing,
|
|
740
|
+
client,
|
|
716
741
|
source,
|
|
717
742
|
[_row_to_node(r) for r in rows],
|
|
718
743
|
ds_name,
|
|
@@ -722,25 +747,28 @@ class Catalog:
|
|
|
722
747
|
|
|
723
748
|
enlisted_sources.append((False, True, indexed_sources))
|
|
724
749
|
else:
|
|
725
|
-
listing, source_path = self.enlist_source(
|
|
750
|
+
listing, client, source_path = self.enlist_source(
|
|
726
751
|
src, update, client_config=client_config
|
|
727
752
|
)
|
|
728
|
-
enlisted_sources.append((False, False, (listing, source_path)))
|
|
753
|
+
enlisted_sources.append((False, False, (listing, client, source_path)))
|
|
729
754
|
|
|
730
755
|
node_groups = []
|
|
731
756
|
for is_datachain, is_dataset, payload in enlisted_sources: # Opt: parallel
|
|
732
757
|
if is_dataset:
|
|
733
758
|
for (
|
|
734
759
|
listing,
|
|
760
|
+
client,
|
|
735
761
|
source_path,
|
|
736
762
|
nodes,
|
|
737
763
|
dataset_name,
|
|
738
764
|
dataset_version,
|
|
739
765
|
) in payload:
|
|
740
|
-
|
|
766
|
+
assert listing
|
|
767
|
+
dsrc = [DataSource(listing, client, node) for node in nodes]
|
|
741
768
|
node_groups.append(
|
|
742
769
|
NodeGroup(
|
|
743
770
|
listing,
|
|
771
|
+
client,
|
|
744
772
|
dsrc,
|
|
745
773
|
source_path,
|
|
746
774
|
dataset_name=dataset_name,
|
|
@@ -749,18 +777,30 @@ class Catalog:
|
|
|
749
777
|
)
|
|
750
778
|
elif is_datachain:
|
|
751
779
|
for listing, source_path, paths in payload:
|
|
752
|
-
|
|
780
|
+
assert listing
|
|
781
|
+
dsrc = [
|
|
782
|
+
DataSource(listing, listing.client, listing.resolve_path(p))
|
|
783
|
+
for p in paths
|
|
784
|
+
]
|
|
753
785
|
node_groups.append(
|
|
754
|
-
NodeGroup(
|
|
786
|
+
NodeGroup(
|
|
787
|
+
listing,
|
|
788
|
+
listing.client,
|
|
789
|
+
dsrc,
|
|
790
|
+
source_path,
|
|
791
|
+
is_edatachain=True,
|
|
792
|
+
)
|
|
755
793
|
)
|
|
756
794
|
else:
|
|
757
|
-
listing, source_path = payload
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
795
|
+
listing, client, source_path = payload
|
|
796
|
+
if not listing:
|
|
797
|
+
nodes = [Node.from_file(client.get_file_info(source_path))]
|
|
798
|
+
as_container = False
|
|
799
|
+
else:
|
|
800
|
+
as_container = source_path.endswith("/")
|
|
801
|
+
nodes = listing.expand_path(source_path, use_glob=not no_glob)
|
|
802
|
+
dsrc = [DataSource(listing, client, n, as_container) for n in nodes]
|
|
803
|
+
node_groups.append(NodeGroup(listing, client, dsrc, source_path))
|
|
764
804
|
|
|
765
805
|
return node_groups
|
|
766
806
|
|
|
@@ -1196,10 +1236,16 @@ class Catalog:
|
|
|
1196
1236
|
|
|
1197
1237
|
return q.to_db_records()
|
|
1198
1238
|
|
|
1199
|
-
def signed_url(self, source: str, path: str, client_config=None) -> str:
|
|
1239
|
+
def signed_url(
|
|
1240
|
+
self,
|
|
1241
|
+
source: str,
|
|
1242
|
+
path: str,
|
|
1243
|
+
version_id: Optional[str] = None,
|
|
1244
|
+
client_config=None,
|
|
1245
|
+
) -> str:
|
|
1200
1246
|
client_config = client_config or self.client_config
|
|
1201
1247
|
client = Client.get_client(source, self.cache, **client_config)
|
|
1202
|
-
return client.url(path)
|
|
1248
|
+
return client.url(path, version_id=version_id)
|
|
1203
1249
|
|
|
1204
1250
|
def export_dataset_table(
|
|
1205
1251
|
self,
|
|
@@ -4,21 +4,19 @@ from datachain.node import DirType, NodeWithPath
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
class DataSource:
|
|
7
|
-
def __init__(self, listing, node, as_container=False):
|
|
7
|
+
def __init__(self, listing, client, node, as_container=False):
|
|
8
8
|
self.listing = listing
|
|
9
|
+
self.client = client
|
|
9
10
|
self.node = node
|
|
10
11
|
self.as_container = (
|
|
11
12
|
as_container # Indicates whether a .tar file is handled as a container
|
|
12
13
|
)
|
|
13
14
|
|
|
14
|
-
def get_full_path(self):
|
|
15
|
-
return self.get_node_full_path(self.node)
|
|
16
|
-
|
|
17
15
|
def get_node_full_path(self, node):
|
|
18
|
-
return self.listing.client.get_full_path(node.full_path)
|
|
16
|
+
return self.client.get_full_path(node.full_path)
|
|
19
17
|
|
|
20
18
|
def get_node_full_path_from_path(self, full_path):
|
|
21
|
-
return self.listing.client.get_full_path(full_path)
|
|
19
|
+
return self.client.get_full_path(full_path)
|
|
22
20
|
|
|
23
21
|
def is_single_object(self):
|
|
24
22
|
return self.node.dir_type == DirType.FILE or (
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import Any, Optional
|
|
2
|
+
from urllib.parse import parse_qs, urlsplit, urlunsplit
|
|
2
3
|
|
|
3
4
|
from adlfs import AzureBlobFileSystem
|
|
4
5
|
from tqdm import tqdm
|
|
@@ -25,6 +26,16 @@ class AzureClient(Client):
|
|
|
25
26
|
size=v.get("size", ""),
|
|
26
27
|
)
|
|
27
28
|
|
|
29
|
+
def url(self, path: str, expires: int = 3600, **kwargs) -> str:
|
|
30
|
+
"""
|
|
31
|
+
Generate a signed URL for the given path.
|
|
32
|
+
"""
|
|
33
|
+
version_id = kwargs.pop("version_id", None)
|
|
34
|
+
result = self.fs.sign(
|
|
35
|
+
self.get_full_path(path, version_id), expiration=expires, **kwargs
|
|
36
|
+
)
|
|
37
|
+
return result + (f"&versionid={version_id}" if version_id else "")
|
|
38
|
+
|
|
28
39
|
async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
|
|
29
40
|
prefix = start_prefix
|
|
30
41
|
if prefix:
|
|
@@ -57,4 +68,13 @@ class AzureClient(Client):
|
|
|
57
68
|
finally:
|
|
58
69
|
result_queue.put_nowait(None)
|
|
59
70
|
|
|
71
|
+
@classmethod
|
|
72
|
+
def version_path(cls, path: str, version_id: Optional[str]) -> str:
|
|
73
|
+
parts = list(urlsplit(path))
|
|
74
|
+
query = parse_qs(parts[3])
|
|
75
|
+
if "versionid" in query:
|
|
76
|
+
raise ValueError("path already includes a version query")
|
|
77
|
+
parts[3] = f"versionid={version_id}" if version_id else ""
|
|
78
|
+
return urlunsplit(parts)
|
|
79
|
+
|
|
60
80
|
_fetch_default = _fetch_flat
|