datachain 0.8.2__tar.gz → 0.8.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- {datachain-0.8.2 → datachain-0.8.3}/PKG-INFO +2 -2
- {datachain-0.8.2 → datachain-0.8.3}/pyproject.toml +1 -1
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/cache.py +4 -2
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/catalog/catalog.py +100 -54
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/catalog/datasource.py +4 -6
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/azure.py +21 -1
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/fsspec.py +35 -9
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/gcs.py +10 -2
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/local.py +4 -4
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/s3.py +10 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/dataset.py +1 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/dc.py +15 -3
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/listing.py +18 -3
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/listing.py +1 -5
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/node.py +27 -1
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/query/session.py +1 -1
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain.egg-info/PKG-INFO +2 -2
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain.egg-info/requires.txt +1 -1
- datachain-0.8.3/tests/func/fake-service-account-credentials.json +9 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_catalog.py +150 -12
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_datachain.py +6 -2
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_pull.py +1 -0
- datachain-0.8.3/tests/unit/test_client_gcs.py +14 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_client_s3.py +6 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_listing.py +1 -1
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_session.py +5 -0
- datachain-0.8.2/tests/unit/test_client_gcs.py +0 -6
- {datachain-0.8.2 → datachain-0.8.3}/.cruft.json +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.gitattributes +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.github/codecov.yaml +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.github/dependabot.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.github/workflows/release.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.github/workflows/tests.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.gitignore +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/.pre-commit-config.yaml +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/LICENSE +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/README.rst +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/assets/datachain.svg +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/contributing.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/examples.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/index.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/overrides/main.html +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/quick-start.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/references/datachain.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/references/datatype.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/references/file.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/references/index.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/references/sql.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/references/torch.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/references/udf.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/docs/tutorials.md +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/multimodal/wds.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/mkdocs.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/noxfile.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/setup.cfg +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/__main__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/asyn.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/cli.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/cli_utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/hf.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/config.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/error.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/func/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/func/array.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/func/base.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/func/conditional.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/func/func.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/func/numeric.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/func/path.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/func/random.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/func/string.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/func/window.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/job.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/clip.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/diff.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/file.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/hf.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/image.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/settings.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/tar.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/text.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/udf.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/model/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/model/bbox.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/model/pose.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/model/segment.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/progress.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/py.typed +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/query/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/query/batch.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/query/dataset.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/query/metrics.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/query/params.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/query/queue.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/query/schema.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/query/udf.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/query/utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/remote/studio.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/types.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/sql/utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/studio.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/telemetry.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain/utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/conftest.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/data.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/examples/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/examples/test_examples.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/examples/wds_data.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_client.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_datasets.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_listing.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_ls.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_metrics.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_pytorch.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_query.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_session.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/func/test_toolkit.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/scripts/feature_class.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/test_atomicity.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/test_cli_e2e.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/test_cli_studio.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/test_query_e2e.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/test_telemetry.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_asyn.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_cache.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_catalog.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_client.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_config.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_dataset.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_func.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_metastore.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_query.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_query_params.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_serializer.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.8.2 → datachain-0.8.3}/tests/utils.py +0 -0
{datachain-0.8.2 → datachain-0.8.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.8.2
+Version: 0.8.3
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0

@@ -72,7 +72,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
-Requires-Dist: pytest-servers[all]>=0.5.
+Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"

{datachain-0.8.2 → datachain-0.8.3}/src/datachain/cache.py

@@ -61,14 +61,16 @@ class DataChainCache:
         tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
         size = file.size
         if size < 0:
-            size = await client.get_size(from_path)
+            size = await client.get_size(from_path, version_id=file.version)
         cb = callback or TqdmCallback(
             tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True},
             tqdm_cls=Tqdm,
             size=size,
         )
         try:
-            await client.get_file(from_path, tmp_info, callback=cb)
+            await client.get_file(
+                from_path, tmp_info, callback=cb, version_id=file.version
+            )
         finally:
             if not callback:
                 cb.close()

{datachain-0.8.2 → datachain-0.8.3}/src/datachain/catalog/catalog.py

@@ -240,7 +240,8 @@ class DatasetRowsFetcher(NodesThreadPool):
 class NodeGroup:
     """Class for a group of nodes from the same source"""

-    listing: "Listing"
+    listing: Optional["Listing"]
+    client: "Client"
     sources: list[DataSource]

     # The source path within the bucket

@@ -268,9 +269,7 @@ class NodeGroup:
         Download this node group to cache.
         """
         if self.sources:
-            self.listing.client.fetch_nodes(
-                self.iternodes(recursive), shared_progress_bar=pbar
-            )
+            self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)


 def check_output_dataset_file(

@@ -375,7 +374,7 @@ def collect_nodes_for_cp(

     # Collect all sources to process
     for node_group in node_groups:
-        listing: Listing = node_group.listing
+        listing: Optional[Listing] = node_group.listing
         valid_sources: list[DataSource] = []
         for dsrc in node_group.sources:
             if dsrc.is_single_object():

@@ -383,6 +382,7 @@ def collect_nodes_for_cp(
                 total_files += 1
                 valid_sources.append(dsrc)
             else:
+                assert listing
                 node = dsrc.node
                 if not recursive:
                     print(f"{node.full_path} is a directory (not copied).")

@@ -433,37 +433,51 @@ def instantiate_node_groups(
     )

     output_dir = output
+    output_file = None
     if copy_to_filename:
         output_dir = os.path.dirname(output)
         if not output_dir:
             output_dir = "."
+        output_file = os.path.basename(output)

     # Instantiate these nodes
     for node_group in node_groups:
         if not node_group.sources:
             continue
-        listing: Listing = node_group.listing
+        listing: Optional[Listing] = node_group.listing
         source_path: str = node_group.source_path

         copy_dir_contents = always_copy_dir_contents or source_path.endswith("/")
-        instantiated_nodes = listing.collect_nodes_to_instantiate(
-            node_group.sources,
-            copy_to_filename,
-            recursive,
-            copy_dir_contents,
-            source_path,
-            node_group.is_edatachain,
-            node_group.is_dataset,
-        )
-        if not virtual_only:
-            listing.instantiate_nodes(
-                instantiated_nodes,
-                output_dir,
-                total_files,
-                force=force,
-                shared_progress_bar=instantiate_progress_bar,
+        if not listing:
+            source = node_group.sources[0]
+            client = source.client
+            node = NodeWithPath(source.node, [output_file or source.node.path])
+            instantiated_nodes = [node]
+            if not virtual_only:
+                node.instantiate(
+                    client, output_dir, instantiate_progress_bar, force=force
+                )
+        else:
+            instantiated_nodes = listing.collect_nodes_to_instantiate(
+                node_group.sources,
+                copy_to_filename,
+                recursive,
+                copy_dir_contents,
+                source_path,
+                node_group.is_edatachain,
+                node_group.is_dataset,
             )
+            if not virtual_only:
+                listing.instantiate_nodes(
+                    instantiated_nodes,
+                    output_dir,
+                    total_files,
+                    force=force,
+                    shared_progress_bar=instantiate_progress_bar,
+                )
+
         node_group.instantiated_nodes = instantiated_nodes
+
     if instantiate_progress_bar:
         instantiate_progress_bar.close()


@@ -592,7 +606,7 @@ class Catalog:
         client_config=None,
         object_name="file",
         skip_indexing=False,
-    ) -> tuple["Listing", str]:
+    ) -> tuple[Optional["Listing"], "Client", str]:
         from datachain.lib.dc import DataChain
         from datachain.listing import Listing


@@ -603,16 +617,19 @@ class Catalog:
         list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
         )
+        lst = None
+        client = Client.get_client(list_uri, self.cache, **self.client_config)
+
+        if list_ds_name:
+            lst = Listing(
+                self.metastore.clone(),
+                self.warehouse.clone(),
+                client,
+                dataset_name=list_ds_name,
+                object_name=object_name,
+            )

-        lst = Listing(
-            self.metastore.clone(),
-            self.warehouse.clone(),
-            Client.get_client(list_uri, self.cache, **self.client_config),
-            dataset_name=list_ds_name,
-            object_name=object_name,
-        )
-
-        return lst, list_path
+        return lst, client, list_path

     def _remove_dataset_rows_and_warehouse_info(
         self, dataset: DatasetRecord, version: int, **kwargs

@@ -635,13 +652,13 @@ class Catalog:
     ) -> Optional[list["DataSource"]]:
         enlisted_sources = []
         for src in sources:  # Opt: parallel
-            listing, file_path = self.enlist_source(
+            listing, client, file_path = self.enlist_source(
                 src,
                 update,
                 client_config=client_config or self.client_config,
                 skip_indexing=skip_indexing,
            )
-            enlisted_sources.append((listing, file_path))
+            enlisted_sources.append((listing, client, file_path))

         if only_index:
             # sometimes we don't really need listing result (e.g on indexing process)

@@ -649,10 +666,16 @@ class Catalog:
             return None

         dsrc_all: list[DataSource] = []
-        for listing, file_path in enlisted_sources:
-            nodes = listing.expand_path(file_path)
-            dir_only = file_path.endswith("/")
-            dsrc_all.extend(DataSource(listing, node, dir_only) for node in nodes)
+        for listing, client, file_path in enlisted_sources:
+            if not listing:
+                nodes = [Node.from_file(client.get_file_info(file_path))]
+                dir_only = False
+            else:
+                nodes = listing.expand_path(file_path)
+                dir_only = file_path.endswith("/")
+            dsrc_all.extend(
+                DataSource(listing, client, node, dir_only) for node in nodes
+            )
         return dsrc_all

     def enlist_sources_grouped(

@@ -667,7 +690,7 @@ class Catalog:

         def _row_to_node(d: dict[str, Any]) -> Node:
             del d["file__source"]
-            return Node.
+            return Node.from_row(d)

         enlisted_sources: list[tuple[bool, bool, Any]] = []
         client_config = client_config or self.client_config

@@ -677,7 +700,7 @@ class Catalog:
                 edatachain_data = parse_edatachain_file(src)
                 indexed_sources = []
                 for ds in edatachain_data:
-                    listing, source_path = self.enlist_source(
+                    listing, _, source_path = self.enlist_source(
                         ds["data-source"]["uri"],
                         update,
                         client_config=client_config,

@@ -701,6 +724,7 @@ class Catalog:
                 client = self.get_client(source, **client_config)
                 uri = client.uri
                 dataset_name, _, _, _ = get_listing(uri, self.session)
+                assert dataset_name
                 listing = Listing(
                     self.metastore.clone(),
                     self.warehouse.clone(),

@@ -713,6 +737,7 @@ class Catalog:
                 indexed_sources.append(
                     (
                         listing,
+                        client,
                         source,
                         [_row_to_node(r) for r in rows],
                         ds_name,

@@ -722,25 +747,28 @@ class Catalog:

                 enlisted_sources.append((False, True, indexed_sources))
             else:
-                listing, source_path = self.enlist_source(
+                listing, client, source_path = self.enlist_source(
                     src, update, client_config=client_config
                 )
-                enlisted_sources.append((False, False, (listing, source_path)))
+                enlisted_sources.append((False, False, (listing, client, source_path)))

         node_groups = []
         for is_datachain, is_dataset, payload in enlisted_sources:  # Opt: parallel
             if is_dataset:
                 for (
                     listing,
+                    client,
                     source_path,
                     nodes,
                     dataset_name,
                     dataset_version,
                 ) in payload:
-                    dsrc = [DataSource(listing, node) for node in nodes]
+                    assert listing
+                    dsrc = [DataSource(listing, client, node) for node in nodes]
                     node_groups.append(
                         NodeGroup(
                             listing,
+                            client,
                             dsrc,
                             source_path,
                             dataset_name=dataset_name,

@@ -749,18 +777,30 @@
                         )
                     )
             elif is_datachain:
                 for listing, source_path, paths in payload:
-                    dsrc = [DataSource(listing, listing.resolve_path(p)) for p in paths]
+                    assert listing
+                    dsrc = [
+                        DataSource(listing, listing.client, listing.resolve_path(p))
+                        for p in paths
+                    ]
                     node_groups.append(
-                        NodeGroup(listing, dsrc, source_path, is_edatachain=True)
+                        NodeGroup(
+                            listing,
+                            listing.client,
+                            dsrc,
+                            source_path,
+                            is_edatachain=True,
+                        )
                     )
             else:
-                listing, source_path = payload
-                as_container = source_path.endswith("/")
-                nodes = listing.expand_path(source_path, use_glob=not no_glob)
-                dsrc = [DataSource(listing, n, as_container) for n in nodes]
-                node_groups.append(NodeGroup(listing, dsrc, source_path))
+                listing, client, source_path = payload
+                if not listing:
+                    nodes = [Node.from_file(client.get_file_info(source_path))]
+                    as_container = False
+                else:
+                    as_container = source_path.endswith("/")
+                    nodes = listing.expand_path(source_path, use_glob=not no_glob)
+                dsrc = [DataSource(listing, client, n, as_container) for n in nodes]
+                node_groups.append(NodeGroup(listing, client, dsrc, source_path))

         return node_groups

@@ -1196,10 +1236,16 @@ class Catalog:

         return q.to_db_records()

-    def signed_url(self, source: str, path: str, client_config=None) -> str:
+    def signed_url(
+        self,
+        source: str,
+        path: str,
+        version_id: Optional[str] = None,
+        client_config=None,
+    ) -> str:
         client_config = client_config or self.client_config
         client = Client.get_client(source, self.cache, **client_config)
-        return client.url(path)
+        return client.url(path, version_id=version_id)

     def export_dataset_table(
         self,

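Callers can now request a pre-signed URL for a specific object version. A rough usage sketch, assuming the catalog is obtained via the loader helper; the bucket, key, and version id are invented:

    from datachain.catalog.loader import get_catalog  # assumed entry point for building a Catalog

    catalog = get_catalog()
    url = catalog.signed_url(
        "s3://my-bucket",                  # hypothetical bucket
        "images/cat.jpg",                  # hypothetical key
        version_id="3HL4kqtJlcpXroDTDmJ",  # hypothetical S3 version id
    )
    print(url)  # pre-signed URL pinned to that object version
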
{datachain-0.8.2 → datachain-0.8.3}/src/datachain/catalog/datasource.py

@@ -4,21 +4,19 @@ from datachain.node import DirType, NodeWithPath


 class DataSource:
-    def __init__(self, listing, node, as_container=False):
+    def __init__(self, listing, client, node, as_container=False):
         self.listing = listing
+        self.client = client
         self.node = node
         self.as_container = (
             as_container  # Indicates whether a .tar file is handled as a container
         )

-    def get_full_path(self):
-        return self.get_node_full_path(self.node)
-
     def get_node_full_path(self, node):
-        return self.listing.client.get_full_path(node.full_path)
+        return self.client.get_full_path(node.full_path)

     def get_node_full_path_from_path(self, full_path):
-        return self.listing.client.get_full_path(full_path)
+        return self.client.get_full_path(full_path)

     def is_single_object(self):
         return self.node.dir_type == DirType.FILE or (

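DataSource now carries the client directly instead of reaching it through the listing, which is what allows a source with no listing at all (a single object). A minimal construction sketch built only from pieces shown in this diff; the helper name is ours:

    from typing import Optional

    from datachain.catalog.datasource import DataSource
    from datachain.client.fsspec import Client
    from datachain.node import Node


    def single_object_source(
        client: Client, path: str, version_id: Optional[str] = None
    ) -> DataSource:
        # No listing dataset exists for a lone file: stat the object and keep
        # the client on the DataSource so full paths resolve without a listing.
        node = Node.from_file(client.get_file_info(path, version_id=version_id))
        return DataSource(listing=None, client=client, node=node)
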
{datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/azure.py

@@ -1,4 +1,5 @@
-from typing import Any
+from typing import Any, Optional
+from urllib.parse import parse_qs, urlsplit, urlunsplit

 from adlfs import AzureBlobFileSystem
 from tqdm import tqdm

@@ -25,6 +26,16 @@ class AzureClient(Client):
             size=v.get("size", ""),
         )

+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate a signed URL for the given path.
+        """
+        version_id = kwargs.pop("version_id", None)
+        result = self.fs.sign(
+            self.get_full_path(path, version_id), expiration=expires, **kwargs
+        )
+        return result + (f"&versionid={version_id}" if version_id else "")
+
     async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
         prefix = start_prefix
         if prefix:

@@ -57,4 +68,13 @@ class AzureClient(Client):
         finally:
             result_queue.put_nowait(None)

+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        parts = list(urlsplit(path))
+        query = parse_qs(parts[3])
+        if "versionid" in query:
+            raise ValueError("path already includes a version query")
+        parts[3] = f"versionid={version_id}" if version_id else ""
+        return urlunsplit(parts)
+
     _fetch_default = _fetch_flat

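For Azure the version travels as a versionid query parameter, so the new classmethod rewrites paths roughly as below (container and version id are made up):

    from datachain.client.azure import AzureClient

    AzureClient.version_path("az://container/data/file.parquet", "2024-01-01T00:00:00.0000000Z")
    # -> "az://container/data/file.parquet?versionid=2024-01-01T00:00:00.0000000Z"

    AzureClient.version_path("az://container/data/file.parquet?versionid=abc", "def")
    # raises ValueError: path already includes a version query
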
{datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/fsspec.py

@@ -137,6 +137,10 @@ class Client(ABC):
             fs.invalidate_cache()
         return fs

+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        return path
+
     @classmethod
     def from_name(
         cls,

@@ -198,17 +202,37 @@ class Client(ABC):
         return self._fs

     def url(self, path: str, expires: int = 3600, **kwargs) -> str:
-        return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+        return self.fs.sign(
+            self.get_full_path(path, kwargs.pop("version_id", None)),
+            expiration=expires,
+            **kwargs,
+        )

     async def get_current_etag(self, file: "File") -> str:
-        info = await self.fs._info(self.get_full_path(file.path))
+        kwargs = {}
+        if self.fs.version_aware:
+            kwargs["version_id"] = file.version
+        info = await self.fs._info(
+            self.get_full_path(file.path, file.version), **kwargs
+        )
         return self.info_to_file(info, "").etag

-    async def get_size(self, path: str) -> int:
-        return await self.fs._size(path)
+    def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
+        info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
+        return self.info_to_file(info, path)
+
+    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
+        return await self.fs._size(
+            self.version_path(path, version_id), version_id=version_id
+        )

-    async def get_file(self, lpath, rpath, callback):
-        return await self.fs._get_file(lpath, rpath, callback=callback)
+    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+        return await self.fs._get_file(
+            self.version_path(lpath, version_id),
+            rpath,
+            callback=callback,
+            version_id=version_id,
+        )

     async def scandir(
         self, start_prefix: str, method: str = "default"

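Taken together, the base client now has a version-aware read path: stat a pinned object, then stream exactly that version. A sketch of how the new methods compose, mirroring the cache code above; the client instance and destination are assumed to come from elsewhere:

    from typing import Optional

    from fsspec.callbacks import DEFAULT_CALLBACK, Callback

    from datachain.client.fsspec import Client


    async def download_pinned(
        client: Client,
        rel_path: str,
        dest: str,
        version: Optional[str] = None,
        callback: Callback = DEFAULT_CALLBACK,
    ) -> int:
        # Both the size lookup and the transfer carry the version id; the
        # backend-specific version_path() encodes it into the provider URI.
        file = client.get_file_info(rel_path, version_id=version)
        full_path = client.get_full_path(file.path)
        size = await client.get_size(full_path, version_id=file.version)
        await client.get_file(full_path, dest, callback=callback, version_id=file.version)
        return size
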
@@ -315,8 +339,8 @@ class Client(ABC):
     def rel_path(self, path: str) -> str:
         return self.fs.split_path(path)[1]

-    def get_full_path(self, rel_path: str) -> str:
-        return f"{self.PREFIX}{self.name}/{rel_path}"
+    def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
+        return self.version_path(f"{self.PREFIX}{self.name}/{rel_path}", version_id)

     @abstractmethod
     def info_to_file(self, v: dict[str, Any], parent: str) -> "File": ...

@@ -362,7 +386,9 @@ class Client(ABC):
         if use_cache and (cache_path := self.cache.get_path(file)):
             return open(cache_path, mode="rb")
         assert not file.location
-        return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]
+        return FileWrapper(
+            self.fs.open(self.get_full_path(file.path, file.version)), cb
+        )  # type: ignore[return-value]

     def download(self, file: "File", *, callback: Callback = DEFAULT_CALLBACK) -> None:
         sync(get_loop(), functools.partial(self._download, file, callback=callback))

{datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/gcs.py

@@ -38,9 +38,13 @@ class GCSClient(Client):
         If the client is anonymous, a public URL is returned instead
         (see https://cloud.google.com/storage/docs/access-public-data#api-link).
         """
+        version_id = kwargs.pop("version_id", None)
         if self.fs.storage_options.get("token") == "anon":
-            return f"https://storage.googleapis.com/{self.name}/{path}"
-        return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+            query = f"?generation={version_id}" if version_id else ""
+            return f"https://storage.googleapis.com/{self.name}/{path}{query}"
+        return self.fs.sign(
+            self.get_full_path(path, version_id), expiration=expires, **kwargs
+        )

     @staticmethod
     def parse_timestamp(timestamp: str) -> datetime:

@@ -131,3 +135,7 @@ class GCSClient(Client):
             last_modified=self.parse_timestamp(v["updated"]),
             size=v.get("size", ""),
         )
+
+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        return f"{path}#{version_id}" if version_id else path

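GCS identifies a version by the object's generation: the helper appends it as a path fragment, and anonymous clients get a plain public URL with a generation query instead of a signed one. Illustrative values only:

    from datachain.client.gcs import GCSClient

    GCSClient.version_path("gs://my-bucket/data/file.csv", "1712345678901234")
    # -> "gs://my-bucket/data/file.csv#1712345678901234"

    # Anonymous access: url() skips signing and returns a public link such as
    # https://storage.googleapis.com/my-bucket/data/file.csv?generation=1712345678901234
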
{datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/local.py

@@ -2,7 +2,7 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import urlparse

 from fsspec.implementations.local import LocalFileSystem

@@ -105,10 +105,10 @@ class FileClient(Client):
         info = self.fs.info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag

-    async def get_size(self, path: str) -> int:
+    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
         return self.fs.size(path)

-    async def get_file(self, lpath, rpath, callback):
+    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
         return self.fs.get_file(lpath, rpath, callback=callback)

     async def ls_dir(self, path):

@@ -117,7 +117,7 @@ class FileClient(Client):
     def rel_path(self, path):
         return posixpath.relpath(path, self.name)

-    def get_full_path(self, rel_path):
+    def get_full_path(self, rel_path, version_id: Optional[str] = None):
         full_path = Path(self.name, rel_path).as_posix()
         if rel_path.endswith("/") or not rel_path:
             full_path += "/"

{datachain-0.8.2 → datachain-0.8.3}/src/datachain/client/s3.py

@@ -1,5 +1,6 @@
 import asyncio
 from typing import Any, Optional, cast
+from urllib.parse import parse_qs, urlsplit, urlunsplit

 from botocore.exceptions import NoCredentialsError
 from s3fs import S3FileSystem

@@ -121,6 +122,15 @@ class ClientS3(Client):
             size=v["Size"],
         )

+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        parts = list(urlsplit(path))
+        query = parse_qs(parts[3])
+        if "versionId" in query:
+            raise ValueError("path already includes a version query")
+        parts[3] = f"versionId={version_id}" if version_id else ""
+        return urlunsplit(parts)
+
     async def _fetch_dir(
         self,
         prefix,

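S3 uses the conventional versionId query parameter, while the base Client and the local FileClient keep version_path as a pass-through, so unversioned backends are unaffected. For example (bucket and version id are invented):

    from datachain.client.s3 import ClientS3

    ClientS3.version_path("s3://my-bucket/data/file.csv", "abc123")
    # -> "s3://my-bucket/data/file.csv?versionId=abc123"

    ClientS3.version_path("s3://my-bucket/data/file.csv", None)
    # -> "s3://my-bucket/data/file.csv"  (no version requested, path unchanged)
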
{datachain-0.8.2 → datachain-0.8.3}/src/datachain/lib/dc.py

@@ -32,7 +32,7 @@ from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ArrowRow, File, FileType, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.listing import get_listing, list_bucket, ls
+from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
 from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore

@@ -438,6 +438,18 @@ class DataChain:
             uri, session, update=update
         )

+        # ds_name is None if object is a file, we don't want to use cache
+        # or do listing in that case - just read that single object
+        if not list_ds_name:
+            dc = cls.from_values(
+                session=session,
+                settings=settings,
+                in_memory=in_memory,
+                file=[get_file_info(list_uri, cache, client_config=client_config)],
+            )
+            dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+            return dc
+
         if update or not list_ds_exists:
             (
                 cls.from_records(

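The practical effect: pointing from_storage at a single object no longer creates or reuses a listing dataset; the chain is built straight from that one file's metadata. A rough usage sketch, with a hypothetical URI:

    from datachain import DataChain

    # Single object: no bucket listing happens, the chain holds one File row
    chain = DataChain.from_storage("s3://my-bucket/reports/2024-01.parquet")
    chain.show()
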
@@ -1634,7 +1646,7 @@ class DataChain:
         output: OutputType = None,
         object_name: str = "",
         **fr_map,
-    ) -> "DataChain":
+    ) -> "Self":
         """Generate chain from list of values.

         Example:

@@ -1647,7 +1659,7 @@ class DataChain:
         def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
             yield from tuples

-        chain = DataChain.from_records(
+        chain = cls.from_records(
             DataChain.DEFAULT_FILE_RECORD,
             session=session,
             settings=settings,