datachain 0.8.2__tar.gz → 0.8.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.8.2 → datachain-0.8.4}/.github/workflows/tests-studio.yml +1 -1
- {datachain-0.8.2 → datachain-0.8.4}/.github/workflows/tests.yml +1 -1
- {datachain-0.8.2 → datachain-0.8.4}/.pre-commit-config.yaml +1 -1
- {datachain-0.8.2 → datachain-0.8.4}/PKG-INFO +6 -6
- {datachain-0.8.2 → datachain-0.8.4}/mkdocs.yml +1 -0
- {datachain-0.8.2 → datachain-0.8.4}/pyproject.toml +5 -5
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/cache.py +4 -2
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/catalog/catalog.py +100 -54
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/catalog/datasource.py +4 -6
- datachain-0.8.4/src/datachain/cli/__init__.py +311 -0
- datachain-0.8.4/src/datachain/cli/commands/__init__.py +29 -0
- datachain-0.8.4/src/datachain/cli/commands/datasets.py +129 -0
- datachain-0.8.4/src/datachain/cli/commands/du.py +14 -0
- datachain-0.8.4/src/datachain/cli/commands/index.py +12 -0
- datachain-0.8.4/src/datachain/cli/commands/ls.py +169 -0
- datachain-0.8.4/src/datachain/cli/commands/misc.py +28 -0
- datachain-0.8.4/src/datachain/cli/commands/query.py +53 -0
- datachain-0.8.4/src/datachain/cli/commands/show.py +38 -0
- datachain-0.8.4/src/datachain/cli/parser/__init__.py +547 -0
- datachain-0.8.4/src/datachain/cli/parser/job.py +120 -0
- datachain-0.8.4/src/datachain/cli/parser/studio.py +126 -0
- datachain-0.8.4/src/datachain/cli/parser/utils.py +63 -0
- datachain-0.8.2/src/datachain/cli_utils.py → datachain-0.8.4/src/datachain/cli/utils.py +27 -1
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/client/azure.py +21 -1
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/client/fsspec.py +45 -13
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/client/gcs.py +10 -2
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/client/local.py +4 -4
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/client/s3.py +10 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/dataset.py +1 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/func/__init__.py +2 -2
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/func/conditional.py +52 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/func/func.py +5 -1
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/arrow.py +4 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/dc.py +18 -3
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/file.py +1 -1
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/listing.py +36 -3
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/signal_schema.py +89 -27
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/listing.py +1 -5
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/node.py +27 -1
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/progress.py +2 -2
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/query/session.py +1 -1
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/studio.py +58 -38
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/utils.py +1 -1
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain.egg-info/PKG-INFO +6 -6
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain.egg-info/SOURCES.txt +15 -2
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain.egg-info/requires.txt +5 -5
- {datachain-0.8.2 → datachain-0.8.4}/tests/conftest.py +1 -1
- datachain-0.8.4/tests/func/fake-service-account-credentials.json +9 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_catalog.py +150 -12
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_datachain.py +6 -2
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_pull.py +1 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/test_cli_e2e.py +6 -6
- {datachain-0.8.2 → datachain-0.8.4}/tests/test_cli_studio.py +18 -15
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_arrow.py +9 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_datachain.py +13 -5
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_signal_schema.py +280 -32
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/sql/test_conditional.py +43 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_cli_parsing.py +2 -17
- datachain-0.8.4/tests/unit/test_client_gcs.py +14 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_client_s3.py +6 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_config.py +9 -9
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_func.py +19 -1
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_listing.py +1 -1
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_session.py +5 -0
- datachain-0.8.2/src/datachain/cli.py +0 -1475
- datachain-0.8.2/tests/unit/test_client_gcs.py +0 -6
- {datachain-0.8.2 → datachain-0.8.4}/.cruft.json +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/.gitattributes +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/.github/codecov.yaml +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/.github/dependabot.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/.github/workflows/release.yml +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/.gitignore +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/LICENSE +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/README.rst +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/assets/datachain.svg +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/contributing.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/examples.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/index.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/overrides/main.html +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/quick-start.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/references/datachain.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/references/datatype.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/references/file.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/references/index.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/references/sql.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/references/torch.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/references/udf.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/docs/tutorials.md +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/multimodal/wds.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/noxfile.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/setup.cfg +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/__main__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/asyn.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/client/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/client/hf.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/config.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/error.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/func/array.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/func/base.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/func/numeric.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/func/path.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/func/random.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/func/string.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/func/window.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/job.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/clip.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/diff.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/hf.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/image.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/settings.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/tar.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/text.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/udf.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/model/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/model/bbox.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/model/pose.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/model/segment.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/py.typed +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/query/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/query/batch.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/query/dataset.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/query/metrics.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/query/params.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/query/queue.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/query/schema.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/query/udf.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/query/utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/remote/studio.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/types.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/sql/utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/telemetry.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/data.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/examples/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/examples/test_examples.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/examples/wds_data.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_client.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_datasets.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_listing.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_ls.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_metrics.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_pytorch.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_query.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_session.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/func/test_toolkit.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/scripts/feature_class.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/test_atomicity.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/test_query_e2e.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/test_telemetry.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_asyn.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_cache.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_catalog.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_client.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_dataset.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_metastore.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_query.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_query_params.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_serializer.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_utils.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.8.2 → datachain-0.8.4}/tests/utils.py +0 -0
|
@@ -138,7 +138,7 @@ jobs:
|
|
|
138
138
|
matrix:
|
|
139
139
|
os: [ubuntu-latest, windows-latest]
|
|
140
140
|
pyv: ['3.9', '3.12']
|
|
141
|
-
group: ['get_started', '
|
|
141
|
+
group: ['get_started', 'computer_vision', 'llm_and_nlp', 'multimodal']
|
|
142
142
|
exclude:
|
|
143
143
|
- {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
|
|
144
144
|
- {os: ubuntu-latest, pyv: '3.12', group: 'multimodal'}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.8.2
|
|
3
|
+
Version: 0.8.4
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -50,7 +50,7 @@ Requires-Dist: websockets
|
|
|
50
50
|
Provides-Extra: docs
|
|
51
51
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
52
52
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
53
|
-
Requires-Dist: mkdocs-material
|
|
53
|
+
Requires-Dist: mkdocs-material==9.5.22; extra == "docs"
|
|
54
54
|
Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
|
|
55
55
|
Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
|
|
56
56
|
Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
|
|
@@ -72,7 +72,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
|
72
72
|
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
73
73
|
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
74
74
|
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
75
|
-
Requires-Dist: pytest-servers[all]>=0.5.
|
|
75
|
+
Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
|
|
76
76
|
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
77
77
|
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
78
78
|
Requires-Dist: virtualenv; extra == "tests"
|
|
@@ -84,7 +84,7 @@ Requires-Dist: requests-mock; extra == "tests"
|
|
|
84
84
|
Requires-Dist: scipy; extra == "tests"
|
|
85
85
|
Provides-Extra: dev
|
|
86
86
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
87
|
-
Requires-Dist: mypy==1.14.
|
|
87
|
+
Requires-Dist: mypy==1.14.1; extra == "dev"
|
|
88
88
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
89
89
|
Requires-Dist: types-pytz; extra == "dev"
|
|
90
90
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -95,11 +95,11 @@ Requires-Dist: datachain[tests]; extra == "examples"
|
|
|
95
95
|
Requires-Dist: defusedxml; extra == "examples"
|
|
96
96
|
Requires-Dist: accelerate; extra == "examples"
|
|
97
97
|
Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
|
|
98
|
-
Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
98
|
+
Requires-Dist: unstructured[pdf]<0.16.12; extra == "examples"
|
|
99
99
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
100
100
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
101
101
|
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
102
|
-
Requires-Dist: ultralytics==8.3.
|
|
102
|
+
Requires-Dist: ultralytics==8.3.55; extra == "examples"
|
|
103
103
|
|
|
104
104
|
================
|
|
105
105
|
|logo| DataChain
|
|
@@ -56,7 +56,7 @@ dependencies = [
|
|
|
56
56
|
docs = [
|
|
57
57
|
"mkdocs>=1.5.2",
|
|
58
58
|
"mkdocs-gen-files>=0.5.0",
|
|
59
|
-
"mkdocs-material
|
|
59
|
+
"mkdocs-material==9.5.22",
|
|
60
60
|
"mkdocs-section-index>=0.3.6",
|
|
61
61
|
"mkdocstrings-python>=1.6.3",
|
|
62
62
|
"mkdocs-literate-nav>=0.6.1"
|
|
@@ -83,7 +83,7 @@ tests = [
|
|
|
83
83
|
"pytest-sugar>=0.9.6",
|
|
84
84
|
"pytest-cov>=4.1.0",
|
|
85
85
|
"pytest-mock>=3.12.0",
|
|
86
|
-
"pytest-servers[all]>=0.5.
|
|
86
|
+
"pytest-servers[all]>=0.5.9",
|
|
87
87
|
"pytest-benchmark[histogram]",
|
|
88
88
|
"pytest-xdist>=3.3.1",
|
|
89
89
|
"virtualenv",
|
|
@@ -96,7 +96,7 @@ tests = [
|
|
|
96
96
|
]
|
|
97
97
|
dev = [
|
|
98
98
|
"datachain[docs,tests]",
|
|
99
|
-
"mypy==1.14.
|
|
99
|
+
"mypy==1.14.1",
|
|
100
100
|
"types-python-dateutil",
|
|
101
101
|
"types-pytz",
|
|
102
102
|
"types-PyYAML",
|
|
@@ -108,11 +108,11 @@ examples = [
|
|
|
108
108
|
"defusedxml",
|
|
109
109
|
"accelerate",
|
|
110
110
|
"unstructured_ingest[embed-huggingface]",
|
|
111
|
-
"unstructured[pdf]",
|
|
111
|
+
"unstructured[pdf]<0.16.12",
|
|
112
112
|
"pdfplumber==0.11.4",
|
|
113
113
|
"huggingface_hub[hf_transfer]",
|
|
114
114
|
"onnx==1.16.1",
|
|
115
|
-
"ultralytics==8.3.
|
|
115
|
+
"ultralytics==8.3.55"
|
|
116
116
|
]
|
|
117
117
|
|
|
118
118
|
[project.urls]
|
|
@@ -61,14 +61,16 @@ class DataChainCache:
|
|
|
61
61
|
tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
|
|
62
62
|
size = file.size
|
|
63
63
|
if size < 0:
|
|
64
|
-
size = await client.get_size(from_path)
|
|
64
|
+
size = await client.get_size(from_path, version_id=file.version)
|
|
65
65
|
cb = callback or TqdmCallback(
|
|
66
66
|
tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True},
|
|
67
67
|
tqdm_cls=Tqdm,
|
|
68
68
|
size=size,
|
|
69
69
|
)
|
|
70
70
|
try:
|
|
71
|
-
await client.get_file(
|
|
71
|
+
await client.get_file(
|
|
72
|
+
from_path, tmp_info, callback=cb, version_id=file.version
|
|
73
|
+
)
|
|
72
74
|
finally:
|
|
73
75
|
if not callback:
|
|
74
76
|
cb.close()
|
|
@@ -240,7 +240,8 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
240
240
|
class NodeGroup:
|
|
241
241
|
"""Class for a group of nodes from the same source"""
|
|
242
242
|
|
|
243
|
-
listing: "Listing"
|
|
243
|
+
listing: Optional["Listing"]
|
|
244
|
+
client: "Client"
|
|
244
245
|
sources: list[DataSource]
|
|
245
246
|
|
|
246
247
|
# The source path within the bucket
|
|
@@ -268,9 +269,7 @@ class NodeGroup:
|
|
|
268
269
|
Download this node group to cache.
|
|
269
270
|
"""
|
|
270
271
|
if self.sources:
|
|
271
|
-
self.
|
|
272
|
-
self.iternodes(recursive), shared_progress_bar=pbar
|
|
273
|
-
)
|
|
272
|
+
self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)
|
|
274
273
|
|
|
275
274
|
|
|
276
275
|
def check_output_dataset_file(
|
|
@@ -375,7 +374,7 @@ def collect_nodes_for_cp(
|
|
|
375
374
|
|
|
376
375
|
# Collect all sources to process
|
|
377
376
|
for node_group in node_groups:
|
|
378
|
-
listing: Listing = node_group.listing
|
|
377
|
+
listing: Optional[Listing] = node_group.listing
|
|
379
378
|
valid_sources: list[DataSource] = []
|
|
380
379
|
for dsrc in node_group.sources:
|
|
381
380
|
if dsrc.is_single_object():
|
|
@@ -383,6 +382,7 @@ def collect_nodes_for_cp(
|
|
|
383
382
|
total_files += 1
|
|
384
383
|
valid_sources.append(dsrc)
|
|
385
384
|
else:
|
|
385
|
+
assert listing
|
|
386
386
|
node = dsrc.node
|
|
387
387
|
if not recursive:
|
|
388
388
|
print(f"{node.full_path} is a directory (not copied).")
|
|
@@ -433,37 +433,51 @@ def instantiate_node_groups(
|
|
|
433
433
|
)
|
|
434
434
|
|
|
435
435
|
output_dir = output
|
|
436
|
+
output_file = None
|
|
436
437
|
if copy_to_filename:
|
|
437
438
|
output_dir = os.path.dirname(output)
|
|
438
439
|
if not output_dir:
|
|
439
440
|
output_dir = "."
|
|
441
|
+
output_file = os.path.basename(output)
|
|
440
442
|
|
|
441
443
|
# Instantiate these nodes
|
|
442
444
|
for node_group in node_groups:
|
|
443
445
|
if not node_group.sources:
|
|
444
446
|
continue
|
|
445
|
-
listing: Listing = node_group.listing
|
|
447
|
+
listing: Optional[Listing] = node_group.listing
|
|
446
448
|
source_path: str = node_group.source_path
|
|
447
449
|
|
|
448
450
|
copy_dir_contents = always_copy_dir_contents or source_path.endswith("/")
|
|
449
|
-
|
|
450
|
-
node_group.sources
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
listing.
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
451
|
+
if not listing:
|
|
452
|
+
source = node_group.sources[0]
|
|
453
|
+
client = source.client
|
|
454
|
+
node = NodeWithPath(source.node, [output_file or source.node.path])
|
|
455
|
+
instantiated_nodes = [node]
|
|
456
|
+
if not virtual_only:
|
|
457
|
+
node.instantiate(
|
|
458
|
+
client, output_dir, instantiate_progress_bar, force=force
|
|
459
|
+
)
|
|
460
|
+
else:
|
|
461
|
+
instantiated_nodes = listing.collect_nodes_to_instantiate(
|
|
462
|
+
node_group.sources,
|
|
463
|
+
copy_to_filename,
|
|
464
|
+
recursive,
|
|
465
|
+
copy_dir_contents,
|
|
466
|
+
source_path,
|
|
467
|
+
node_group.is_edatachain,
|
|
468
|
+
node_group.is_dataset,
|
|
465
469
|
)
|
|
470
|
+
if not virtual_only:
|
|
471
|
+
listing.instantiate_nodes(
|
|
472
|
+
instantiated_nodes,
|
|
473
|
+
output_dir,
|
|
474
|
+
total_files,
|
|
475
|
+
force=force,
|
|
476
|
+
shared_progress_bar=instantiate_progress_bar,
|
|
477
|
+
)
|
|
478
|
+
|
|
466
479
|
node_group.instantiated_nodes = instantiated_nodes
|
|
480
|
+
|
|
467
481
|
if instantiate_progress_bar:
|
|
468
482
|
instantiate_progress_bar.close()
|
|
469
483
|
|
|
@@ -592,7 +606,7 @@ class Catalog:
|
|
|
592
606
|
client_config=None,
|
|
593
607
|
object_name="file",
|
|
594
608
|
skip_indexing=False,
|
|
595
|
-
) -> tuple["Listing", str]:
|
|
609
|
+
) -> tuple[Optional["Listing"], "Client", str]:
|
|
596
610
|
from datachain.lib.dc import DataChain
|
|
597
611
|
from datachain.listing import Listing
|
|
598
612
|
|
|
@@ -603,16 +617,19 @@ class Catalog:
|
|
|
603
617
|
list_ds_name, list_uri, list_path, _ = get_listing(
|
|
604
618
|
source, self.session, update=update
|
|
605
619
|
)
|
|
620
|
+
lst = None
|
|
621
|
+
client = Client.get_client(list_uri, self.cache, **self.client_config)
|
|
622
|
+
|
|
623
|
+
if list_ds_name:
|
|
624
|
+
lst = Listing(
|
|
625
|
+
self.metastore.clone(),
|
|
626
|
+
self.warehouse.clone(),
|
|
627
|
+
client,
|
|
628
|
+
dataset_name=list_ds_name,
|
|
629
|
+
object_name=object_name,
|
|
630
|
+
)
|
|
606
631
|
|
|
607
|
-
lst
|
|
608
|
-
self.metastore.clone(),
|
|
609
|
-
self.warehouse.clone(),
|
|
610
|
-
Client.get_client(list_uri, self.cache, **self.client_config),
|
|
611
|
-
dataset_name=list_ds_name,
|
|
612
|
-
object_name=object_name,
|
|
613
|
-
)
|
|
614
|
-
|
|
615
|
-
return lst, list_path
|
|
632
|
+
return lst, client, list_path
|
|
616
633
|
|
|
617
634
|
def _remove_dataset_rows_and_warehouse_info(
|
|
618
635
|
self, dataset: DatasetRecord, version: int, **kwargs
|
|
@@ -635,13 +652,13 @@ class Catalog:
|
|
|
635
652
|
) -> Optional[list["DataSource"]]:
|
|
636
653
|
enlisted_sources = []
|
|
637
654
|
for src in sources: # Opt: parallel
|
|
638
|
-
listing, file_path = self.enlist_source(
|
|
655
|
+
listing, client, file_path = self.enlist_source(
|
|
639
656
|
src,
|
|
640
657
|
update,
|
|
641
658
|
client_config=client_config or self.client_config,
|
|
642
659
|
skip_indexing=skip_indexing,
|
|
643
660
|
)
|
|
644
|
-
enlisted_sources.append((listing, file_path))
|
|
661
|
+
enlisted_sources.append((listing, client, file_path))
|
|
645
662
|
|
|
646
663
|
if only_index:
|
|
647
664
|
# sometimes we don't really need listing result (e.g on indexing process)
|
|
@@ -649,10 +666,16 @@ class Catalog:
|
|
|
649
666
|
return None
|
|
650
667
|
|
|
651
668
|
dsrc_all: list[DataSource] = []
|
|
652
|
-
for listing, file_path in enlisted_sources:
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
669
|
+
for listing, client, file_path in enlisted_sources:
|
|
670
|
+
if not listing:
|
|
671
|
+
nodes = [Node.from_file(client.get_file_info(file_path))]
|
|
672
|
+
dir_only = False
|
|
673
|
+
else:
|
|
674
|
+
nodes = listing.expand_path(file_path)
|
|
675
|
+
dir_only = file_path.endswith("/")
|
|
676
|
+
dsrc_all.extend(
|
|
677
|
+
DataSource(listing, client, node, dir_only) for node in nodes
|
|
678
|
+
)
|
|
656
679
|
return dsrc_all
|
|
657
680
|
|
|
658
681
|
def enlist_sources_grouped(
|
|
@@ -667,7 +690,7 @@ class Catalog:
|
|
|
667
690
|
|
|
668
691
|
def _row_to_node(d: dict[str, Any]) -> Node:
|
|
669
692
|
del d["file__source"]
|
|
670
|
-
return Node.
|
|
693
|
+
return Node.from_row(d)
|
|
671
694
|
|
|
672
695
|
enlisted_sources: list[tuple[bool, bool, Any]] = []
|
|
673
696
|
client_config = client_config or self.client_config
|
|
@@ -677,7 +700,7 @@ class Catalog:
|
|
|
677
700
|
edatachain_data = parse_edatachain_file(src)
|
|
678
701
|
indexed_sources = []
|
|
679
702
|
for ds in edatachain_data:
|
|
680
|
-
listing, source_path = self.enlist_source(
|
|
703
|
+
listing, _, source_path = self.enlist_source(
|
|
681
704
|
ds["data-source"]["uri"],
|
|
682
705
|
update,
|
|
683
706
|
client_config=client_config,
|
|
@@ -701,6 +724,7 @@ class Catalog:
|
|
|
701
724
|
client = self.get_client(source, **client_config)
|
|
702
725
|
uri = client.uri
|
|
703
726
|
dataset_name, _, _, _ = get_listing(uri, self.session)
|
|
727
|
+
assert dataset_name
|
|
704
728
|
listing = Listing(
|
|
705
729
|
self.metastore.clone(),
|
|
706
730
|
self.warehouse.clone(),
|
|
@@ -713,6 +737,7 @@ class Catalog:
|
|
|
713
737
|
indexed_sources.append(
|
|
714
738
|
(
|
|
715
739
|
listing,
|
|
740
|
+
client,
|
|
716
741
|
source,
|
|
717
742
|
[_row_to_node(r) for r in rows],
|
|
718
743
|
ds_name,
|
|
@@ -722,25 +747,28 @@ class Catalog:
|
|
|
722
747
|
|
|
723
748
|
enlisted_sources.append((False, True, indexed_sources))
|
|
724
749
|
else:
|
|
725
|
-
listing, source_path = self.enlist_source(
|
|
750
|
+
listing, client, source_path = self.enlist_source(
|
|
726
751
|
src, update, client_config=client_config
|
|
727
752
|
)
|
|
728
|
-
enlisted_sources.append((False, False, (listing, source_path)))
|
|
753
|
+
enlisted_sources.append((False, False, (listing, client, source_path)))
|
|
729
754
|
|
|
730
755
|
node_groups = []
|
|
731
756
|
for is_datachain, is_dataset, payload in enlisted_sources: # Opt: parallel
|
|
732
757
|
if is_dataset:
|
|
733
758
|
for (
|
|
734
759
|
listing,
|
|
760
|
+
client,
|
|
735
761
|
source_path,
|
|
736
762
|
nodes,
|
|
737
763
|
dataset_name,
|
|
738
764
|
dataset_version,
|
|
739
765
|
) in payload:
|
|
740
|
-
|
|
766
|
+
assert listing
|
|
767
|
+
dsrc = [DataSource(listing, client, node) for node in nodes]
|
|
741
768
|
node_groups.append(
|
|
742
769
|
NodeGroup(
|
|
743
770
|
listing,
|
|
771
|
+
client,
|
|
744
772
|
dsrc,
|
|
745
773
|
source_path,
|
|
746
774
|
dataset_name=dataset_name,
|
|
@@ -749,18 +777,30 @@ class Catalog:
|
|
|
749
777
|
)
|
|
750
778
|
elif is_datachain:
|
|
751
779
|
for listing, source_path, paths in payload:
|
|
752
|
-
|
|
780
|
+
assert listing
|
|
781
|
+
dsrc = [
|
|
782
|
+
DataSource(listing, listing.client, listing.resolve_path(p))
|
|
783
|
+
for p in paths
|
|
784
|
+
]
|
|
753
785
|
node_groups.append(
|
|
754
|
-
NodeGroup(
|
|
786
|
+
NodeGroup(
|
|
787
|
+
listing,
|
|
788
|
+
listing.client,
|
|
789
|
+
dsrc,
|
|
790
|
+
source_path,
|
|
791
|
+
is_edatachain=True,
|
|
792
|
+
)
|
|
755
793
|
)
|
|
756
794
|
else:
|
|
757
|
-
listing, source_path = payload
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
795
|
+
listing, client, source_path = payload
|
|
796
|
+
if not listing:
|
|
797
|
+
nodes = [Node.from_file(client.get_file_info(source_path))]
|
|
798
|
+
as_container = False
|
|
799
|
+
else:
|
|
800
|
+
as_container = source_path.endswith("/")
|
|
801
|
+
nodes = listing.expand_path(source_path, use_glob=not no_glob)
|
|
802
|
+
dsrc = [DataSource(listing, client, n, as_container) for n in nodes]
|
|
803
|
+
node_groups.append(NodeGroup(listing, client, dsrc, source_path))
|
|
764
804
|
|
|
765
805
|
return node_groups
|
|
766
806
|
|
|
@@ -1196,10 +1236,16 @@ class Catalog:
|
|
|
1196
1236
|
|
|
1197
1237
|
return q.to_db_records()
|
|
1198
1238
|
|
|
1199
|
-
def signed_url(
|
|
1239
|
+
def signed_url(
|
|
1240
|
+
self,
|
|
1241
|
+
source: str,
|
|
1242
|
+
path: str,
|
|
1243
|
+
version_id: Optional[str] = None,
|
|
1244
|
+
client_config=None,
|
|
1245
|
+
) -> str:
|
|
1200
1246
|
client_config = client_config or self.client_config
|
|
1201
1247
|
client = Client.get_client(source, self.cache, **client_config)
|
|
1202
|
-
return client.url(path)
|
|
1248
|
+
return client.url(path, version_id=version_id)
|
|
1203
1249
|
|
|
1204
1250
|
def export_dataset_table(
|
|
1205
1251
|
self,
|
|
@@ -4,21 +4,19 @@ from datachain.node import DirType, NodeWithPath
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
class DataSource:
|
|
7
|
-
def __init__(self, listing, node, as_container=False):
|
|
7
|
+
def __init__(self, listing, client, node, as_container=False):
|
|
8
8
|
self.listing = listing
|
|
9
|
+
self.client = client
|
|
9
10
|
self.node = node
|
|
10
11
|
self.as_container = (
|
|
11
12
|
as_container # Indicates whether a .tar file is handled as a container
|
|
12
13
|
)
|
|
13
14
|
|
|
14
|
-
def get_full_path(self):
|
|
15
|
-
return self.get_node_full_path(self.node)
|
|
16
|
-
|
|
17
15
|
def get_node_full_path(self, node):
|
|
18
|
-
return self.
|
|
16
|
+
return self.client.get_full_path(node.full_path)
|
|
19
17
|
|
|
20
18
|
def get_node_full_path_from_path(self, full_path):
|
|
21
|
-
return self.
|
|
19
|
+
return self.client.get_full_path(full_path)
|
|
22
20
|
|
|
23
21
|
def is_single_object(self):
|
|
24
22
|
return self.node.dir_type == DirType.FILE or (
|