datachain 0.14.0__tar.gz → 0.14.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.14.0/src/datachain.egg-info → datachain-0.14.1}/PKG-INFO +2 -2
- {datachain-0.14.0 → datachain-0.14.1}/pyproject.toml +3 -2
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/catalog/catalog.py +1 -1
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/fsspec.py +3 -3
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/json.py +1 -1
- datachain-0.14.1/src/datachain/lib/dc/storage.py +170 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/dataset.py +39 -16
- {datachain-0.14.0 → datachain-0.14.1/src/datachain.egg-info}/PKG-INFO +2 -2
- {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/test_ls.py +1 -1
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_datachain.py +85 -5
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_ls.py +1 -1
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_datachain.py +4 -4
- datachain-0.14.0/src/datachain/lib/dc/storage.py +0 -118
- {datachain-0.14.0 → datachain-0.14.1}/.cruft.json +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.gitattributes +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.github/codecov.yaml +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.github/dependabot.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.github/workflows/release.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.github/workflows/tests.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.gitignore +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/.pre-commit-config.yaml +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/LICENSE +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/README.rst +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/contributing.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/examples.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/index.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/overrides/main.html +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/quick-start.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/file.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/index.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/pose.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/segment.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/datachain.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/func.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/index.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/remotes.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/toolkit.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/torch.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/references/udf.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/docs/tutorials.md +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/mkdocs.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/noxfile.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/setup.cfg +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/__main__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/asyn.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cache.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/local.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/config.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/dataset.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/error.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/fs/reference.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/fs/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/array.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/base.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/conditional.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/func.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/numeric.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/path.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/random.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/string.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/window.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/job.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/datachain.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/file.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/listing.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/udf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/video.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/listing.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/bbox.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/pose.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/segment.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/node.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/progress.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/py.typed +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/batch.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/params.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/schema.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/session.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/udf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/script_meta.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/studio.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/conftest.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/data.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/examples/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/data/lena.jpg +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_client.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_data_storage.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_datasets.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_file.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_hf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_image.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_listing.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_pull.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_query.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_session.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_toolkit.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_video.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_warehouse.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/test_atomicity.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/test_cli_studio.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/test_import_time.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/test_telemetry.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/model/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_client.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_config.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_func.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_query.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_session.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.14.0 → datachain-0.14.1}/tests/utils.py +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.14.0
|
|
3
|
+
Version: 0.14.1
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
|
-
License: Apache-2.0
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
7
|
Project-URL: Documentation, https://datachain.dvc.ai
|
|
8
8
|
Project-URL: Issues, https://github.com/iterative/datachain/issues
|
|
9
9
|
Project-URL: Source, https://github.com/iterative/datachain
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
[build-system]
|
|
2
|
-
requires = ["setuptools>=
|
|
2
|
+
requires = ["setuptools>=77", "setuptools_scm[toml]>=6.3.1"]
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "datachain"
|
|
7
7
|
description = "Wrangle unstructured AI data at scale"
|
|
8
8
|
readme = "README.rst"
|
|
9
|
-
license =
|
|
9
|
+
license = "Apache-2.0"
|
|
10
|
+
license-files = ["LICENSE"]
|
|
10
11
|
authors = [{name = "Dmitry Petrov", email = "support@dvc.org"}]
|
|
11
12
|
classifiers = [
|
|
12
13
|
"Programming Language :: Python :: 3",
|
|
@@ -89,9 +89,9 @@ class Client(ABC):
|
|
|
89
89
|
from .local import FileClient
|
|
90
90
|
from .s3 import ClientS3
|
|
91
91
|
|
|
92
|
-
protocol = urlparse(
|
|
92
|
+
protocol = urlparse(os.fspath(url)).scheme
|
|
93
93
|
|
|
94
|
-
if not protocol or _is_win_local_path(
|
|
94
|
+
if not protocol or _is_win_local_path(os.fspath(url)):
|
|
95
95
|
return FileClient
|
|
96
96
|
if protocol == ClientS3.protocol:
|
|
97
97
|
return ClientS3
|
|
@@ -122,7 +122,7 @@ class Client(ABC):
|
|
|
122
122
|
source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
|
|
123
123
|
) -> "Client":
|
|
124
124
|
cls = Client.get_implementation(source)
|
|
125
|
-
storage_url, _ = cls.split_url(
|
|
125
|
+
storage_url, _ = cls.split_url(os.fspath(source))
|
|
126
126
|
if os.name == "nt":
|
|
127
127
|
storage_url = storage_url.removeprefix("/")
|
|
128
128
|
|
|
@@ -64,7 +64,7 @@ def from_json(
|
|
|
64
64
|
from .storage import from_storage
|
|
65
65
|
|
|
66
66
|
if schema_from == "auto":
|
|
67
|
-
schema_from =
|
|
67
|
+
schema_from = os.fspath(path)
|
|
68
68
|
|
|
69
69
|
def jmespath_to_name(s: str):
|
|
70
70
|
name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s) # type: ignore[union-attr]
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import os.path
|
|
2
|
+
from typing import (
|
|
3
|
+
TYPE_CHECKING,
|
|
4
|
+
Optional,
|
|
5
|
+
Union,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
from datachain.lib.file import (
|
|
9
|
+
FileType,
|
|
10
|
+
get_file_type,
|
|
11
|
+
)
|
|
12
|
+
from datachain.lib.listing import (
|
|
13
|
+
get_file_info,
|
|
14
|
+
get_listing,
|
|
15
|
+
list_bucket,
|
|
16
|
+
ls,
|
|
17
|
+
)
|
|
18
|
+
from datachain.query import Session
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from .datachain import DataChain
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def from_storage(
|
|
25
|
+
uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
|
|
26
|
+
*,
|
|
27
|
+
type: FileType = "binary",
|
|
28
|
+
session: Optional[Session] = None,
|
|
29
|
+
settings: Optional[dict] = None,
|
|
30
|
+
in_memory: bool = False,
|
|
31
|
+
recursive: Optional[bool] = True,
|
|
32
|
+
object_name: str = "file",
|
|
33
|
+
update: bool = False,
|
|
34
|
+
anon: bool = False,
|
|
35
|
+
client_config: Optional[dict] = None,
|
|
36
|
+
) -> "DataChain":
|
|
37
|
+
"""Get data from storage(s) as a list of file with all file attributes.
|
|
38
|
+
It returns the chain itself as usual.
|
|
39
|
+
|
|
40
|
+
Parameters:
|
|
41
|
+
uri : storage URI with directory or list of URIs.
|
|
42
|
+
URIs must start with storage prefix such
|
|
43
|
+
as `s3://`, `gs://`, `az://` or "file:///"
|
|
44
|
+
type : read file as "binary", "text", or "image" data. Default is "binary".
|
|
45
|
+
recursive : search recursively for the given path.
|
|
46
|
+
object_name : Created object column name.
|
|
47
|
+
update : force storage reindexing. Default is False.
|
|
48
|
+
anon : If True, we will treat cloud bucket as public one
|
|
49
|
+
client_config : Optional client configuration for the storage client.
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
DataChain: A DataChain object containing the file information.
|
|
53
|
+
|
|
54
|
+
Examples:
|
|
55
|
+
Simple call from s3:
|
|
56
|
+
```python
|
|
57
|
+
import datachain as dc
|
|
58
|
+
chain = dc.from_storage("s3://my-bucket/my-dir")
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Multiple URIs:
|
|
62
|
+
```python
|
|
63
|
+
chain = dc.from_storage([
|
|
64
|
+
"s3://bucket1/dir1",
|
|
65
|
+
"s3://bucket2/dir2"
|
|
66
|
+
])
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
With AWS S3-compatible storage:
|
|
70
|
+
```python
|
|
71
|
+
chain = dc.from_storage(
|
|
72
|
+
"s3://my-bucket/my-dir",
|
|
73
|
+
client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
|
|
74
|
+
)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Pass existing session
|
|
78
|
+
```py
|
|
79
|
+
session = Session.get()
|
|
80
|
+
chain = dc.from_storage([
|
|
81
|
+
"path/to/dir1",
|
|
82
|
+
"path/to/dir2"
|
|
83
|
+
], session=session, recursive=True)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Note:
|
|
87
|
+
When using multiple URIs with `update=True`, the function optimizes by
|
|
88
|
+
avoiding redundant updates for URIs pointing to the same storage location.
|
|
89
|
+
"""
|
|
90
|
+
from .datachain import DataChain
|
|
91
|
+
from .datasets import from_dataset
|
|
92
|
+
from .records import from_records
|
|
93
|
+
from .values import from_values
|
|
94
|
+
|
|
95
|
+
file_type = get_file_type(type)
|
|
96
|
+
|
|
97
|
+
if anon:
|
|
98
|
+
client_config = (client_config or {}) | {"anon": True}
|
|
99
|
+
session = Session.get(session, client_config=client_config, in_memory=in_memory)
|
|
100
|
+
cache = session.catalog.cache
|
|
101
|
+
client_config = session.catalog.client_config
|
|
102
|
+
|
|
103
|
+
uris = uri if isinstance(uri, (list, tuple)) else [uri]
|
|
104
|
+
|
|
105
|
+
if not uris:
|
|
106
|
+
raise ValueError("No URIs provided")
|
|
107
|
+
|
|
108
|
+
storage_chain = None
|
|
109
|
+
listed_ds_name = set()
|
|
110
|
+
file_values = []
|
|
111
|
+
|
|
112
|
+
for single_uri in uris:
|
|
113
|
+
list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
|
|
114
|
+
single_uri, session, update=update
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
# list_ds_name is None if object is a file, we don't want to use cache
|
|
118
|
+
# or do listing in that case - just read that single object
|
|
119
|
+
if not list_ds_name:
|
|
120
|
+
file_values.append(
|
|
121
|
+
get_file_info(list_uri, cache, client_config=client_config)
|
|
122
|
+
)
|
|
123
|
+
continue
|
|
124
|
+
|
|
125
|
+
dc = from_dataset(list_ds_name, session=session, settings=settings)
|
|
126
|
+
dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
|
|
127
|
+
|
|
128
|
+
if update or not list_ds_exists:
|
|
129
|
+
|
|
130
|
+
def lst_fn(ds_name, lst_uri):
|
|
131
|
+
# disable prefetch for listing, as it pre-downloads all files
|
|
132
|
+
(
|
|
133
|
+
from_records(
|
|
134
|
+
DataChain.DEFAULT_FILE_RECORD,
|
|
135
|
+
session=session,
|
|
136
|
+
settings=settings,
|
|
137
|
+
in_memory=in_memory,
|
|
138
|
+
)
|
|
139
|
+
.settings(prefetch=0)
|
|
140
|
+
.gen(
|
|
141
|
+
list_bucket(lst_uri, cache, client_config=client_config),
|
|
142
|
+
output={f"{object_name}": file_type},
|
|
143
|
+
)
|
|
144
|
+
.save(ds_name, listing=True)
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
dc._query.add_before_steps(
|
|
148
|
+
lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
chain = ls(dc, list_path, recursive=recursive, object_name=object_name)
|
|
152
|
+
|
|
153
|
+
storage_chain = storage_chain.union(chain) if storage_chain else chain
|
|
154
|
+
listed_ds_name.add(list_ds_name)
|
|
155
|
+
|
|
156
|
+
if file_values:
|
|
157
|
+
file_chain = from_values(
|
|
158
|
+
session=session,
|
|
159
|
+
settings=settings,
|
|
160
|
+
in_memory=in_memory,
|
|
161
|
+
file=file_values,
|
|
162
|
+
)
|
|
163
|
+
file_chain.signals_schema = file_chain.signals_schema.mutate(
|
|
164
|
+
{f"{object_name}": file_type}
|
|
165
|
+
)
|
|
166
|
+
storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain
|
|
167
|
+
|
|
168
|
+
assert storage_chain is not None
|
|
169
|
+
|
|
170
|
+
return storage_chain
|
|
@@ -47,6 +47,7 @@ from datachain.error import (
|
|
|
47
47
|
QueryScriptCancelError,
|
|
48
48
|
)
|
|
49
49
|
from datachain.func.base import Function
|
|
50
|
+
from datachain.lib.listing import is_listing_dataset
|
|
50
51
|
from datachain.lib.udf import UDFAdapter, _get_cache
|
|
51
52
|
from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
|
|
52
53
|
from datachain.query.schema import C, UDFParamSpec, normalize_param
|
|
@@ -151,13 +152,6 @@ def step_result(
|
|
|
151
152
|
)
|
|
152
153
|
|
|
153
154
|
|
|
154
|
-
class StartingStep(ABC):
|
|
155
|
-
"""An initial query processing step, referencing a data source."""
|
|
156
|
-
|
|
157
|
-
@abstractmethod
|
|
158
|
-
def apply(self) -> "StepResult": ...
|
|
159
|
-
|
|
160
|
-
|
|
161
155
|
@frozen
|
|
162
156
|
class Step(ABC):
|
|
163
157
|
"""A query processing step (filtering, mutation, etc.)"""
|
|
@@ -170,7 +164,7 @@ class Step(ABC):
|
|
|
170
164
|
|
|
171
165
|
|
|
172
166
|
@frozen
|
|
173
|
-
class QueryStep(StartingStep):
|
|
167
|
+
class QueryStep:
|
|
174
168
|
catalog: "Catalog"
|
|
175
169
|
dataset_name: str
|
|
176
170
|
dataset_version: int
|
|
@@ -1097,26 +1091,42 @@ class DatasetQuery:
|
|
|
1097
1091
|
self.temp_table_names: list[str] = []
|
|
1098
1092
|
self.dependencies: set[DatasetDependencyType] = set()
|
|
1099
1093
|
self.table = self.get_table()
|
|
1100
|
-
self.starting_step: StartingStep
|
|
1094
|
+
self.starting_step: Optional[QueryStep] = None
|
|
1101
1095
|
self.name: Optional[str] = None
|
|
1102
1096
|
self.version: Optional[int] = None
|
|
1103
1097
|
self.feature_schema: Optional[dict] = None
|
|
1104
1098
|
self.column_types: Optional[dict[str, Any]] = None
|
|
1099
|
+
self.before_steps: list[Callable] = []
|
|
1105
1100
|
|
|
1106
|
-
self.
|
|
1101
|
+
self.list_ds_name: Optional[str] = None
|
|
1107
1102
|
|
|
1108
|
-
|
|
1109
|
-
|
|
1103
|
+
self.name = name
|
|
1104
|
+
self.dialect = self.catalog.warehouse.db.dialect
|
|
1105
|
+
if version:
|
|
1106
|
+
self.version = version
|
|
1107
|
+
|
|
1108
|
+
if is_listing_dataset(name):
|
|
1109
|
+
# not setting query step yet as listing dataset might not exist at
|
|
1110
|
+
# this point
|
|
1111
|
+
self.list_ds_name = name
|
|
1112
|
+
elif fallback_to_studio and is_token_set():
|
|
1113
|
+
self._set_starting_step(
|
|
1114
|
+
self.catalog.get_dataset_with_remote_fallback(name, version)
|
|
1115
|
+
)
|
|
1110
1116
|
else:
|
|
1111
|
-
|
|
1117
|
+
self._set_starting_step(self.catalog.get_dataset(name))
|
|
1118
|
+
|
|
1119
|
+
def _set_starting_step(self, ds: "DatasetRecord") -> None:
|
|
1120
|
+
if not self.version:
|
|
1121
|
+
self.version = ds.latest_version
|
|
1112
1122
|
|
|
1113
|
-
self.
|
|
1123
|
+
self.starting_step = QueryStep(self.catalog, ds.name, self.version)
|
|
1124
|
+
|
|
1125
|
+
# at this point we know our starting dataset so setting up schemas
|
|
1114
1126
|
self.feature_schema = ds.get_version(self.version).feature_schema
|
|
1115
1127
|
self.column_types = copy(ds.schema)
|
|
1116
1128
|
if "sys__id" in self.column_types:
|
|
1117
1129
|
self.column_types.pop("sys__id")
|
|
1118
|
-
self.starting_step = QueryStep(self.catalog, name, self.version)
|
|
1119
|
-
self.dialect = self.catalog.warehouse.db.dialect
|
|
1120
1130
|
|
|
1121
1131
|
def __iter__(self):
|
|
1122
1132
|
return iter(self.db_results())
|
|
@@ -1180,11 +1190,23 @@ class DatasetQuery:
|
|
|
1180
1190
|
col.table = self.table
|
|
1181
1191
|
return col
|
|
1182
1192
|
|
|
1193
|
+
def add_before_steps(self, fn: Callable) -> None:
|
|
1194
|
+
"""
|
|
1195
|
+
Setting custom function to be run before applying steps
|
|
1196
|
+
"""
|
|
1197
|
+
self.before_steps.append(fn)
|
|
1198
|
+
|
|
1183
1199
|
def apply_steps(self) -> QueryGenerator:
|
|
1184
1200
|
"""
|
|
1185
1201
|
Apply the steps in the query and return the resulting
|
|
1186
1202
|
sqlalchemy.SelectBase.
|
|
1187
1203
|
"""
|
|
1204
|
+
for fn in self.before_steps:
|
|
1205
|
+
fn()
|
|
1206
|
+
|
|
1207
|
+
if self.list_ds_name:
|
|
1208
|
+
# at this point we know what is our starting listing dataset name
|
|
1209
|
+
self._set_starting_step(self.catalog.get_dataset(self.list_ds_name)) # type: ignore [arg-type]
|
|
1188
1210
|
query = self.clone()
|
|
1189
1211
|
|
|
1190
1212
|
index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)
|
|
@@ -1203,6 +1225,7 @@ class DatasetQuery:
|
|
|
1203
1225
|
query = query.filter(C.sys__rand % total == index)
|
|
1204
1226
|
query.steps = query.steps[-1:] + query.steps[:-1]
|
|
1205
1227
|
|
|
1228
|
+
assert query.starting_step
|
|
1206
1229
|
result = query.starting_step.apply()
|
|
1207
1230
|
self.dependencies.update(result.dependencies)
|
|
1208
1231
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.14.0
|
|
3
|
+
Version: 0.14.1
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
|
-
License: Apache-2.0
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
7
|
Project-URL: Documentation, https://datachain.dvc.ai
|
|
8
8
|
Project-URL: Issues, https://github.com/iterative/datachain/issues
|
|
9
9
|
Project-URL: Source, https://github.com/iterative/datachain
|
|
@@ -9,6 +9,7 @@ import uuid
|
|
|
9
9
|
from collections.abc import Iterator
|
|
10
10
|
from datetime import datetime, timedelta, timezone
|
|
11
11
|
from pathlib import Path
|
|
12
|
+
from unittest.mock import patch
|
|
12
13
|
|
|
13
14
|
import numpy as np
|
|
14
15
|
import pandas as pd
|
|
@@ -152,7 +153,7 @@ def test_from_storage_partials(cloud_test_catalog):
|
|
|
152
153
|
return name
|
|
153
154
|
|
|
154
155
|
dogs_uri = f"{src_uri}/dogs"
|
|
155
|
-
dc.from_storage(dogs_uri, session=session)
|
|
156
|
+
dc.from_storage(dogs_uri, session=session).exec()
|
|
156
157
|
assert _get_listing_datasets(session) == [
|
|
157
158
|
f"{_list_dataset_name(dogs_uri)}@v1",
|
|
158
159
|
]
|
|
@@ -162,7 +163,7 @@ def test_from_storage_partials(cloud_test_catalog):
|
|
|
162
163
|
f"{_list_dataset_name(dogs_uri)}@v1",
|
|
163
164
|
]
|
|
164
165
|
|
|
165
|
-
dc.from_storage(src_uri, session=session)
|
|
166
|
+
dc.from_storage(src_uri, session=session).exec()
|
|
166
167
|
assert _get_listing_datasets(session) == sorted(
|
|
167
168
|
[
|
|
168
169
|
f"{_list_dataset_name(dogs_uri)}@v1",
|
|
@@ -170,7 +171,7 @@ def test_from_storage_partials(cloud_test_catalog):
|
|
|
170
171
|
]
|
|
171
172
|
)
|
|
172
173
|
|
|
173
|
-
dc.from_storage(f"{src_uri}/cats", session=session)
|
|
174
|
+
dc.from_storage(f"{src_uri}/cats", session=session).exec()
|
|
174
175
|
assert _get_listing_datasets(session) == sorted(
|
|
175
176
|
[
|
|
176
177
|
f"{_list_dataset_name(dogs_uri)}@v1",
|
|
@@ -196,14 +197,14 @@ def test_from_storage_partials_with_update(cloud_test_catalog):
|
|
|
196
197
|
return name
|
|
197
198
|
|
|
198
199
|
uri = f"{src_uri}/cats"
|
|
199
|
-
dc.from_storage(uri, session=session)
|
|
200
|
+
dc.from_storage(uri, session=session).exec()
|
|
200
201
|
assert _get_listing_datasets(session) == sorted(
|
|
201
202
|
[
|
|
202
203
|
f"{_list_dataset_name(uri)}@v1",
|
|
203
204
|
]
|
|
204
205
|
)
|
|
205
206
|
|
|
206
|
-
dc.from_storage(uri, session=session, update=True)
|
|
207
|
+
dc.from_storage(uri, session=session, update=True).exec()
|
|
207
208
|
assert _get_listing_datasets(session) == sorted(
|
|
208
209
|
[
|
|
209
210
|
f"{_list_dataset_name(uri)}@v1",
|
|
@@ -369,6 +370,85 @@ def test_export_images_files(test_session, tmp_dir, tmp_path, use_cache):
|
|
|
369
370
|
assert images_equal(img["data"], exported_img)
|
|
370
371
|
|
|
371
372
|
|
|
373
|
+
@pytest.mark.parametrize("use_cache", [True, False])
|
|
374
|
+
def test_from_storage_multiple_uris_files(test_session, tmp_dir, tmp_path, use_cache):
|
|
375
|
+
images = [
|
|
376
|
+
{"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))},
|
|
377
|
+
{"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))},
|
|
378
|
+
]
|
|
379
|
+
|
|
380
|
+
for img in images:
|
|
381
|
+
img["data"].save(tmp_path / img["name"])
|
|
382
|
+
|
|
383
|
+
dc.from_storage(
|
|
384
|
+
[
|
|
385
|
+
f"file://{tmp_path}/img1.jpg",
|
|
386
|
+
f"file://{tmp_path}/img2.jpg",
|
|
387
|
+
],
|
|
388
|
+
session=test_session,
|
|
389
|
+
anon=True,
|
|
390
|
+
update=True,
|
|
391
|
+
).to_storage(tmp_dir / "output", placement="filename")
|
|
392
|
+
|
|
393
|
+
for img in images:
|
|
394
|
+
exported_img = Image.open(tmp_dir / "output" / img["name"])
|
|
395
|
+
assert images_equal(img["data"], exported_img)
|
|
396
|
+
|
|
397
|
+
chain = dc.from_storage(
|
|
398
|
+
[
|
|
399
|
+
f"file://{tmp_path}/img1.jpg",
|
|
400
|
+
f"file://{tmp_path}/img2.jpg",
|
|
401
|
+
f"file://{tmp_dir}/output/*",
|
|
402
|
+
]
|
|
403
|
+
)
|
|
404
|
+
assert chain.count() == 4
|
|
405
|
+
|
|
406
|
+
chain = dc.from_storage([f"file://{tmp_dir}/output/*"])
|
|
407
|
+
assert chain.count() == 2
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
@pytest.mark.parametrize(
|
|
411
|
+
"cloud_type",
|
|
412
|
+
["s3", "azure", "gs"],
|
|
413
|
+
indirect=True,
|
|
414
|
+
)
|
|
415
|
+
def test_from_storage_multiple_uris_cache(cloud_test_catalog):
|
|
416
|
+
ctc = cloud_test_catalog
|
|
417
|
+
src_uri = ctc.src_uri
|
|
418
|
+
session = ctc.session
|
|
419
|
+
|
|
420
|
+
with pytest.raises(ValueError):
|
|
421
|
+
dc.from_storage([]) # No URIs provided
|
|
422
|
+
|
|
423
|
+
with patch(
|
|
424
|
+
"datachain.lib.dc.storage.get_listing", wraps=dc.lib.listing.get_listing
|
|
425
|
+
) as mock_get_listing:
|
|
426
|
+
chain = dc.from_storage(
|
|
427
|
+
[
|
|
428
|
+
f"{src_uri}/cats",
|
|
429
|
+
f"{src_uri}/dogs",
|
|
430
|
+
f"{src_uri}/cats/cat*",
|
|
431
|
+
f"{src_uri}/dogs/dog*",
|
|
432
|
+
],
|
|
433
|
+
session=session,
|
|
434
|
+
update=True,
|
|
435
|
+
).exec()
|
|
436
|
+
assert chain.count() == 11
|
|
437
|
+
|
|
438
|
+
files = chain.collect("file")
|
|
439
|
+
assert {f.name for f in files} == {
|
|
440
|
+
"cat1",
|
|
441
|
+
"cat2",
|
|
442
|
+
"dog1",
|
|
443
|
+
"dog2",
|
|
444
|
+
"dog3",
|
|
445
|
+
"dog4",
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
# Verify from_records was called exactly twice
|
|
449
|
+
assert mock_get_listing.call_count == 4 # TODO FIX THIS
|
|
450
|
+
|
|
451
|
+
|
|
372
452
|
def test_from_storage_path_object(test_session, tmp_dir, tmp_path):
|
|
373
453
|
images = [
|
|
374
454
|
{"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))},
|
|
@@ -32,7 +32,7 @@ def test_ls_no_args(cloud_test_catalog, cloud_type, capsys):
|
|
|
32
32
|
catalog = session.catalog
|
|
33
33
|
src = cloud_test_catalog.src_uri
|
|
34
34
|
|
|
35
|
-
dc.from_storage(src, session=session).
|
|
35
|
+
dc.from_storage(src, session=session).exec()
|
|
36
36
|
ls([], catalog=catalog)
|
|
37
37
|
captured = capsys.readouterr()
|
|
38
38
|
assert captured.out == f"{src}/@v1\n"
|
|
@@ -339,7 +339,7 @@ def test_listings(test_session, tmp_dir):
|
|
|
339
339
|
df.to_parquet(tmp_dir / "df.parquet")
|
|
340
340
|
|
|
341
341
|
uri = tmp_dir.as_uri()
|
|
342
|
-
dc.from_storage(uri, session=test_session)
|
|
342
|
+
dc.from_storage(uri, session=test_session).exec()
|
|
343
343
|
|
|
344
344
|
# check that listing is not returned as normal dataset
|
|
345
345
|
assert not any(
|
|
@@ -370,13 +370,13 @@ def test_listings_reindex(test_session, tmp_dir):
|
|
|
370
370
|
|
|
371
371
|
uri = tmp_dir.as_uri()
|
|
372
372
|
|
|
373
|
-
dc.from_storage(uri, session=test_session)
|
|
373
|
+
dc.from_storage(uri, session=test_session).exec()
|
|
374
374
|
assert len(list(dc.listings(session=test_session).collect("listing"))) == 1
|
|
375
375
|
|
|
376
|
-
dc.from_storage(uri, session=test_session)
|
|
376
|
+
dc.from_storage(uri, session=test_session).exec()
|
|
377
377
|
assert len(list(dc.listings(session=test_session).collect("listing"))) == 1
|
|
378
378
|
|
|
379
|
-
dc.from_storage(uri, session=test_session, update=True)
|
|
379
|
+
dc.from_storage(uri, session=test_session, update=True).exec()
|
|
380
380
|
listings = list(dc.listings(session=test_session).collect("listing"))
|
|
381
381
|
assert len(listings) == 2
|
|
382
382
|
listings.sort(key=lambda lst: lst.version)
|
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
import os.path
|
|
2
|
-
from typing import (
|
|
3
|
-
TYPE_CHECKING,
|
|
4
|
-
Optional,
|
|
5
|
-
Union,
|
|
6
|
-
)
|
|
7
|
-
|
|
8
|
-
from datachain.lib.file import (
|
|
9
|
-
File,
|
|
10
|
-
FileType,
|
|
11
|
-
get_file_type,
|
|
12
|
-
)
|
|
13
|
-
from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
|
|
14
|
-
from datachain.query import Session
|
|
15
|
-
|
|
16
|
-
if TYPE_CHECKING:
|
|
17
|
-
from .datachain import DataChain
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def from_storage(
|
|
21
|
-
uri: Union[str, os.PathLike[str]],
|
|
22
|
-
*,
|
|
23
|
-
type: FileType = "binary",
|
|
24
|
-
session: Optional[Session] = None,
|
|
25
|
-
settings: Optional[dict] = None,
|
|
26
|
-
in_memory: bool = False,
|
|
27
|
-
recursive: Optional[bool] = True,
|
|
28
|
-
object_name: str = "file",
|
|
29
|
-
update: bool = False,
|
|
30
|
-
anon: bool = False,
|
|
31
|
-
client_config: Optional[dict] = None,
|
|
32
|
-
) -> "DataChain":
|
|
33
|
-
"""Get data from a storage as a list of file with all file attributes.
|
|
34
|
-
It returns the chain itself as usual.
|
|
35
|
-
|
|
36
|
-
Parameters:
|
|
37
|
-
uri : storage URI with directory. URI must start with storage prefix such
|
|
38
|
-
as `s3://`, `gs://`, `az://` or "file:///"
|
|
39
|
-
type : read file as "binary", "text", or "image" data. Default is "binary".
|
|
40
|
-
recursive : search recursively for the given path.
|
|
41
|
-
object_name : Created object column name.
|
|
42
|
-
update : force storage reindexing. Default is False.
|
|
43
|
-
anon : If True, we will treat cloud bucket as public one
|
|
44
|
-
client_config : Optional client configuration for the storage client.
|
|
45
|
-
|
|
46
|
-
Example:
|
|
47
|
-
Simple call from s3
|
|
48
|
-
```py
|
|
49
|
-
import datachain as dc
|
|
50
|
-
chain = dc.from_storage("s3://my-bucket/my-dir")
|
|
51
|
-
```
|
|
52
|
-
|
|
53
|
-
With AWS S3-compatible storage
|
|
54
|
-
```py
|
|
55
|
-
import datachain as dc
|
|
56
|
-
chain = dc.from_storage(
|
|
57
|
-
"s3://my-bucket/my-dir",
|
|
58
|
-
client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
|
|
59
|
-
)
|
|
60
|
-
```
|
|
61
|
-
|
|
62
|
-
Pass existing session
|
|
63
|
-
```py
|
|
64
|
-
session = Session.get()
|
|
65
|
-
import datachain as dc
|
|
66
|
-
chain = dc.from_storage("s3://my-bucket/my-dir", session=session)
|
|
67
|
-
```
|
|
68
|
-
"""
|
|
69
|
-
from .datachain import DataChain
|
|
70
|
-
from .datasets import from_dataset
|
|
71
|
-
from .records import from_records
|
|
72
|
-
from .values import from_values
|
|
73
|
-
|
|
74
|
-
file_type = get_file_type(type)
|
|
75
|
-
|
|
76
|
-
if anon:
|
|
77
|
-
client_config = (client_config or {}) | {"anon": True}
|
|
78
|
-
session = Session.get(session, client_config=client_config, in_memory=in_memory)
|
|
79
|
-
cache = session.catalog.cache
|
|
80
|
-
client_config = session.catalog.client_config
|
|
81
|
-
|
|
82
|
-
list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
|
|
83
|
-
uri, session, update=update
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
# ds_name is None if object is a file, we don't want to use cache
|
|
87
|
-
# or do listing in that case - just read that single object
|
|
88
|
-
if not list_ds_name:
|
|
89
|
-
dc = from_values(
|
|
90
|
-
session=session,
|
|
91
|
-
settings=settings,
|
|
92
|
-
in_memory=in_memory,
|
|
93
|
-
file=[get_file_info(list_uri, cache, client_config=client_config)],
|
|
94
|
-
)
|
|
95
|
-
dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
|
|
96
|
-
return dc
|
|
97
|
-
|
|
98
|
-
if update or not list_ds_exists:
|
|
99
|
-
# disable prefetch for listing, as it pre-downloads all files
|
|
100
|
-
(
|
|
101
|
-
from_records(
|
|
102
|
-
DataChain.DEFAULT_FILE_RECORD,
|
|
103
|
-
session=session,
|
|
104
|
-
settings=settings,
|
|
105
|
-
in_memory=in_memory,
|
|
106
|
-
)
|
|
107
|
-
.settings(prefetch=0)
|
|
108
|
-
.gen(
|
|
109
|
-
list_bucket(list_uri, cache, client_config=client_config),
|
|
110
|
-
output={f"{object_name}": File},
|
|
111
|
-
)
|
|
112
|
-
.save(list_ds_name, listing=True)
|
|
113
|
-
)
|
|
114
|
-
|
|
115
|
-
dc = from_dataset(list_ds_name, session=session, settings=settings)
|
|
116
|
-
dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
|
|
117
|
-
|
|
118
|
-
return ls(dc, list_path, recursive=recursive, object_name=object_name)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|