datachain 0.10.0__tar.gz → 0.11.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.10.0 → datachain-0.11.11}/.github/workflows/benchmarks.yml +2 -2
- {datachain-0.10.0 → datachain-0.11.11}/.github/workflows/release.yml +2 -2
- {datachain-0.10.0 → datachain-0.11.11}/.github/workflows/tests.yml +15 -7
- {datachain-0.10.0 → datachain-0.11.11}/.pre-commit-config.yaml +1 -1
- {datachain-0.10.0 → datachain-0.11.11}/PKG-INFO +5 -4
- datachain-0.11.11/docs/references/remotes.md +346 -0
- {datachain-0.10.0 → datachain-0.11.11}/mkdocs.yml +1 -0
- {datachain-0.10.0 → datachain-0.11.11}/noxfile.py +14 -19
- {datachain-0.10.0 → datachain-0.11.11}/pyproject.toml +5 -6
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/__init__.py +1 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/show.py +12 -1
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/parser/utils.py +6 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/data_model.py +6 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/dc.py +91 -20
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/file.py +52 -11
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/signal_schema.py +194 -15
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/nodes_thread_pool.py +32 -11
- datachain-0.11.11/src/datachain/script_meta.py +147 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/utils.py +3 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/PKG-INFO +5 -4
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/SOURCES.txt +4 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/requires.txt +5 -3
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_datachain.py +311 -15
- datachain-0.11.11/tests/func/test_hidden_field.py +70 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_datachain.py +80 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_signal_schema.py +140 -0
- datachain-0.11.11/tests/unit/test_script_meta.py +119 -0
- {datachain-0.10.0 → datachain-0.11.11}/.cruft.json +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/.gitattributes +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/.github/codecov.yaml +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/.github/dependabot.yml +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/.gitignore +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/LICENSE +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/README.rst +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/assets/datachain.svg +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/contributing.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/examples.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/index.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/overrides/main.html +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/quick-start.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/file.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/index.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/pose.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/segment.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/datachain.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/func.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/index.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/toolkit.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/torch.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/references/udf.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/docs/tutorials.md +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/multimodal/wds.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/setup.cfg +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/__main__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/asyn.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cache.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/utils.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/azure.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/gcs.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/hf.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/local.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/s3.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/config.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/dataset.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/error.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/fs/reference.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/array.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/base.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/conditional.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/func.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/numeric.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/path.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/random.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/string.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/window.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/job.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/clip.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/hf.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/image.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/listing.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/settings.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/tar.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/text.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/udf.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/utils.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/video.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/listing.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/bbox.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/pose.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/segment.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/node.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/progress.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/py.typed +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/batch.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/dataset.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/metrics.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/params.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/queue.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/schema.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/session.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/udf.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/utils.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/remote/studio.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/types.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/utils.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/studio.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/telemetry.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/conftest.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/data.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/examples/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/examples/test_examples.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/examples/wds_data.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_catalog.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_client.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_data_storage.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_datasets.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_file.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_hf.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_listing.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_ls.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_metrics.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_pull.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_pytorch.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_query.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_session.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_toolkit.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_warehouse.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/scripts/feature_class.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/test_atomicity.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/test_cli_e2e.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/test_cli_studio.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/test_query_e2e.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/test_telemetry.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_video.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_asyn.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_cache.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_catalog.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_client.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_config.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_dataset.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_func.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_listing.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_metastore.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_query.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_query_params.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_serializer.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_session.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_utils.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.10.0 → datachain-0.11.11}/tests/utils.py +0 -0
|
@@ -19,10 +19,10 @@ jobs:
|
|
|
19
19
|
runs-on: ubuntu-latest
|
|
20
20
|
steps:
|
|
21
21
|
- uses: actions/checkout@v4
|
|
22
|
-
- name: Set up Python 3.
|
|
22
|
+
- name: Set up Python 3.13
|
|
23
23
|
uses: actions/setup-python@v5
|
|
24
24
|
with:
|
|
25
|
-
python-version: '3.
|
|
25
|
+
python-version: '3.13'
|
|
26
26
|
|
|
27
27
|
- name: Setup uv
|
|
28
28
|
uses: astral-sh/setup-uv@v5
|
|
@@ -60,16 +60,16 @@ jobs:
|
|
|
60
60
|
fail-fast: false
|
|
61
61
|
matrix:
|
|
62
62
|
os: [ubuntu-latest-8-cores]
|
|
63
|
-
pyv: ['3.9', '3.10', '3.11', '3.12']
|
|
63
|
+
pyv: ['3.9', '3.10', '3.11', '3.12', '3.13']
|
|
64
64
|
include:
|
|
65
65
|
- os: macos-latest
|
|
66
66
|
pyv: '3.9'
|
|
67
67
|
- os: macos-latest
|
|
68
|
-
pyv: '3.
|
|
68
|
+
pyv: '3.13'
|
|
69
69
|
- os: windows-latest
|
|
70
70
|
pyv: '3.9'
|
|
71
71
|
- os: windows-latest
|
|
72
|
-
pyv: '3.
|
|
72
|
+
pyv: '3.13'
|
|
73
73
|
|
|
74
74
|
steps:
|
|
75
75
|
- name: Check out the repository
|
|
@@ -80,6 +80,14 @@ jobs:
|
|
|
80
80
|
|
|
81
81
|
- name: Set up FFmpeg
|
|
82
82
|
uses: AnimMouse/setup-ffmpeg@v1
|
|
83
|
+
id: ffmpeg-install
|
|
84
|
+
continue-on-error: ${{ runner.os == 'macOS' }}
|
|
85
|
+
|
|
86
|
+
# https://github.com/AnimMouse/setup-ffmpeg/issues/5
|
|
87
|
+
- if: steps.ffmpeg-install.outcome == 'failure' && runner.os == 'macOS'
|
|
88
|
+
run: brew install ffmpeg
|
|
89
|
+
env:
|
|
90
|
+
HOMEBREW_NO_AUTO_UPDATE: "1"
|
|
83
91
|
|
|
84
92
|
- name: Set up Python ${{ matrix.pyv }}
|
|
85
93
|
uses: actions/setup-python@v5
|
|
@@ -132,14 +140,14 @@ jobs:
|
|
|
132
140
|
fail-fast: false
|
|
133
141
|
matrix:
|
|
134
142
|
os: [ubuntu-latest, windows-latest]
|
|
135
|
-
pyv: ['3.9', '3.
|
|
143
|
+
pyv: ['3.9', '3.13']
|
|
136
144
|
group: ['get_started', 'computer_vision', 'llm_and_nlp', 'multimodal']
|
|
137
145
|
exclude:
|
|
138
146
|
- {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
|
|
139
|
-
- {os: ubuntu-latest, pyv: '3.
|
|
147
|
+
- {os: ubuntu-latest, pyv: '3.13', group: 'multimodal'}
|
|
140
148
|
include:
|
|
141
149
|
- {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
|
|
142
|
-
- {os: ubuntu-latest-4-cores, pyv: "3.
|
|
150
|
+
- {os: ubuntu-latest-4-cores, pyv: "3.13", group: multimodal}
|
|
143
151
|
|
|
144
152
|
steps:
|
|
145
153
|
- uses: actions/checkout@v4
|
|
@@ -163,7 +171,7 @@ jobs:
|
|
|
163
171
|
|
|
164
172
|
# HF runs against actual API - thus run it only once
|
|
165
173
|
- name: Set hf token
|
|
166
|
-
if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.
|
|
174
|
+
if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.13'
|
|
167
175
|
run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
|
|
168
176
|
|
|
169
177
|
- name: Run examples
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.10.0
|
|
3
|
+
Version: 0.11.11
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.10
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.11
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
16
|
Classifier: Development Status :: 2 - Pre-Alpha
|
|
16
17
|
Requires-Python: >=3.9
|
|
17
18
|
Description-Content-Type: text/x-rst
|
|
@@ -49,6 +50,7 @@ Requires-Dist: platformdirs
|
|
|
49
50
|
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
50
51
|
Requires-Dist: tabulate
|
|
51
52
|
Requires-Dist: websockets
|
|
53
|
+
Requires-Dist: tomli; python_version < "3.11"
|
|
52
54
|
Provides-Extra: docs
|
|
53
55
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
54
56
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
@@ -70,9 +72,8 @@ Provides-Extra: hf
|
|
|
70
72
|
Requires-Dist: numba>=0.60.0; extra == "hf"
|
|
71
73
|
Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
|
|
72
74
|
Provides-Extra: video
|
|
73
|
-
Requires-Dist: av<14; extra == "video"
|
|
74
75
|
Requires-Dist: ffmpeg-python; extra == "video"
|
|
75
|
-
Requires-Dist: imageio[ffmpeg]; extra == "video"
|
|
76
|
+
Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
|
|
76
77
|
Requires-Dist: opencv-python; extra == "video"
|
|
77
78
|
Provides-Extra: tests
|
|
78
79
|
Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
|
|
@@ -102,7 +103,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
|
|
|
102
103
|
Requires-Dist: defusedxml; extra == "examples"
|
|
103
104
|
Requires-Dist: accelerate; extra == "examples"
|
|
104
105
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
105
|
-
Requires-Dist: ultralytics==8.3.
|
|
106
|
+
Requires-Dist: ultralytics==8.3.82; extra == "examples"
|
|
106
107
|
Requires-Dist: open_clip_torch; extra == "examples"
|
|
107
108
|
|
|
108
109
|
================
|
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
# Interacting with remote storage
|
|
2
|
+
|
|
3
|
+
DataChain supports reading and writing data from different remote storages using methods like `DataChain.from_storage` and `DataChain.to_storage`. Supported storages include the local file system, AWS S3, Google Cloud Storage, Azure Blob Storage, Hugging Face, and more.
|
|
4
|
+
|
|
5
|
+
Example implementation for reading and writing data from/to different remote storages:
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from datachain import DataChain
|
|
9
|
+
|
|
10
|
+
dc = DataChain.from_storage("s3://bucket-name/path/to/data")
|
|
11
|
+
dc.to_storage("gs://bucket-name/path/to/data")
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
DataChain uses [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) to interact with different remote storages. You can pass the following fsspec-supported URIs to `from_storage` and `to_storage` methods.
|
|
15
|
+
|
|
16
|
+
- Local file system: `file://path/to/data`
|
|
17
|
+
- AWS S3 storage: `s3://bucket-name/path/to/data`
|
|
18
|
+
- Google Cloud Storage: `gs://bucket-name/path/to/data`
|
|
19
|
+
- Azure Blob Storage: `az://container-name/path/to/data`
|
|
20
|
+
- Hugging Face: `hf://dataset-name`
|
|
21
|
+
|
|
22
|
+
## Extra configuration
|
|
23
|
+
To configure the filesystem, you can pass key-value pairs in a `client_config` dictionary, which is passed to the respective filesystem.
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
### AWS S3 compatible storage
|
|
27
|
+
|
|
28
|
+
DataChain uses [s3fs](https://s3fs.readthedocs.io/en/latest/) to interact with AWS S3 storage. Authentication can be configured using standard AWS credential locations, such as `~/.aws/credentials` and `~/.aws/config`. You can also pass the following configuration parameters to the s3fs filesystem as `client_config` dictionary.
|
|
29
|
+
|
|
30
|
+
- `anon`: `bool` (default: `False`)
|
|
31
|
+
|
|
32
|
+
Whether to use anonymous connection (public buckets only). If `False`,
|
|
33
|
+
uses the key/secret given, or boto's credential resolver (client_kwargs,
|
|
34
|
+
environment variables, config files, EC2 IAM server, in that order)
|
|
35
|
+
|
|
36
|
+
- `endpoint_url`: `string` (default: `None`)
|
|
37
|
+
|
|
38
|
+
Use this endpoint URL, if specified. Needed for connecting to non-AWS
|
|
39
|
+
S3 buckets. Takes precedence over `endpoint_url` in client_kwargs.
|
|
40
|
+
|
|
41
|
+
- `key`: `string` (default: `None`)
|
|
42
|
+
|
|
43
|
+
If not anonymous, use this access key ID, if specified. Takes precedence
|
|
44
|
+
over `aws_access_key_id` in client_kwargs.
|
|
45
|
+
|
|
46
|
+
- `secret`: `string` (default: `None`)
|
|
47
|
+
|
|
48
|
+
If not anonymous, use this secret access key, if specified. Takes
|
|
49
|
+
precedence over `aws_secret_access_key` in client_kwargs.
|
|
50
|
+
|
|
51
|
+
- `token`: `string` (default: `None`)
|
|
52
|
+
|
|
53
|
+
If not anonymous, use this security token, if specified
|
|
54
|
+
|
|
55
|
+
- `use_ssl`: `bool` (default: `True`)
|
|
56
|
+
|
|
57
|
+
Whether to use SSL in connections to S3; may be faster without, but
|
|
58
|
+
insecure. If `use_ssl` is also set in `client_kwargs`,
|
|
59
|
+
the value set in `client_kwargs` will take priority.
|
|
60
|
+
|
|
61
|
+
- `s3_additional_kwargs`: `dict` (default: `{}`)
|
|
62
|
+
|
|
63
|
+
Dict of parameters that are used when calling s3 api
|
|
64
|
+
methods. Typically used for things like "ServerSideEncryption".
|
|
65
|
+
|
|
66
|
+
- `client_kwargs`: `dict` (default: `{}`)
|
|
67
|
+
|
|
68
|
+
Dict of parameters for the botocore client.
|
|
69
|
+
|
|
70
|
+
- `requester_pays`: `bool` (default: `False`)
|
|
71
|
+
|
|
72
|
+
If RequesterPays buckets are supported.
|
|
73
|
+
|
|
74
|
+
- `default_block_size`: `int` (default: `None`)
|
|
75
|
+
|
|
76
|
+
If given, the default block size value used for `open()`, if no
|
|
77
|
+
specific value is given at call time. The built-in default is 5MB.
|
|
78
|
+
|
|
79
|
+
- `default_fill_cache`: `bool` (default: `True`)
|
|
80
|
+
|
|
81
|
+
Whether to use cache filling with open by default. Refer to `S3File.open`.
|
|
82
|
+
|
|
83
|
+
- `default_cache_type`: `string` (default: `"readahead"`)
|
|
84
|
+
|
|
85
|
+
If given, the default cache_type value used for `open()`. Set to `None`
|
|
86
|
+
if no caching is desired. See fsspec's documentation for other available
|
|
87
|
+
`cache_type` values. Default cache_type is `"readahead"`.
|
|
88
|
+
|
|
89
|
+
- `version_aware`: `bool` (default: `False`)
|
|
90
|
+
|
|
91
|
+
Whether to support bucket versioning. If enabled, this will require the
|
|
92
|
+
user to have the necessary IAM permissions for dealing with versioned
|
|
93
|
+
objects. Note that in the event that you only need to work with the
|
|
94
|
+
latest version of objects in a versioned bucket, and do not need the
|
|
95
|
+
VersionId for those objects, you should set `version_aware` to `False`
|
|
96
|
+
for performance reasons. When set to `True`, filesystem instances will
|
|
97
|
+
use the S3 `ListObjectVersions` API call to list directory contents,
|
|
98
|
+
which requires listing all historical object versions.
|
|
99
|
+
|
|
100
|
+
- `cache_regions`: `bool` (default: `False`)
|
|
101
|
+
|
|
102
|
+
Whether to cache bucket regions or not. Whenever a new bucket is used,
|
|
103
|
+
it will first find out which region it belongs to and then use the client
|
|
104
|
+
for that region.
|
|
105
|
+
|
|
106
|
+
- `asynchronous`: `bool` (default: `False`)
|
|
107
|
+
|
|
108
|
+
Whether this instance is to be used from inside coroutines.
|
|
109
|
+
|
|
110
|
+
- `config_kwargs`: `dict` (default: `{}`)
|
|
111
|
+
|
|
112
|
+
Dict of parameters passed to `botocore.client.Config`.
|
|
113
|
+
|
|
114
|
+
- `kwargs`: `dict` (default: `{}`)
|
|
115
|
+
|
|
116
|
+
Other parameters for core session.
|
|
117
|
+
|
|
118
|
+
- `session`: `aiobotocore.session.AioSession` (default: `None`)
|
|
119
|
+
|
|
120
|
+
Aiobotocore `AioSession` object to be used for all connections.
|
|
121
|
+
This session will be used in place of creating a new session inside S3FileSystem.
|
|
122
|
+
|
|
123
|
+
For example: `aiobotocore.session.AioSession(profile='test_user')`
|
|
124
|
+
|
|
125
|
+
- `max_concurrency`: `int` (default: `1`)
|
|
126
|
+
|
|
127
|
+
The maximum number of concurrent transfers to use per file for multipart
|
|
128
|
+
upload (`put()`) operations. Defaults to `1` (sequential). When used in
|
|
129
|
+
conjunction with `S3FileSystem.put(batch_size=...)` the maximum number of
|
|
130
|
+
simultaneous connections is `max_concurrency * batch_size`. We may extend
|
|
131
|
+
this parameter to affect `pipe()`, `cat()` and `get()`. Increasing this
|
|
132
|
+
value will result in higher memory usage during multipart upload operations (by
|
|
133
|
+
`max_concurrency * chunksize` bytes per file).
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
Example:
|
|
137
|
+
```python
|
|
138
|
+
chain = DataChain.from_storage(
|
|
139
|
+
"s3://my-bucket/my-dir",
|
|
140
|
+
client_config = {
|
|
141
|
+
"endpoint_url": "<minio-endpoint-url>",
|
|
142
|
+
"key": "<minio-access-key>",
|
|
143
|
+
"secret": "<minio-secret-key>"
|
|
144
|
+
}
|
|
145
|
+
)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Google Cloud Storage
|
|
149
|
+
|
|
150
|
+
DataChain uses [gcsfs](https://gcsfs.readthedocs.io/en/latest/) to interact with Google Cloud Storage. Authentication can be achieved by using any of the methods described in the [gcsfs documentation](https://gcsfs.readthedocs.io/en/latest/#credentials). You can also pass the following configuration parameters to the gcsfs filesystem as a `client_config` dictionary.
|
|
151
|
+
|
|
152
|
+
- `project`: `string` (default: `None`)
|
|
153
|
+
|
|
154
|
+
The project to work under. Note that this is not the same as, but often
|
|
155
|
+
very similar to, the project name. This is required in order to list all
|
|
156
|
+
the buckets you have access to within a project and to create/delete
|
|
157
|
+
buckets, or update their access policies. If `token='google_default'`,
|
|
158
|
+
the value is overridden by the default, if `token='anon'`, the value is
|
|
159
|
+
ignored.
|
|
160
|
+
|
|
161
|
+
- `access`: `string` (default: `None`)
|
|
162
|
+
|
|
163
|
+
One of `"read_only"`, `"read_write"`, `"full_control"`. Full control implies
|
|
164
|
+
read/write as well as modifying metadata, e.g., access control.
|
|
165
|
+
|
|
166
|
+
- `token`: `None`, `dict` or `string` (default: `None`)
|
|
167
|
+
|
|
168
|
+
The token to use for authentication. If `None`, the default is used. If
|
|
169
|
+
a string, it is interpreted as a path to a token file. If a dict, it is
|
|
170
|
+
interpreted as a token dictionary, such as that provided by Google Cloud
|
|
171
|
+
Platform. See also description of authentication methods, from link above.
|
|
172
|
+
|
|
173
|
+
- `consistency`: `string` (default: `None`)
|
|
174
|
+
|
|
175
|
+
One of `"none"`, `"size"`, `"md5"`. Check method when writing files.
|
|
176
|
+
Can be overridden in `open()`.
|
|
177
|
+
|
|
178
|
+
- `cache_timeout`: `float` (default: `None`)
|
|
179
|
+
|
|
180
|
+
Cache expiration time in seconds for object metadata cache. Set
|
|
181
|
+
`cache_timeout <= 0` for no caching, `None` for no cache expiration.
|
|
182
|
+
|
|
183
|
+
- `secure_serialize`: `bool` (default: `None`)
|
|
184
|
+
|
|
185
|
+
Whether to use secure serialization. This is a deprecated option and
|
|
186
|
+
will be removed in future versions.
|
|
187
|
+
|
|
188
|
+
- `requester_pays`: `bool` or `str` (default: `False`)
|
|
189
|
+
|
|
190
|
+
Whether to use requester-pays requests. This will include your
|
|
191
|
+
project ID `project` in requests as the `userProject`, and you'll be
|
|
192
|
+
billed for accessing data from requester-pays buckets. Optionally,
|
|
193
|
+
pass a project-id here as a string to use that as the `userProject`.
|
|
194
|
+
|
|
195
|
+
- `session_kwargs`: `dict` (default: `{}`)
|
|
196
|
+
|
|
197
|
+
Passed on to `aiohttp.ClientSession`. Can contain, for example, proxy
|
|
198
|
+
settings.
|
|
199
|
+
|
|
200
|
+
- `endpoint_url`: `string` (default: `None`)
|
|
201
|
+
|
|
202
|
+
If given, use this URL (format: `protocol://host:port`, *without* any
|
|
203
|
+
path part) for communication. If not given, defaults to the value
|
|
204
|
+
of environment variable `"STORAGE_EMULATOR_HOST"`; if that is not set
|
|
205
|
+
either, will use the standard Google endpoint.
|
|
206
|
+
|
|
207
|
+
- `default_location`: `str` (default: `None`)
|
|
208
|
+
|
|
209
|
+
Default location where buckets are created, like `"US"` or `"EUROPE-WEST3"`.
|
|
210
|
+
You can find a list of all available locations here:
|
|
211
|
+
https://cloud.google.com/storage/docs/locations#available-locations
|
|
212
|
+
|
|
213
|
+
- `version_aware`: `bool` (default: `False`)
|
|
214
|
+
|
|
215
|
+
Whether to support object versioning. If enabled this will require the
|
|
216
|
+
user to have the necessary permissions for dealing with versioned objects.
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
### Azure Blob Storage
|
|
220
|
+
|
|
221
|
+
DataChain uses [adlfs](https://fsspec.github.io/adlfs/) to interact with Azure Blob Storage. Authentication can be achieved by using any of the methods described in the [adlfs documentation](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials). You can also pass the following configuration parameters to the adlfs filesystem as a `client_config` dictionary.
|
|
222
|
+
|
|
223
|
+
- `account_name`: `str` (default: `None`)
|
|
224
|
+
|
|
225
|
+
The storage account name. This is used to authenticate requests
|
|
226
|
+
signed with an account key and to construct the storage endpoint. It
|
|
227
|
+
is required unless a connection string is given, or if a custom
|
|
228
|
+
domain is used with anonymous authentication.
|
|
229
|
+
|
|
230
|
+
- `account_key`: `str` (default: `None`)
|
|
231
|
+
|
|
232
|
+
The storage account key. This is used for shared key authentication.
|
|
233
|
+
If none of account key, sas token or client_id is specified, anonymous access
|
|
234
|
+
will be used.
|
|
235
|
+
|
|
236
|
+
- `sas_token`: `str` (default: `None`)
|
|
237
|
+
|
|
238
|
+
A shared access signature token to use to authenticate requests
|
|
239
|
+
instead of the account key. If account key and sas token are both
|
|
240
|
+
specified, account key will be used to sign. If none of account key, sas token
|
|
241
|
+
or client_id are specified, anonymous access will be used.
|
|
242
|
+
|
|
243
|
+
- `request_session`: `requests.Session` (default: `None`)
|
|
244
|
+
|
|
245
|
+
The session object to use for http requests.
|
|
246
|
+
|
|
247
|
+
- `connection_string`: `str` (default: `None`)
|
|
248
|
+
|
|
249
|
+
If specified, this will override all other parameters besides
|
|
250
|
+
request session. See
|
|
251
|
+
http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/
|
|
252
|
+
for the connection string format.
|
|
253
|
+
|
|
254
|
+
- `credential`: `azure.core.credentials_async.AsyncTokenCredential` or SAS token (default: `None`)
|
|
255
|
+
|
|
256
|
+
The credentials with which to authenticate. Optional if the account URL already has a SAS token.
|
|
257
|
+
Can include an instance of TokenCredential class from azure.identity.aio.
|
|
258
|
+
|
|
259
|
+
- `blocksize`: `int` (default: `None`)
|
|
260
|
+
|
|
261
|
+
The block size to use for download/upload operations. Defaults to hardcoded value of
|
|
262
|
+
`BlockBlobService.MAX_BLOCK_SIZE`
|
|
263
|
+
|
|
264
|
+
- `client_id`: `str` (default: `None`)
|
|
265
|
+
|
|
266
|
+
Client ID to use when authenticating using an AD Service Principal client/secret.
|
|
267
|
+
|
|
268
|
+
- `client_secret`: `str` (default: `None`)
|
|
269
|
+
|
|
270
|
+
Client secret to use when authenticating using an AD Service Principal client/secret.
|
|
271
|
+
|
|
272
|
+
- `tenant_id`: `str` (default: `None`)
|
|
273
|
+
|
|
274
|
+
Tenant ID to use when authenticating using an AD Service Principal client/secret.
|
|
275
|
+
|
|
276
|
+
- `anon`: `boolean` (default: `None`)
|
|
277
|
+
|
|
278
|
+
The value to use for whether to attempt anonymous access if no other credential is
|
|
279
|
+
passed. By default (`None`), the `AZURE_STORAGE_ANON` environment variable is
|
|
280
|
+
checked. False values (`false`, `0`, `f`) will resolve to `False` and
|
|
281
|
+
anonymous access will not be attempted. Otherwise the value for `anon` resolves
|
|
282
|
+
to `True`.
|
|
283
|
+
|
|
284
|
+
- `default_fill_cache`: `bool` (default: `True`)
|
|
285
|
+
|
|
286
|
+
Whether to use cache filling with open by default
|
|
287
|
+
|
|
288
|
+
- `default_cache_type`: `string` (default: `"bytes"`)
|
|
289
|
+
|
|
290
|
+
If given, the default cache_type value used for `open()`. Set to `None` if no caching
|
|
291
|
+
is desired. Docs in fsspec.
|
|
292
|
+
|
|
293
|
+
- `version_aware`: `bool` (default: `False`)
|
|
294
|
+
|
|
295
|
+
Whether to support blob versioning. If enabled, this will require the user to have the
|
|
296
|
+
necessary permissions for dealing with versioned blobs.
|
|
297
|
+
|
|
298
|
+
- `assume_container_exists`: `bool` (default: `None`)
|
|
299
|
+
|
|
300
|
+
Set this to `True` to not check for existence of containers at all, assuming they exist.
|
|
301
|
+
`None` (default) means to warn in case of a failure when checking for existence of a container.
|
|
302
|
+
`False` throws if retrieving container properties fails, which might happen if your
|
|
303
|
+
authentication is only valid at the storage container level, and not the
|
|
304
|
+
storage account level.
|
|
305
|
+
|
|
306
|
+
- `max_concurrency`: `int` (default: `None`)
|
|
307
|
+
|
|
308
|
+
The number of concurrent connections to use when uploading or downloading a blob.
|
|
309
|
+
If `None` it will be inferred from `fsspec.asyn._get_batch_size()`.
|
|
310
|
+
|
|
311
|
+
- `timeout`: `int` (default: `None`)
|
|
312
|
+
|
|
313
|
+
Sets the server-side timeout when uploading or downloading a blob.
|
|
314
|
+
|
|
315
|
+
- `connection_timeout`: `int` (default: `None`)
|
|
316
|
+
|
|
317
|
+
The number of seconds the client will wait to establish a connection to the server
|
|
318
|
+
when uploading or downloading a blob.
|
|
319
|
+
|
|
320
|
+
- `read_timeout`: `int` (default: `None`)
|
|
321
|
+
|
|
322
|
+
The number of seconds the client will wait, between consecutive read operations,
|
|
323
|
+
for a response from the server while uploading or downloading a blob.
|
|
324
|
+
|
|
325
|
+
- `account_host`: `str` (default: `None`)
|
|
326
|
+
|
|
327
|
+
The storage account host. This string is the entire URL for the storage account
|
|
328
|
+
after the `https://`, i.e. `"https://{account_host}"`. This parameter is only
|
|
329
|
+
required for Azure clouds where account urls do not end with `"blob.core.windows.net"`.
|
|
330
|
+
Note that the `account_name` parameter is still required.
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
### Hugging Face
|
|
334
|
+
|
|
335
|
+
DataChain uses [huggingface_hub](https://pypi.org/project/huggingface-hub/) to interact with Hugging Face. You can pass the following parameters to client config to interact with Hugging Face.
|
|
336
|
+
|
|
337
|
+
- `token`: `str` or `bool` (default: `None`)
|
|
338
|
+
|
|
339
|
+
A valid user access token (string). Defaults to the locally saved
|
|
340
|
+
token, which is the recommended method for authentication (see
|
|
341
|
+
https://huggingface.co/docs/huggingface_hub/quick-start#authentication).
|
|
342
|
+
To disable authentication, pass `False`.
|
|
343
|
+
|
|
344
|
+
- `endpoint`: `str` (default: `None`)
|
|
345
|
+
|
|
346
|
+
Endpoint of the Hub. Defaults to `https://huggingface.co`.
|
|
@@ -84,6 +84,7 @@ nav:
|
|
|
84
84
|
- Torch: references/torch.md
|
|
85
85
|
- Functions: references/func.md
|
|
86
86
|
- Toolkit: references/toolkit.md
|
|
87
|
+
- 📡 Interacting with remote storage: references/remotes.md
|
|
87
88
|
- 🤝 Contributing: contributing.md
|
|
88
89
|
|
|
89
90
|
- DataChain Website ↗: https://datachain.ai" target="_blank"
|
|
@@ -1,13 +1,18 @@
|
|
|
1
1
|
"""Automation using nox."""
|
|
2
|
+
# /// script
|
|
3
|
+
# dependencies = ["nox"]
|
|
4
|
+
# ///
|
|
2
5
|
|
|
3
6
|
import glob
|
|
4
|
-
import os
|
|
5
7
|
|
|
6
8
|
import nox
|
|
7
9
|
|
|
8
10
|
nox.options.default_venv_backend = "uv|virtualenv"
|
|
9
11
|
nox.options.reuse_existing_virtualenvs = True
|
|
10
12
|
nox.options.sessions = "lint", "tests"
|
|
13
|
+
|
|
14
|
+
project = nox.project.load_toml()
|
|
15
|
+
python_versions = nox.project.python_versions(project)
|
|
11
16
|
locations = "src", "tests"
|
|
12
17
|
|
|
13
18
|
|
|
@@ -29,12 +34,12 @@ def bench(session: nox.Session) -> None:
|
|
|
29
34
|
)
|
|
30
35
|
|
|
31
36
|
|
|
32
|
-
@nox.session(python=
|
|
37
|
+
@nox.session(python=python_versions)
|
|
33
38
|
def tests(session: nox.Session) -> None:
|
|
34
39
|
session.install(".[tests]")
|
|
35
40
|
env = {"COVERAGE_FILE": f".coverage.{session.python}"}
|
|
36
|
-
if session.python
|
|
37
|
-
# improve performance of tests in Python
|
|
41
|
+
if session.python in ("3.12", "3.13"):
|
|
42
|
+
# improve performance of tests in Python>=3.12 when used with coverage
|
|
38
43
|
# https://github.com/nedbat/coveragepy/issues/1665
|
|
39
44
|
# https://github.com/python/cpython/issues/107674
|
|
40
45
|
env["COVERAGE_CORE"] = "sysmon"
|
|
@@ -68,21 +73,7 @@ def build(session: nox.Session) -> None:
|
|
|
68
73
|
session.run("twine", "check", *dists, silent=True)
|
|
69
74
|
|
|
70
75
|
|
|
71
|
-
@nox.session
|
|
72
|
-
def dev(session: nox.Session) -> None:
|
|
73
|
-
"""Sets up a python development environment for the project."""
|
|
74
|
-
args = session.posargs or ("venv",)
|
|
75
|
-
venv_dir = os.fsdecode(os.path.abspath(args[0]))
|
|
76
|
-
|
|
77
|
-
session.log(f"Setting up virtual environment in {venv_dir}")
|
|
78
|
-
session.install("virtualenv")
|
|
79
|
-
session.run("virtualenv", venv_dir, silent=True)
|
|
80
|
-
|
|
81
|
-
python = os.path.join(venv_dir, "bin/python")
|
|
82
|
-
session.run(python, "-m", "pip", "install", "-e", ".[dev]", external=True)
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
@nox.session(python=["3.9", "3.10", "3.11", "3.12", "pypy3.9", "pypy3.10"])
|
|
76
|
+
@nox.session(python=python_versions)
|
|
86
77
|
def examples(session: nox.Session) -> None:
|
|
87
78
|
session.install(".[examples]")
|
|
88
79
|
session.run(
|
|
@@ -93,3 +84,7 @@ def examples(session: nox.Session) -> None:
|
|
|
93
84
|
"examples",
|
|
94
85
|
*session.posargs,
|
|
95
86
|
)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
if __name__ == "__main__":
|
|
90
|
+
nox.main()
|
|
@@ -14,6 +14,7 @@ classifiers = [
|
|
|
14
14
|
"Programming Language :: Python :: 3.10",
|
|
15
15
|
"Programming Language :: Python :: 3.11",
|
|
16
16
|
"Programming Language :: Python :: 3.12",
|
|
17
|
+
"Programming Language :: Python :: 3.13",
|
|
17
18
|
"Development Status :: 2 - Pre-Alpha"
|
|
18
19
|
]
|
|
19
20
|
requires-python = ">=3.9"
|
|
@@ -51,7 +52,8 @@ dependencies = [
|
|
|
51
52
|
"platformdirs",
|
|
52
53
|
"dvc-studio-client>=0.21,<1",
|
|
53
54
|
"tabulate",
|
|
54
|
-
"websockets"
|
|
55
|
+
"websockets",
|
|
56
|
+
"tomli;python_version<'3.11'"
|
|
55
57
|
]
|
|
56
58
|
|
|
57
59
|
[project.optional-dependencies]
|
|
@@ -81,11 +83,8 @@ hf = [
|
|
|
81
83
|
"datasets[audio,vision]>=2.21.0"
|
|
82
84
|
]
|
|
83
85
|
video = [
|
|
84
|
-
# Use 'av<14' because of incompatibility with imageio
|
|
85
|
-
# See https://github.com/PyAV-Org/PyAV/discussions/1700
|
|
86
|
-
"av<14",
|
|
87
86
|
"ffmpeg-python",
|
|
88
|
-
"imageio[ffmpeg]",
|
|
87
|
+
"imageio[ffmpeg,pyav]>=2.37.0",
|
|
89
88
|
"opencv-python"
|
|
90
89
|
]
|
|
91
90
|
tests = [
|
|
@@ -118,7 +117,7 @@ examples = [
|
|
|
118
117
|
"defusedxml",
|
|
119
118
|
"accelerate",
|
|
120
119
|
"huggingface_hub[hf_transfer]",
|
|
121
|
-
"ultralytics==8.3.
|
|
120
|
+
"ultralytics==8.3.82",
|
|
122
121
|
"open_clip_torch"
|
|
123
122
|
]
|
|
124
123
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from collections.abc import Sequence
|
|
2
2
|
from typing import TYPE_CHECKING, Optional
|
|
3
3
|
|
|
4
|
+
from datachain.lib.signal_schema import SignalSchema
|
|
5
|
+
|
|
4
6
|
if TYPE_CHECKING:
|
|
5
7
|
from datachain.catalog import Catalog
|
|
6
8
|
|
|
@@ -14,6 +16,7 @@ def show(
|
|
|
14
16
|
columns: Sequence[str] = (),
|
|
15
17
|
no_collapse: bool = False,
|
|
16
18
|
schema: bool = False,
|
|
19
|
+
include_hidden: bool = False,
|
|
17
20
|
) -> None:
|
|
18
21
|
from datachain import Session
|
|
19
22
|
from datachain.lib.dc import DataChain
|
|
@@ -23,6 +26,13 @@ def show(
|
|
|
23
26
|
dataset = catalog.get_dataset(name)
|
|
24
27
|
dataset_version = dataset.get_version(version or dataset.latest_version)
|
|
25
28
|
|
|
29
|
+
if include_hidden:
|
|
30
|
+
hidden_fields = []
|
|
31
|
+
else:
|
|
32
|
+
hidden_fields = SignalSchema.get_flatten_hidden_fields(
|
|
33
|
+
dataset_version.feature_schema
|
|
34
|
+
)
|
|
35
|
+
|
|
26
36
|
query = (
|
|
27
37
|
DatasetQuery(name=name, version=version, catalog=catalog)
|
|
28
38
|
.select(*columns)
|
|
@@ -30,7 +40,8 @@ def show(
|
|
|
30
40
|
.offset(offset)
|
|
31
41
|
)
|
|
32
42
|
records = query.to_db_records()
|
|
33
|
-
show_records(records, collapse_columns=not no_collapse)
|
|
43
|
+
show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
|
|
44
|
+
|
|
34
45
|
if schema and dataset_version.feature_schema:
|
|
35
46
|
print("\nSchema:")
|
|
36
47
|
session = Session.get(catalog=catalog)
|
|
@@ -26,6 +26,7 @@ class DataModel(BaseModel):
|
|
|
26
26
|
"""Pydantic model wrapper that registers model with `DataChain`."""
|
|
27
27
|
|
|
28
28
|
_version: ClassVar[int] = 1
|
|
29
|
+
_hidden_fields: ClassVar[list[str]] = []
|
|
29
30
|
|
|
30
31
|
@classmethod
|
|
31
32
|
def __pydantic_init_subclass__(cls):
|
|
@@ -41,6 +42,11 @@ class DataModel(BaseModel):
|
|
|
41
42
|
for val in models:
|
|
42
43
|
ModelStore.register(val)
|
|
43
44
|
|
|
45
|
+
@classmethod
|
|
46
|
+
def hidden_fields(cls) -> list[str]:
|
|
47
|
+
"""Returns a list of fields that should be hidden from the user."""
|
|
48
|
+
return cls._hidden_fields
|
|
49
|
+
|
|
44
50
|
|
|
45
51
|
def is_chain_type(t: type) -> bool:
|
|
46
52
|
"""Return true if type is supported by `DataChain`."""
|