datachain 0.11.0__tar.gz → 0.12.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.11.0 → datachain-0.12.0}/.github/workflows/benchmarks.yml +2 -2
- {datachain-0.11.0 → datachain-0.12.0}/.github/workflows/release.yml +2 -2
- {datachain-0.11.0 → datachain-0.12.0}/.github/workflows/tests.yml +11 -10
- {datachain-0.11.0 → datachain-0.12.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.11.0 → datachain-0.12.0}/PKG-INFO +6 -4
- {datachain-0.11.0 → datachain-0.12.0}/docs/examples.md +4 -6
- {datachain-0.11.0 → datachain-0.12.0}/docs/quick-start.md +1 -1
- datachain-0.12.0/docs/references/remotes.md +346 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/openimage-detect.py +3 -7
- {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/ultralytics-bbox.py +1 -9
- {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/ultralytics-pose.py +1 -9
- {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/ultralytics-segment.py +1 -9
- {datachain-0.11.0 → datachain-0.12.0}/mkdocs.yml +1 -0
- {datachain-0.11.0 → datachain-0.12.0}/noxfile.py +28 -19
- {datachain-0.11.0 → datachain-0.12.0}/pyproject.toml +7 -7
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/catalog/catalog.py +33 -5
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/catalog/loader.py +19 -13
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/__init__.py +3 -1
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/show.py +12 -1
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/parser/studio.py +13 -1
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/parser/utils.py +6 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/fsspec.py +12 -16
- datachain-0.12.0/src/datachain/client/hf.py +60 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/local.py +1 -4
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/warehouse.py +3 -8
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/dataset.py +8 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/error.py +0 -12
- datachain-0.12.0/src/datachain/fs/utils.py +30 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/__init__.py +5 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/func.py +2 -1
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/data_model.py +6 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/dc.py +114 -28
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/file.py +100 -25
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/image.py +30 -6
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/listing.py +21 -39
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/signal_schema.py +194 -15
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/video.py +7 -5
- datachain-0.12.0/src/datachain/model/bbox.py +253 -0
- datachain-0.12.0/src/datachain/model/pose.py +100 -0
- datachain-0.12.0/src/datachain/model/segment.py +51 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/model/ultralytics/bbox.py +9 -9
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/model/ultralytics/pose.py +7 -7
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/model/ultralytics/segment.py +7 -7
- datachain-0.12.0/src/datachain/model/utils.py +191 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/nodes_thread_pool.py +32 -11
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/dataset.py +4 -2
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/studio.py +8 -6
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/utils.py +3 -16
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/PKG-INFO +6 -4
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/SOURCES.txt +20 -3
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/requires.txt +4 -3
- {datachain-0.11.0 → datachain-0.12.0}/tests/conftest.py +49 -3
- datachain-0.12.0/tests/func/data/lena.jpg +0 -0
- datachain-0.12.0/tests/func/model/data/running-mask0.png +0 -0
- datachain-0.12.0/tests/func/model/data/running-mask1.png +0 -0
- datachain-0.12.0/tests/func/model/data/running.jpg +0 -0
- datachain-0.12.0/tests/func/model/data/ships.jpg +0 -0
- datachain-0.12.0/tests/func/model/test_yolo.py +2427 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_client.py +0 -19
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_datachain.py +309 -18
- datachain-0.12.0/tests/func/test_hidden_field.py +70 -0
- datachain-0.12.0/tests/func/test_image.py +68 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_ls.py +0 -9
- {datachain-0.11.0/tests/unit/lib → datachain-0.12.0/tests/func}/test_video.py +35 -21
- datachain-0.12.0/tests/test_import_time.py +84 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_datachain.py +100 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_file.py +14 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_image.py +1 -4
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_signal_schema.py +140 -0
- datachain-0.12.0/tests/unit/model/test_bbox.py +259 -0
- datachain-0.11.0/tests/unit/lib/test_models.py → datachain-0.12.0/tests/unit/model/test_pose.py +72 -51
- datachain-0.12.0/tests/unit/model/test_segment.py +53 -0
- datachain-0.12.0/tests/unit/model/test_utils.py +92 -0
- datachain-0.12.0/tests/unit/sql/__init__.py +0 -0
- datachain-0.12.0/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/utils.py +0 -8
- datachain-0.11.0/src/datachain/client/hf.py +0 -38
- datachain-0.11.0/src/datachain/model/bbox.py +0 -102
- datachain-0.11.0/src/datachain/model/pose.py +0 -88
- datachain-0.11.0/src/datachain/model/segment.py +0 -47
- {datachain-0.11.0 → datachain-0.12.0}/.cruft.json +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/.gitattributes +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/.github/codecov.yaml +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/.github/dependabot.yml +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/.gitignore +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/LICENSE +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/README.rst +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/contributing.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/index.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/overrides/main.html +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/datachain.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/func.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/index.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/toolkit.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/torch.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/references/udf.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/docs/tutorials.md +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/setup.cfg +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/__main__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/asyn.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cache.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/config.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/array.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/base.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/path.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/random.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/string.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/window.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/job.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/listing.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/node.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/progress.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/py.typed +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/params.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/session.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/data.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/examples/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/__init__.py +0 -0
- {datachain-0.11.0/tests/unit/lib → datachain-0.12.0/tests/func}/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.11.0/tests/unit → datachain-0.12.0/tests/func/model}/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_file.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_hf.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_listing.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_pull.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_query.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_session.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/test_atomicity.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/test_cli_studio.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/test_telemetry.py +0 -0
- {datachain-0.11.0/tests/unit/lib → datachain-0.12.0/tests/unit}/__init__.py +0 -0
- {datachain-0.11.0/tests/unit/sql → datachain-0.12.0/tests/unit/lib}/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.11.0/tests/unit/sql/sqlite → datachain-0.12.0/tests/unit/model}/__init__.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_client.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_config.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_func.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_query.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_session.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_warehouse.py +0 -0
|
@@ -19,10 +19,10 @@ jobs:
|
|
|
19
19
|
runs-on: ubuntu-latest
|
|
20
20
|
steps:
|
|
21
21
|
- uses: actions/checkout@v4
|
|
22
|
-
- name: Set up Python 3.
|
|
22
|
+
- name: Set up Python 3.13
|
|
23
23
|
uses: actions/setup-python@v5
|
|
24
24
|
with:
|
|
25
|
-
python-version: '3.
|
|
25
|
+
python-version: '3.13'
|
|
26
26
|
|
|
27
27
|
- name: Setup uv
|
|
28
28
|
uses: astral-sh/setup-uv@v5
|
|
@@ -60,16 +60,16 @@ jobs:
|
|
|
60
60
|
fail-fast: false
|
|
61
61
|
matrix:
|
|
62
62
|
os: [ubuntu-latest-8-cores]
|
|
63
|
-
pyv: ['3.9', '3.10', '3.11', '3.12']
|
|
63
|
+
pyv: ['3.9', '3.10', '3.11', '3.12', '3.13']
|
|
64
64
|
include:
|
|
65
65
|
- os: macos-latest
|
|
66
66
|
pyv: '3.9'
|
|
67
67
|
- os: macos-latest
|
|
68
|
-
pyv: '3.
|
|
68
|
+
pyv: '3.13'
|
|
69
69
|
- os: windows-latest
|
|
70
70
|
pyv: '3.9'
|
|
71
71
|
- os: windows-latest
|
|
72
|
-
pyv: '3.
|
|
72
|
+
pyv: '3.13'
|
|
73
73
|
|
|
74
74
|
steps:
|
|
75
75
|
- name: Check out the repository
|
|
@@ -109,7 +109,7 @@ jobs:
|
|
|
109
109
|
shell: bash
|
|
110
110
|
|
|
111
111
|
- name: Run E2E tests
|
|
112
|
-
run: nox -s
|
|
112
|
+
run: nox -s e2e-${{ matrix.pyv }}
|
|
113
113
|
shell: bash
|
|
114
114
|
|
|
115
115
|
- name: Upload coverage report
|
|
@@ -132,14 +132,16 @@ jobs:
|
|
|
132
132
|
fail-fast: false
|
|
133
133
|
matrix:
|
|
134
134
|
os: [ubuntu-latest, windows-latest]
|
|
135
|
-
pyv: ['3.9', '3.
|
|
136
|
-
group: ['get_started', 'computer_vision', '
|
|
135
|
+
pyv: ['3.9', '3.13']
|
|
136
|
+
group: ['get_started', 'computer_vision', 'multimodal']
|
|
137
137
|
exclude:
|
|
138
138
|
- {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
|
|
139
|
-
- {os: ubuntu-latest, pyv: '3.
|
|
139
|
+
- {os: ubuntu-latest, pyv: '3.13', group: 'multimodal'}
|
|
140
140
|
include:
|
|
141
|
+
# HF runs against actual API - thus run it only once
|
|
142
|
+
- {os: ubuntu-latest, pyv: "3.13", group: llm_and_nlp}
|
|
141
143
|
- {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
|
|
142
|
-
- {os: ubuntu-latest-4-cores, pyv: "3.
|
|
144
|
+
- {os: ubuntu-latest-4-cores, pyv: "3.13", group: multimodal}
|
|
143
145
|
|
|
144
146
|
steps:
|
|
145
147
|
- uses: actions/checkout@v4
|
|
@@ -161,9 +163,8 @@ jobs:
|
|
|
161
163
|
- name: Install nox
|
|
162
164
|
run: uv pip install nox --system
|
|
163
165
|
|
|
164
|
-
# HF runs against actual API - thus run it only once
|
|
165
166
|
- name: Set hf token
|
|
166
|
-
if: matrix.
|
|
167
|
+
if: matrix.group == 'llm_and_nlp'
|
|
167
168
|
run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
|
|
168
169
|
|
|
169
170
|
- name: Run examples
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.11.0
|
|
3
|
+
Version: 0.12.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.10
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.11
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
16
|
Classifier: Development Status :: 2 - Pre-Alpha
|
|
16
17
|
Requires-Python: >=3.9
|
|
17
18
|
Description-Content-Type: text/x-rst
|
|
@@ -70,10 +71,10 @@ Requires-Dist: usearch; extra == "vector"
|
|
|
70
71
|
Provides-Extra: hf
|
|
71
72
|
Requires-Dist: numba>=0.60.0; extra == "hf"
|
|
72
73
|
Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
|
|
74
|
+
Requires-Dist: fsspec>=2024.12.0; extra == "hf"
|
|
73
75
|
Provides-Extra: video
|
|
74
|
-
Requires-Dist: av<14; extra == "video"
|
|
75
76
|
Requires-Dist: ffmpeg-python; extra == "video"
|
|
76
|
-
Requires-Dist: imageio[ffmpeg]; extra == "video"
|
|
77
|
+
Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
|
|
77
78
|
Requires-Dist: opencv-python; extra == "video"
|
|
78
79
|
Provides-Extra: tests
|
|
79
80
|
Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
|
|
@@ -90,6 +91,7 @@ Requires-Dist: hypothesis; extra == "tests"
|
|
|
90
91
|
Requires-Dist: aiotools>=1.7.0; extra == "tests"
|
|
91
92
|
Requires-Dist: requests-mock; extra == "tests"
|
|
92
93
|
Requires-Dist: scipy; extra == "tests"
|
|
94
|
+
Requires-Dist: ultralytics; extra == "tests"
|
|
93
95
|
Provides-Extra: dev
|
|
94
96
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
95
97
|
Requires-Dist: mypy==1.15.0; extra == "dev"
|
|
@@ -103,7 +105,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
|
|
|
103
105
|
Requires-Dist: defusedxml; extra == "examples"
|
|
104
106
|
Requires-Dist: accelerate; extra == "examples"
|
|
105
107
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
106
|
-
Requires-Dist: ultralytics
|
|
108
|
+
Requires-Dist: ultralytics; extra == "examples"
|
|
107
109
|
Requires-Dist: open_clip_torch; extra == "examples"
|
|
108
110
|
|
|
109
111
|
================
|
|
@@ -13,7 +13,7 @@ title: Examples
|
|
|
13
13
|
For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The below code takes images from the cloud, and applies PaliGemma model to caption the first five of them and put the results in the column “scene”:
|
|
14
14
|
|
|
15
15
|
```python
|
|
16
|
-
from datachain
|
|
16
|
+
from datachain import Column, DataChain, File # (1)!
|
|
17
17
|
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)!
|
|
18
18
|
|
|
19
19
|
images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
|
|
@@ -80,12 +80,10 @@ In the below example, we are calling a Mixtral 8x22b model to judge the “servi
|
|
|
80
80
|
# $ export MISTRAL_API_KEY='your key'
|
|
81
81
|
|
|
82
82
|
import os
|
|
83
|
-
from datachain
|
|
84
|
-
from datachain.lib.dc import Column, DataChain
|
|
83
|
+
from datachain import Column, DataChain, DataModel, Feature
|
|
85
84
|
from mistralai.client import MistralClient
|
|
86
85
|
from mistralai.models.chat_completion import ChatMessage
|
|
87
86
|
from mistralai.models.chat_completion import ChatCompletionResponse as MistralModel
|
|
88
|
-
from datachain.lib.data_model import DataModel
|
|
89
87
|
|
|
90
88
|
prompt = "Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
|
|
91
89
|
api_key = os.environ["MISTRAL_API_KEY"]
|
|
@@ -189,7 +187,7 @@ DataChain library understands common annotation formats (JSON, CSV, webdataset a
|
|
|
189
187
|
Here is an example of reading a simple CSV file where schema is heuristically derived from the header:
|
|
190
188
|
|
|
191
189
|
```python
|
|
192
|
-
from datachain
|
|
190
|
+
from datachain import DataChain
|
|
193
191
|
|
|
194
192
|
uri="gs://datachain-demo/chatbot-csv/"
|
|
195
193
|
csv_dataset = DataChain.from_csv(uri)
|
|
@@ -234,7 +232,7 @@ However, Datachain can easily parse the entire COCO structure via several readin
|
|
|
234
232
|
|
|
235
233
|
```python
|
|
236
234
|
|
|
237
|
-
from datachain
|
|
235
|
+
from datachain import Column, DataChain
|
|
238
236
|
|
|
239
237
|
images_uri="gs://datachain-demo/coco2017/images/val/"
|
|
240
238
|
captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json"
|
|
@@ -138,7 +138,7 @@ chain = (
|
|
|
138
138
|
)
|
|
139
139
|
|
|
140
140
|
successful_chain = chain.filter(Column("is_success") == True)
|
|
141
|
-
successful_chain.
|
|
141
|
+
successful_chain.to_storage("./output_mistral")
|
|
142
142
|
|
|
143
143
|
print(f"{successful_chain.count()} files were exported")
|
|
144
144
|
```
|
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
# Interacting with remote storage
|
|
2
|
+
|
|
3
|
+
DataChain supports reading and writing data from different remote storages using methods like `DataChain.from_storage` and `DataChain.to_storage`. The supported storages include: local file system, AWS S3 storage, Google Cloud Storage, Azure Blob Storage, Hugging Face and more.
|
|
4
|
+
|
|
5
|
+
Example implementation for reading and writing data from/to different remote storages:
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from datachain import DataChain
|
|
9
|
+
|
|
10
|
+
dc = DataChain.from_storage("s3://bucket-name/path/to/data")
|
|
11
|
+
dc.to_storage("gs://bucket-name/path/to/data")
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
DataChain uses [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) to interact with different remote storages. You can pass the following fsspec-supported URIs to `from_storage` and `to_storage` methods.
|
|
15
|
+
|
|
16
|
+
- Local file system: `file://path/to/data`
|
|
17
|
+
- AWS S3 storage: `s3://bucket-name/path/to/data`
|
|
18
|
+
- Google Cloud Storage: `gs://bucket-name/path/to/data`
|
|
19
|
+
- Azure Blob Storage: `az://container-name/path/to/data`
|
|
20
|
+
- Hugging Face: `hf://dataset-name`
|
|
21
|
+
|
|
22
|
+
## Extra configuration
|
|
23
|
+
To configure a filesystem, pass its parameters as key/value pairs in a `client_config` dictionary; the dictionary is forwarded to the respective filesystem.
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
### AWS S3 compatible storage
|
|
27
|
+
|
|
28
|
+
DataChain uses [s3fs](https://s3fs.readthedocs.io/en/latest/) to interact with AWS S3 storage. Authentication can be configured using standard AWS credential locations, such as `~/.aws/credentials` and `~/.aws/config`. You can also pass the following configuration parameters to the s3fs filesystem as `client_config` dictionary.
|
|
29
|
+
|
|
30
|
+
- `anon`: `bool` (default: `False`)
|
|
31
|
+
|
|
32
|
+
Whether to use anonymous connection (public buckets only). If `False`,
|
|
33
|
+
uses the key/secret given, or boto's credential resolver (client_kwargs,
|
|
34
|
+
environment variables, config files, EC2 IAM server, in that order)
|
|
35
|
+
|
|
36
|
+
- `endpoint_url`: `string` (default: `None`)
|
|
37
|
+
|
|
38
|
+
Use this endpoint URL, if specified. Needed for connecting to non-AWS
|
|
39
|
+
S3 buckets. Takes precedence over `endpoint_url` in client_kwargs.
|
|
40
|
+
|
|
41
|
+
- `key`: `string` (default: `None`)
|
|
42
|
+
|
|
43
|
+
If not anonymous, use this access key ID, if specified. Takes precedence
|
|
44
|
+
over `aws_access_key_id` in client_kwargs.
|
|
45
|
+
|
|
46
|
+
- `secret`: `string` (default: `None`)
|
|
47
|
+
|
|
48
|
+
If not anonymous, use this secret access key, if specified. Takes
|
|
49
|
+
precedence over `aws_secret_access_key` in client_kwargs.
|
|
50
|
+
|
|
51
|
+
- `token`: `string` (default: `None`)
|
|
52
|
+
|
|
53
|
+
If not anonymous, use this security token, if specified
|
|
54
|
+
|
|
55
|
+
- `use_ssl`: `bool` (default: `True`)
|
|
56
|
+
|
|
57
|
+
Whether to use SSL in connections to S3; may be faster without, but
|
|
58
|
+
insecure. If `use_ssl` is also set in `client_kwargs`,
|
|
59
|
+
the value set in `client_kwargs` will take priority.
|
|
60
|
+
|
|
61
|
+
- `s3_additional_kwargs`: `dict` (default: `{}`)
|
|
62
|
+
|
|
63
|
+
Dict of parameters that are used when calling s3 api
|
|
64
|
+
methods. Typically used for things like "ServerSideEncryption".
|
|
65
|
+
|
|
66
|
+
- `client_kwargs`: `dict` (default: `{}`)
|
|
67
|
+
|
|
68
|
+
Dict of parameters for the botocore client.
|
|
69
|
+
|
|
70
|
+
- `requester_pays`: `bool` (default: `False`)
|
|
71
|
+
|
|
72
|
+
If RequesterPays buckets are supported.
|
|
73
|
+
|
|
74
|
+
- `default_block_size`: `int` (default: `None`)
|
|
75
|
+
|
|
76
|
+
If given, the default block size value used for `open()`, if no
|
|
77
|
+
specific value is given at call time. The built-in default is 5MB.
|
|
78
|
+
|
|
79
|
+
- `default_fill_cache`: `bool` (default: `True`)
|
|
80
|
+
|
|
81
|
+
Whether to use cache filling with open by default. Refer to `S3File.open`.
|
|
82
|
+
|
|
83
|
+
- `default_cache_type`: `string` (default: `"readahead"`)
|
|
84
|
+
|
|
85
|
+
If given, the default cache_type value used for `open()`. Set to `None`
|
|
86
|
+
if no caching is desired. See fsspec's documentation for other available
|
|
87
|
+
`cache_type` values. Default cache_type is `"readahead"`.
|
|
88
|
+
|
|
89
|
+
- `version_aware`: `bool` (default: `False`)
|
|
90
|
+
|
|
91
|
+
Whether to support bucket versioning. If enabled, this will require the
|
|
92
|
+
user to have the necessary IAM permissions for dealing with versioned
|
|
93
|
+
objects. Note that in the event that you only need to work with the
|
|
94
|
+
latest version of objects in a versioned bucket, and do not need the
|
|
95
|
+
VersionId for those objects, you should set `version_aware` to `False`
|
|
96
|
+
for performance reasons. When set to `True`, filesystem instances will
|
|
97
|
+
use the S3 `ListObjectVersions` API call to list directory contents,
|
|
98
|
+
which requires listing all historical object versions.
|
|
99
|
+
|
|
100
|
+
- `cache_regions`: `bool` (default: `False`)
|
|
101
|
+
|
|
102
|
+
Whether to cache bucket regions or not. Whenever a new bucket is used,
|
|
103
|
+
it will first find out which region it belongs to and then use the client
|
|
104
|
+
for that region.
|
|
105
|
+
|
|
106
|
+
- `asynchronous`: `bool` (default: `False`)
|
|
107
|
+
|
|
108
|
+
Whether this instance is to be used from inside coroutines.
|
|
109
|
+
|
|
110
|
+
- `config_kwargs`: `dict` (default: `{}`)
|
|
111
|
+
|
|
112
|
+
Dict of parameters passed to `botocore.client.Config`.
|
|
113
|
+
|
|
114
|
+
- `kwargs`: `dict` (default: `{}`)
|
|
115
|
+
|
|
116
|
+
Other parameters for core session.
|
|
117
|
+
|
|
118
|
+
- `session`: `aiobotocore.session.AioSession` (default: `None`)
|
|
119
|
+
|
|
120
|
+
Aiobotocore `AioSession` object to be used for all connections.
|
|
121
|
+
This session will be used in place of creating a new session inside S3FileSystem.
|
|
122
|
+
|
|
123
|
+
For example: `aiobotocore.session.AioSession(profile='test_user')`
|
|
124
|
+
|
|
125
|
+
- `max_concurrency`: `int` (default: `1`)
|
|
126
|
+
|
|
127
|
+
The maximum number of concurrent transfers to use per file for multipart
|
|
128
|
+
upload (`put()`) operations. Defaults to `1` (sequential). When used in
|
|
129
|
+
conjunction with `S3FileSystem.put(batch_size=...)` the maximum number of
|
|
130
|
+
simultaneous connections is `max_concurrency * batch_size`. We may extend
|
|
131
|
+
this parameter to affect `pipe()`, `cat()` and `get()`. Increasing this
|
|
132
|
+
value will result in higher memory usage during multipart upload operations (by
|
|
133
|
+
`max_concurrency * chunksize` bytes per file).
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
Example:
|
|
137
|
+
```python
|
|
138
|
+
chain = DataChain.from_storage(
|
|
139
|
+
"s3://my-bucket/my-dir",
|
|
140
|
+
client_config = {
|
|
141
|
+
"endpoint_url": "<minio-endpoint-url>",
|
|
142
|
+
"key": "<minio-access-key>",
|
|
143
|
+
"secret": "<minio-secret-key>"
|
|
144
|
+
}
|
|
145
|
+
)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Google Cloud Storage
|
|
149
|
+
|
|
150
|
+
DataChain uses [gcsfs](https://gcsfs.readthedocs.io/en/latest/) to interact with Google Cloud Storage. Authentication can be achieved by using any of the methods described in the [gcsfs documentation](https://gcsfs.readthedocs.io/en/latest/#credentials). You can also pass the following configuration parameters to the gcsfs filesystem as a `client_config` dictionary.
|
|
151
|
+
|
|
152
|
+
- `project`: `string` (default: `None`)
|
|
153
|
+
|
|
154
|
+
The project to work under. Note that this is not the same as, but often
|
|
155
|
+
very similar to, the project name. This is required in order to list all
|
|
156
|
+
the buckets you have access to within a project and to create/delete
|
|
157
|
+
buckets, or update their access policies. If `token='google_default'`,
|
|
158
|
+
the value is overridden by the default, if `token='anon'`, the value is
|
|
159
|
+
ignored.
|
|
160
|
+
|
|
161
|
+
- `access`: `string` (default: `None`)
|
|
162
|
+
|
|
163
|
+
One of `"read_only"`, `"read_write"`, `"full_control"`. Full control implies
|
|
164
|
+
read/write as well as modifying metadata, e.g., access control.
|
|
165
|
+
|
|
166
|
+
- `token`: `None`, `dict` or `string` (default: `None`)
|
|
167
|
+
|
|
168
|
+
The token to use for authentication. If `None`, the default is used. If
|
|
169
|
+
a string, it is interpreted as a path to a token file. If a dict, it is
|
|
170
|
+
interpreted as a token dictionary, such as that provided by Google Cloud
|
|
171
|
+
Platform. See also description of authentication methods, from link above.
|
|
172
|
+
|
|
173
|
+
- `consistency`: `string` (default: `None`)
|
|
174
|
+
|
|
175
|
+
One of `"none"`, `"size"`, `"md5"`. Check method when writing files.
|
|
176
|
+
Can be overridden in `open()`.
|
|
177
|
+
|
|
178
|
+
- `cache_timeout`: `float` (default: `None`)
|
|
179
|
+
|
|
180
|
+
Cache expiration time in seconds for object metadata cache. Set
|
|
181
|
+
`cache_timeout <= 0` for no caching, `None` for no cache expiration.
|
|
182
|
+
|
|
183
|
+
- `secure_serialize`: `bool` (default: `None`)
|
|
184
|
+
|
|
185
|
+
Whether to use secure serialization. This is a deprecated option and
|
|
186
|
+
will be removed in future versions.
|
|
187
|
+
|
|
188
|
+
- `requester_pays`: `bool` or `str` (default: `False`)
|
|
189
|
+
|
|
190
|
+
Whether to use requester-pays requests. This will include your
|
|
191
|
+
project ID `project` in requests as the `userProject`, and you'll be
|
|
192
|
+
billed for accessing data from requester-pays buckets. Optionally,
|
|
193
|
+
pass a project-id here as a string to use that as the `userProject`.
|
|
194
|
+
|
|
195
|
+
- `session_kwargs`: `dict` (default: `{}`)
|
|
196
|
+
|
|
197
|
+
Passed on to `aiohttp.ClientSession`. Can contain, for example, proxy
|
|
198
|
+
settings.
|
|
199
|
+
|
|
200
|
+
- `endpoint_url`: `string` (default: `None`)
|
|
201
|
+
|
|
202
|
+
If given, use this URL (format: `protocol://host:port`, *without* any
|
|
203
|
+
path part) for communication. If not given, defaults to the value
|
|
204
|
+
of environment variable `"STORAGE_EMULATOR_HOST"`; if that is not set
|
|
205
|
+
either, will use the standard Google endpoint.
|
|
206
|
+
|
|
207
|
+
- `default_location`: `str` (default: `None`)
|
|
208
|
+
|
|
209
|
+
Default location where buckets are created, like `"US"` or `"EUROPE-WEST3"`.
|
|
210
|
+
You can find a list of all available locations here:
|
|
211
|
+
https://cloud.google.com/storage/docs/locations#available-locations
|
|
212
|
+
|
|
213
|
+
- `version_aware`: `bool` (default: `False`)
|
|
214
|
+
|
|
215
|
+
Whether to support object versioning. If enabled this will require the
|
|
216
|
+
user to have the necessary permissions for dealing with versioned objects.
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
### Azure Blob Storage
|
|
220
|
+
|
|
221
|
+
DataChain uses [adlfs](https://fsspec.github.io/adlfs/) to interact with Azure Blob Storage. Authentication can be achieved by using any of the methods described in the [adlfs documentation](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials). You can also pass the following configuration parameters to the adlfs filesystem as a `client_config` dictionary.
|
|
222
|
+
|
|
223
|
+
- `account_name`: `str` (default: `None`)
|
|
224
|
+
|
|
225
|
+
The storage account name. This is used to authenticate requests
|
|
226
|
+
signed with an account key and to construct the storage endpoint. It
|
|
227
|
+
is required unless a connection string is given, or if a custom
|
|
228
|
+
domain is used with anonymous authentication.
|
|
229
|
+
|
|
230
|
+
- `account_key`: `str` (default: `None`)
|
|
231
|
+
|
|
232
|
+
The storage account key. This is used for shared key authentication.
|
|
233
|
+
If none of account key, sas token or client_id is specified, anonymous access
|
|
234
|
+
will be used.
|
|
235
|
+
|
|
236
|
+
- `sas_token`: `str` (default: `None`)
|
|
237
|
+
|
|
238
|
+
A shared access signature token to use to authenticate requests
|
|
239
|
+
instead of the account key. If account key and sas token are both
|
|
240
|
+
specified, account key will be used to sign. If none of account key, sas token
|
|
241
|
+
or client_id are specified, anonymous access will be used.
|
|
242
|
+
|
|
243
|
+
- `request_session`: `requests.Session` (default: `None`)
|
|
244
|
+
|
|
245
|
+
The session object to use for http requests.
|
|
246
|
+
|
|
247
|
+
- `connection_string`: `str` (default: `None`)
|
|
248
|
+
|
|
249
|
+
If specified, this will override all other parameters besides
|
|
250
|
+
request session. See
|
|
251
|
+
http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/
|
|
252
|
+
for the connection string format.
|
|
253
|
+
|
|
254
|
+
- `credential`: `azure.core.credentials_async.AsyncTokenCredential` or SAS token (default: `None`)
|
|
255
|
+
|
|
256
|
+
The credentials with which to authenticate. Optional if the account URL already has a SAS token.
|
|
257
|
+
Can include an instance of TokenCredential class from azure.identity.aio.
|
|
258
|
+
|
|
259
|
+
- `blocksize`: `int` (default: `None`)
|
|
260
|
+
|
|
261
|
+
The block size to use for download/upload operations. Defaults to hardcoded value of
|
|
262
|
+
`BlockBlobService.MAX_BLOCK_SIZE`
|
|
263
|
+
|
|
264
|
+
- `client_id`: `str` (default: `None`)
|
|
265
|
+
|
|
266
|
+
Client ID to use when authenticating using an AD Service Principal client/secret.
|
|
267
|
+
|
|
268
|
+
- `client_secret`: `str` (default: `None`)
|
|
269
|
+
|
|
270
|
+
Client secret to use when authenticating using an AD Service Principal client/secret.
|
|
271
|
+
|
|
272
|
+
- `tenant_id`: `str` (default: `None`)
|
|
273
|
+
|
|
274
|
+
Tenant ID to use when authenticating using an AD Service Principal client/secret.
|
|
275
|
+
|
|
276
|
+
- `anon`: `boolean` (default: `None`)
|
|
277
|
+
|
|
278
|
+
The value to use for whether to attempt anonymous access if no other credential is
|
|
279
|
+
passed. By default (`None`), the `AZURE_STORAGE_ANON` environment variable is
|
|
280
|
+
checked. False values (`false`, `0`, `f`) will resolve to `False` and
|
|
281
|
+
anonymous access will not be attempted. Otherwise the value for `anon` resolves
|
|
282
|
+
to `True`.
|
|
283
|
+
|
|
284
|
+
- `default_fill_cache`: `bool` (default: `True`)
|
|
285
|
+
|
|
286
|
+
Whether to use cache filling with open by default
|
|
287
|
+
|
|
288
|
+
- `default_cache_type`: `string` (default: `"bytes"`)
|
|
289
|
+
|
|
290
|
+
If given, the default cache_type value used for `open()`. Set to `None` if no caching
|
|
291
|
+
is desired. Docs in fsspec.
|
|
292
|
+
|
|
293
|
+
- `version_aware`: `bool` (default: `False`)
|
|
294
|
+
|
|
295
|
+
Whether to support blob versioning. If enabled, this will require the user to have the
|
|
296
|
+
necessary permissions for dealing with versioned blobs.
|
|
297
|
+
|
|
298
|
+
- `assume_container_exists`: `bool` (default: `None`)
|
|
299
|
+
|
|
300
|
+
Set this to `True` to not check for existence of containers at all, assuming they exist.
|
|
301
|
+
`None` (default) means to warn in case of a failure when checking for existence of a container.
|
|
302
|
+
`False` throws if retrieving container properties fails, which might happen if your
|
|
303
|
+
authentication is only valid at the storage container level, and not the
|
|
304
|
+
storage account level.
|
|
305
|
+
|
|
306
|
+
- `max_concurrency`: `int` (default: `None`)
|
|
307
|
+
|
|
308
|
+
The number of concurrent connections to use when uploading or downloading a blob.
|
|
309
|
+
If `None` it will be inferred from `fsspec.asyn._get_batch_size()`.
|
|
310
|
+
|
|
311
|
+
- `timeout`: `int` (default: `None`)
|
|
312
|
+
|
|
313
|
+
Sets the server-side timeout when uploading or downloading a blob.
|
|
314
|
+
|
|
315
|
+
- `connection_timeout`: `int` (default: `None`)
|
|
316
|
+
|
|
317
|
+
The number of seconds the client will wait to establish a connection to the server
|
|
318
|
+
when uploading or downloading a blob.
|
|
319
|
+
|
|
320
|
+
- `read_timeout`: `int` (default: `None`)
|
|
321
|
+
|
|
322
|
+
The number of seconds the client will wait, between consecutive read operations,
|
|
323
|
+
for a response from the server while uploading or downloading a blob.
|
|
324
|
+
|
|
325
|
+
- `account_host`: `str` (default: `None`)
|
|
326
|
+
|
|
327
|
+
The storage account host. This string is the entire URL for the storage account
|
|
328
|
+
after the `https://`, i.e. `"https://{account_host}"`. This parameter is only
|
|
329
|
+
required for Azure clouds where account urls do not end with `"blob.core.windows.net"`.
|
|
330
|
+
Note that the `account_name` parameter is still required.
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
### Hugging Face
|
|
334
|
+
|
|
335
|
+
DataChain uses [huggingface_hub](https://pypi.org/project/huggingface-hub/) to interact with Hugging Face. You can pass the following configuration parameters to the Hugging Face filesystem as a `client_config` dictionary.
|
|
336
|
+
|
|
337
|
+
- `token`: `str` or `bool` (default: `None`)
|
|
338
|
+
|
|
339
|
+
A valid user access token (string). Defaults to the locally saved
|
|
340
|
+
token, which is the recommended method for authentication (see
|
|
341
|
+
https://huggingface.co/docs/huggingface_hub/quick-start#authentication).
|
|
342
|
+
To disable authentication, pass `False`.
|
|
343
|
+
|
|
344
|
+
- `endpoint`: `str` (default: `None`)
|
|
345
|
+
|
|
346
|
+
Endpoint of the Hub. Defaults to `https://huggingface.co`.
|
|
@@ -22,13 +22,9 @@ def openimage_detect(args):
|
|
|
22
22
|
detections = json.load(stream_json).get("detections", [])
|
|
23
23
|
|
|
24
24
|
for i, detect in enumerate(detections):
|
|
25
|
-
bbox = model.BBox.
|
|
26
|
-
[
|
|
27
|
-
|
|
28
|
-
detect["XMax"] * img.width,
|
|
29
|
-
detect["YMin"] * img.height,
|
|
30
|
-
detect["YMax"] * img.height,
|
|
31
|
-
]
|
|
25
|
+
bbox = model.BBox.from_albumentations(
|
|
26
|
+
[detect[k] for k in ("XMin", "YMin", "XMax", "YMax")],
|
|
27
|
+
img_size=(img.width, img.height),
|
|
32
28
|
)
|
|
33
29
|
|
|
34
30
|
fstream = File(
|
|
@@ -1,11 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
os.environ["YOLO_VERBOSE"] = "false"
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from io import BytesIO
|
|
7
|
-
|
|
8
|
-
from PIL import Image
|
|
9
1
|
from ultralytics import YOLO
|
|
10
2
|
|
|
11
3
|
from datachain import C, DataChain, File
|
|
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloBBoxes
|
|
|
13
5
|
|
|
14
6
|
|
|
15
7
|
def process_bboxes(yolo: YOLO, file: File) -> YoloBBoxes:
|
|
16
|
-
results = yolo(
|
|
8
|
+
results = yolo(file.as_image_file().read(), verbose=False)
|
|
17
9
|
return YoloBBoxes.from_results(results)
|
|
18
10
|
|
|
19
11
|
|
|
@@ -1,11 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
os.environ["YOLO_VERBOSE"] = "false"
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from io import BytesIO
|
|
7
|
-
|
|
8
|
-
from PIL import Image
|
|
9
1
|
from ultralytics import YOLO
|
|
10
2
|
|
|
11
3
|
from datachain import C, DataChain, File
|
|
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloPoses
|
|
|
13
5
|
|
|
14
6
|
|
|
15
7
|
def process_poses(yolo: YOLO, file: File) -> YoloPoses:
|
|
16
|
-
results = yolo(
|
|
8
|
+
results = yolo(file.as_image_file().read(), verbose=False)
|
|
17
9
|
return YoloPoses.from_results(results)
|
|
18
10
|
|
|
19
11
|
|
|
@@ -1,11 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
os.environ["YOLO_VERBOSE"] = "false"
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from io import BytesIO
|
|
7
|
-
|
|
8
|
-
from PIL import Image
|
|
9
1
|
from ultralytics import YOLO
|
|
10
2
|
|
|
11
3
|
from datachain import C, DataChain, File
|
|
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloSegments
|
|
|
13
5
|
|
|
14
6
|
|
|
15
7
|
def process_segments(yolo: YOLO, file: File) -> YoloSegments:
|
|
16
|
-
results = yolo(
|
|
8
|
+
results = yolo(file.as_image_file().read(), verbose=False)
|
|
17
9
|
return YoloSegments.from_results(results)
|
|
18
10
|
|
|
19
11
|
|
|
@@ -84,6 +84,7 @@ nav:
|
|
|
84
84
|
- Torch: references/torch.md
|
|
85
85
|
- Functions: references/func.md
|
|
86
86
|
- Toolkit: references/toolkit.md
|
|
87
|
+
- 📡 Interacting with remote storage: references/remotes.md
|
|
87
88
|
- 🤝 Contributing: contributing.md
|
|
88
89
|
|
|
89
90
|
- DataChain Website ↗: https://datachain.ai" target="_blank"
|