datachain 0.11.11__tar.gz → 0.13.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.11.11 → datachain-0.13.0}/.github/workflows/tests.yml +5 -12
- {datachain-0.11.11 → datachain-0.13.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.11.11 → datachain-0.13.0}/PKG-INFO +4 -2
- {datachain-0.11.11 → datachain-0.13.0}/docs/examples.md +4 -6
- {datachain-0.11.11 → datachain-0.13.0}/docs/quick-start.md +1 -1
- {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/openimage-detect.py +3 -7
- {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/ultralytics-bbox.py +1 -9
- {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/ultralytics-pose.py +1 -9
- {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/ultralytics-segment.py +1 -9
- {datachain-0.11.11 → datachain-0.13.0}/noxfile.py +14 -0
- {datachain-0.11.11 → datachain-0.13.0}/pyproject.toml +5 -3
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/catalog/catalog.py +39 -7
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/catalog/loader.py +19 -13
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/__init__.py +2 -1
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/ls.py +8 -6
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/show.py +7 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/parser/studio.py +13 -1
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/fsspec.py +12 -16
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/gcs.py +1 -1
- datachain-0.13.0/src/datachain/client/hf.py +60 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/local.py +1 -4
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/s3.py +1 -1
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/metastore.py +6 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/warehouse.py +3 -8
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/dataset.py +8 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/error.py +0 -12
- datachain-0.13.0/src/datachain/fs/utils.py +30 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/__init__.py +5 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/func.py +2 -1
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/dc.py +59 -15
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/file.py +63 -18
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/image.py +30 -6
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/listing.py +21 -39
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/meta_formats.py +2 -2
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/signal_schema.py +65 -18
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/udf.py +3 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/udf_signature.py +17 -9
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/video.py +7 -5
- datachain-0.13.0/src/datachain/model/bbox.py +253 -0
- datachain-0.13.0/src/datachain/model/pose.py +100 -0
- datachain-0.13.0/src/datachain/model/segment.py +51 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/model/ultralytics/bbox.py +9 -9
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/model/ultralytics/pose.py +7 -7
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/model/ultralytics/segment.py +7 -7
- datachain-0.13.0/src/datachain/model/utils.py +191 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/dataset.py +8 -2
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/sqlite/base.py +2 -2
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/studio.py +8 -6
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/utils.py +0 -16
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/PKG-INFO +4 -2
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/SOURCES.txt +18 -3
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/requires.txt +3 -1
- {datachain-0.11.11 → datachain-0.13.0}/tests/conftest.py +49 -3
- datachain-0.13.0/tests/func/data/lena.jpg +0 -0
- datachain-0.13.0/tests/func/model/data/running-mask0.png +0 -0
- datachain-0.13.0/tests/func/model/data/running-mask1.png +0 -0
- datachain-0.13.0/tests/func/model/data/running.jpg +0 -0
- datachain-0.13.0/tests/func/model/data/ships.jpg +0 -0
- datachain-0.13.0/tests/func/model/test_yolo.py +2427 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_client.py +0 -19
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_datachain.py +35 -4
- datachain-0.13.0/tests/func/test_image.py +68 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_ls.py +0 -9
- {datachain-0.11.11/tests/unit/lib → datachain-0.13.0/tests/func}/test_video.py +35 -21
- datachain-0.13.0/tests/test_import_time.py +84 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_datachain.py +69 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_datachain_bootstrap.py +2 -2
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_file.py +14 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_image.py +1 -4
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_signal_schema.py +209 -26
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_udf_signature.py +17 -7
- datachain-0.13.0/tests/unit/model/test_bbox.py +259 -0
- datachain-0.11.11/tests/unit/lib/test_models.py → datachain-0.13.0/tests/unit/model/test_pose.py +72 -51
- datachain-0.13.0/tests/unit/model/test_segment.py +53 -0
- datachain-0.13.0/tests/unit/model/test_utils.py +92 -0
- datachain-0.13.0/tests/unit/sql/__init__.py +0 -0
- datachain-0.13.0/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_array.py +7 -2
- {datachain-0.11.11 → datachain-0.13.0}/tests/utils.py +0 -8
- datachain-0.11.11/src/datachain/client/hf.py +0 -38
- datachain-0.11.11/src/datachain/model/bbox.py +0 -102
- datachain-0.11.11/src/datachain/model/pose.py +0 -88
- datachain-0.11.11/src/datachain/model/segment.py +0 -47
- {datachain-0.11.11 → datachain-0.13.0}/.cruft.json +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/.gitattributes +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/.github/codecov.yaml +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/.github/dependabot.yml +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/.github/workflows/release.yml +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/.gitignore +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/LICENSE +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/README.rst +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/contributing.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/index.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/overrides/main.html +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/datachain.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/func.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/index.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/remotes.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/toolkit.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/torch.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/references/udf.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/docs/tutorials.md +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/mkdocs.yml +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/setup.cfg +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/__main__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/asyn.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cache.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/config.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/array.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/base.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/path.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/random.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/string.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/window.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/job.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/listing.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/node.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/progress.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/py.typed +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/params.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/session.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/data.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/examples/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/__init__.py +0 -0
- {datachain-0.11.11/tests/unit/lib → datachain-0.13.0/tests/func}/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.11.11/tests/unit → datachain-0.13.0/tests/func/model}/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_file.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_hf.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_listing.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_pull.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_query.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_session.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/test_atomicity.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/test_cli_studio.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/test_telemetry.py +0 -0
- {datachain-0.11.11/tests/unit/lib → datachain-0.13.0/tests/unit}/__init__.py +0 -0
- {datachain-0.11.11/tests/unit/sql → datachain-0.13.0/tests/unit/lib}/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.11.11/tests/unit/sql/sqlite → datachain-0.13.0/tests/unit/model}/__init__.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_client.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_config.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_func.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_query.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_session.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_warehouse.py +0 -0
|
@@ -80,14 +80,6 @@ jobs:
|
|
|
80
80
|
|
|
81
81
|
- name: Set up FFmpeg
|
|
82
82
|
uses: AnimMouse/setup-ffmpeg@v1
|
|
83
|
-
id: ffmpeg-install
|
|
84
|
-
continue-on-error: ${{ runner.os == 'macOS' }}
|
|
85
|
-
|
|
86
|
-
# https://github.com/AnimMouse/setup-ffmpeg/issues/5
|
|
87
|
-
- if: steps.ffmpeg-install.outcome == 'failure' && runner.os == 'macOS'
|
|
88
|
-
run: brew install ffmpeg
|
|
89
|
-
env:
|
|
90
|
-
HOMEBREW_NO_AUTO_UPDATE: "1"
|
|
91
83
|
|
|
92
84
|
- name: Set up Python ${{ matrix.pyv }}
|
|
93
85
|
uses: actions/setup-python@v5
|
|
@@ -117,7 +109,7 @@ jobs:
|
|
|
117
109
|
shell: bash
|
|
118
110
|
|
|
119
111
|
- name: Run E2E tests
|
|
120
|
-
run: nox -s
|
|
112
|
+
run: nox -s e2e-${{ matrix.pyv }}
|
|
121
113
|
shell: bash
|
|
122
114
|
|
|
123
115
|
- name: Upload coverage report
|
|
@@ -141,11 +133,13 @@ jobs:
|
|
|
141
133
|
matrix:
|
|
142
134
|
os: [ubuntu-latest, windows-latest]
|
|
143
135
|
pyv: ['3.9', '3.13']
|
|
144
|
-
group: ['get_started', 'computer_vision', '
|
|
136
|
+
group: ['get_started', 'computer_vision', 'multimodal']
|
|
145
137
|
exclude:
|
|
146
138
|
- {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
|
|
147
139
|
- {os: ubuntu-latest, pyv: '3.13', group: 'multimodal'}
|
|
148
140
|
include:
|
|
141
|
+
# HF runs against actual API - thus run it only once
|
|
142
|
+
- {os: ubuntu-latest, pyv: "3.13", group: llm_and_nlp}
|
|
149
143
|
- {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
|
|
150
144
|
- {os: ubuntu-latest-4-cores, pyv: "3.13", group: multimodal}
|
|
151
145
|
|
|
@@ -169,9 +163,8 @@ jobs:
|
|
|
169
163
|
- name: Install nox
|
|
170
164
|
run: uv pip install nox --system
|
|
171
165
|
|
|
172
|
-
# HF runs against actual API - thus run it only once
|
|
173
166
|
- name: Set hf token
|
|
174
|
-
if: matrix.
|
|
167
|
+
if: matrix.group == 'llm_and_nlp'
|
|
175
168
|
run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
|
|
176
169
|
|
|
177
170
|
- name: Run examples
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.11.11
|
|
3
|
+
Version: 0.13.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -71,6 +71,7 @@ Requires-Dist: usearch; extra == "vector"
|
|
|
71
71
|
Provides-Extra: hf
|
|
72
72
|
Requires-Dist: numba>=0.60.0; extra == "hf"
|
|
73
73
|
Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
|
|
74
|
+
Requires-Dist: fsspec>=2024.12.0; extra == "hf"
|
|
74
75
|
Provides-Extra: video
|
|
75
76
|
Requires-Dist: ffmpeg-python; extra == "video"
|
|
76
77
|
Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
|
|
@@ -90,6 +91,7 @@ Requires-Dist: hypothesis; extra == "tests"
|
|
|
90
91
|
Requires-Dist: aiotools>=1.7.0; extra == "tests"
|
|
91
92
|
Requires-Dist: requests-mock; extra == "tests"
|
|
92
93
|
Requires-Dist: scipy; extra == "tests"
|
|
94
|
+
Requires-Dist: ultralytics; extra == "tests"
|
|
93
95
|
Provides-Extra: dev
|
|
94
96
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
95
97
|
Requires-Dist: mypy==1.15.0; extra == "dev"
|
|
@@ -103,7 +105,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
|
|
|
103
105
|
Requires-Dist: defusedxml; extra == "examples"
|
|
104
106
|
Requires-Dist: accelerate; extra == "examples"
|
|
105
107
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
106
|
-
Requires-Dist: ultralytics
|
|
108
|
+
Requires-Dist: ultralytics; extra == "examples"
|
|
107
109
|
Requires-Dist: open_clip_torch; extra == "examples"
|
|
108
110
|
|
|
109
111
|
================
|
|
@@ -13,7 +13,7 @@ title: Examples
|
|
|
13
13
|
For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The below code takes images from the cloud, and applies PaliGemma model to caption the first five of them and put the results in the column “scene”:
|
|
14
14
|
|
|
15
15
|
```python
|
|
16
|
-
from datachain
|
|
16
|
+
from datachain import Column, DataChain, File # (1)!
|
|
17
17
|
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)!
|
|
18
18
|
|
|
19
19
|
images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
|
|
@@ -80,12 +80,10 @@ In the below example, we are calling a Mixtral 8x22b model to judge the “servi
|
|
|
80
80
|
# $ export MISTRAL_API_KEY='your key'
|
|
81
81
|
|
|
82
82
|
import os
|
|
83
|
-
from datachain
|
|
84
|
-
from datachain.lib.dc import Column, DataChain
|
|
83
|
+
from datachain import Column, DataChain, DataModel, Feature
|
|
85
84
|
from mistralai.client import MistralClient
|
|
86
85
|
from mistralai.models.chat_completion import ChatMessage
|
|
87
86
|
from mistralai.models.chat_completion import ChatCompletionResponse as MistralModel
|
|
88
|
-
from datachain.lib.data_model import DataModel
|
|
89
87
|
|
|
90
88
|
prompt = "Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
|
|
91
89
|
api_key = os.environ["MISTRAL_API_KEY"]
|
|
@@ -189,7 +187,7 @@ DataChain library understands common annotation formats (JSON, CSV, webdataset a
|
|
|
189
187
|
Here is an example of reading a simple CSV file where schema is heuristically derived from the header:
|
|
190
188
|
|
|
191
189
|
```python
|
|
192
|
-
from datachain
|
|
190
|
+
from datachain import DataChain
|
|
193
191
|
|
|
194
192
|
uri="gs://datachain-demo/chatbot-csv/"
|
|
195
193
|
csv_dataset = DataChain.from_csv(uri)
|
|
@@ -234,7 +232,7 @@ However, Datachain can easily parse the entire COCO structure via several readin
|
|
|
234
232
|
|
|
235
233
|
```python
|
|
236
234
|
|
|
237
|
-
from datachain
|
|
235
|
+
from datachain import Column, DataChain
|
|
238
236
|
|
|
239
237
|
images_uri="gs://datachain-demo/coco2017/images/val/"
|
|
240
238
|
captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json"
|
|
@@ -138,7 +138,7 @@ chain = (
|
|
|
138
138
|
)
|
|
139
139
|
|
|
140
140
|
successful_chain = chain.filter(Column("is_success") == True)
|
|
141
|
-
successful_chain.
|
|
141
|
+
successful_chain.to_storage("./output_mistral")
|
|
142
142
|
|
|
143
143
|
print(f"{successful_chain.count()} files were exported")
|
|
144
144
|
```
|
|
@@ -22,13 +22,9 @@ def openimage_detect(args):
|
|
|
22
22
|
detections = json.load(stream_json).get("detections", [])
|
|
23
23
|
|
|
24
24
|
for i, detect in enumerate(detections):
|
|
25
|
-
bbox = model.BBox.
|
|
26
|
-
[
|
|
27
|
-
|
|
28
|
-
detect["XMax"] * img.width,
|
|
29
|
-
detect["YMin"] * img.height,
|
|
30
|
-
detect["YMax"] * img.height,
|
|
31
|
-
]
|
|
25
|
+
bbox = model.BBox.from_albumentations(
|
|
26
|
+
[detect[k] for k in ("XMin", "YMin", "XMax", "YMax")],
|
|
27
|
+
img_size=(img.width, img.height),
|
|
32
28
|
)
|
|
33
29
|
|
|
34
30
|
fstream = File(
|
|
@@ -1,11 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
os.environ["YOLO_VERBOSE"] = "false"
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from io import BytesIO
|
|
7
|
-
|
|
8
|
-
from PIL import Image
|
|
9
1
|
from ultralytics import YOLO
|
|
10
2
|
|
|
11
3
|
from datachain import C, DataChain, File
|
|
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloBBoxes
|
|
|
13
5
|
|
|
14
6
|
|
|
15
7
|
def process_bboxes(yolo: YOLO, file: File) -> YoloBBoxes:
|
|
16
|
-
results = yolo(
|
|
8
|
+
results = yolo(file.as_image_file().read(), verbose=False)
|
|
17
9
|
return YoloBBoxes.from_results(results)
|
|
18
10
|
|
|
19
11
|
|
|
@@ -1,11 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
os.environ["YOLO_VERBOSE"] = "false"
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from io import BytesIO
|
|
7
|
-
|
|
8
|
-
from PIL import Image
|
|
9
1
|
from ultralytics import YOLO
|
|
10
2
|
|
|
11
3
|
from datachain import C, DataChain, File
|
|
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloPoses
|
|
|
13
5
|
|
|
14
6
|
|
|
15
7
|
def process_poses(yolo: YOLO, file: File) -> YoloPoses:
|
|
16
|
-
results = yolo(
|
|
8
|
+
results = yolo(file.as_image_file().read(), verbose=False)
|
|
17
9
|
return YoloPoses.from_results(results)
|
|
18
10
|
|
|
19
11
|
|
|
@@ -1,11 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
os.environ["YOLO_VERBOSE"] = "false"
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from io import BytesIO
|
|
7
|
-
|
|
8
|
-
from PIL import Image
|
|
9
1
|
from ultralytics import YOLO
|
|
10
2
|
|
|
11
3
|
from datachain import C, DataChain, File
|
|
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloSegments
|
|
|
13
5
|
|
|
14
6
|
|
|
15
7
|
def process_segments(yolo: YOLO, file: File) -> YoloSegments:
|
|
16
|
-
results = yolo(
|
|
8
|
+
results = yolo(file.as_image_file().read(), verbose=False)
|
|
17
9
|
return YoloSegments.from_results(results)
|
|
18
10
|
|
|
19
11
|
|
|
@@ -56,6 +56,20 @@ def tests(session: nox.Session) -> None:
|
|
|
56
56
|
)
|
|
57
57
|
|
|
58
58
|
|
|
59
|
+
@nox.session(python=python_versions)
|
|
60
|
+
def e2e(session: nox.Session) -> None:
|
|
61
|
+
session.install(".[tests]")
|
|
62
|
+
session.run(
|
|
63
|
+
"pytest",
|
|
64
|
+
"--durations=0",
|
|
65
|
+
"--numprocesses=logical",
|
|
66
|
+
"--dist=loadgroup",
|
|
67
|
+
"-m",
|
|
68
|
+
"e2e",
|
|
69
|
+
*session.posargs,
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
|
|
59
73
|
@nox.session
|
|
60
74
|
def lint(session: nox.Session) -> None:
|
|
61
75
|
session.install("pre-commit")
|
|
@@ -80,7 +80,8 @@ vector = [
|
|
|
80
80
|
]
|
|
81
81
|
hf = [
|
|
82
82
|
"numba>=0.60.0",
|
|
83
|
-
"datasets[audio,vision]>=2.21.0"
|
|
83
|
+
"datasets[audio,vision]>=2.21.0",
|
|
84
|
+
"fsspec>=2024.12.0"
|
|
84
85
|
]
|
|
85
86
|
video = [
|
|
86
87
|
"ffmpeg-python",
|
|
@@ -101,7 +102,8 @@ tests = [
|
|
|
101
102
|
"hypothesis",
|
|
102
103
|
"aiotools>=1.7.0",
|
|
103
104
|
"requests-mock",
|
|
104
|
-
"scipy"
|
|
105
|
+
"scipy",
|
|
106
|
+
"ultralytics"
|
|
105
107
|
]
|
|
106
108
|
dev = [
|
|
107
109
|
"datachain[docs,tests]",
|
|
@@ -117,7 +119,7 @@ examples = [
|
|
|
117
119
|
"defusedxml",
|
|
118
120
|
"accelerate",
|
|
119
121
|
"huggingface_hub[hf_transfer]",
|
|
120
|
-
"ultralytics
|
|
122
|
+
"ultralytics",
|
|
121
123
|
"open_clip_torch"
|
|
122
124
|
]
|
|
123
125
|
|
|
@@ -25,7 +25,6 @@ from typing import (
|
|
|
25
25
|
)
|
|
26
26
|
from uuid import uuid4
|
|
27
27
|
|
|
28
|
-
import requests
|
|
29
28
|
import sqlalchemy as sa
|
|
30
29
|
from sqlalchemy import Column
|
|
31
30
|
from tqdm.auto import tqdm
|
|
@@ -54,7 +53,6 @@ from datachain.error import (
|
|
|
54
53
|
from datachain.lib.listing import get_listing
|
|
55
54
|
from datachain.node import DirType, Node, NodeWithPath
|
|
56
55
|
from datachain.nodes_thread_pool import NodesThreadPool
|
|
57
|
-
from datachain.remote.studio import StudioClient
|
|
58
56
|
from datachain.sql.types import DateTime, SQLType
|
|
59
57
|
from datachain.utils import DataChainDir
|
|
60
58
|
|
|
@@ -162,6 +160,8 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
162
160
|
max_threads: int = PULL_DATASET_MAX_THREADS,
|
|
163
161
|
progress_bar=None,
|
|
164
162
|
):
|
|
163
|
+
from datachain.remote.studio import StudioClient
|
|
164
|
+
|
|
165
165
|
super().__init__(max_threads)
|
|
166
166
|
self._check_dependencies()
|
|
167
167
|
self.metastore = metastore
|
|
@@ -234,6 +234,8 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
234
234
|
return df.drop("sys__id", axis=1)
|
|
235
235
|
|
|
236
236
|
def get_parquet_content(self, url: str):
|
|
237
|
+
import requests
|
|
238
|
+
|
|
237
239
|
while True:
|
|
238
240
|
if self.should_check_for_status():
|
|
239
241
|
self.check_for_status()
|
|
@@ -775,6 +777,8 @@ class Catalog:
|
|
|
775
777
|
validate_version: Optional[bool] = True,
|
|
776
778
|
listing: Optional[bool] = False,
|
|
777
779
|
uuid: Optional[str] = None,
|
|
780
|
+
description: Optional[str] = None,
|
|
781
|
+
labels: Optional[list[str]] = None,
|
|
778
782
|
) -> "DatasetRecord":
|
|
779
783
|
"""
|
|
780
784
|
Creates new dataset of a specific version.
|
|
@@ -801,6 +805,8 @@ class Catalog:
|
|
|
801
805
|
query_script=query_script,
|
|
802
806
|
schema=schema,
|
|
803
807
|
ignore_if_exists=True,
|
|
808
|
+
description=description,
|
|
809
|
+
labels=labels,
|
|
804
810
|
)
|
|
805
811
|
|
|
806
812
|
version = version or default_version
|
|
@@ -1130,6 +1136,8 @@ class Catalog:
|
|
|
1130
1136
|
raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")
|
|
1131
1137
|
|
|
1132
1138
|
def get_remote_dataset(self, name: str) -> DatasetRecord:
|
|
1139
|
+
from datachain.remote.studio import StudioClient
|
|
1140
|
+
|
|
1133
1141
|
studio_client = StudioClient()
|
|
1134
1142
|
|
|
1135
1143
|
info_response = studio_client.dataset_info(name)
|
|
@@ -1164,8 +1172,27 @@ class Catalog:
|
|
|
1164
1172
|
|
|
1165
1173
|
return direct_dependencies
|
|
1166
1174
|
|
|
1167
|
-
def ls_datasets(
|
|
1168
|
-
|
|
1175
|
+
def ls_datasets(
|
|
1176
|
+
self, include_listing: bool = False, studio: bool = False
|
|
1177
|
+
) -> Iterator[DatasetListRecord]:
|
|
1178
|
+
from datachain.remote.studio import StudioClient
|
|
1179
|
+
|
|
1180
|
+
if studio:
|
|
1181
|
+
client = StudioClient()
|
|
1182
|
+
response = client.ls_datasets()
|
|
1183
|
+
if not response.ok:
|
|
1184
|
+
raise DataChainError(response.message)
|
|
1185
|
+
if not response.data:
|
|
1186
|
+
return
|
|
1187
|
+
|
|
1188
|
+
datasets: Iterator[DatasetListRecord] = (
|
|
1189
|
+
DatasetListRecord.from_dict(d)
|
|
1190
|
+
for d in response.data
|
|
1191
|
+
if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
|
|
1192
|
+
)
|
|
1193
|
+
else:
|
|
1194
|
+
datasets = self.metastore.list_datasets()
|
|
1195
|
+
|
|
1169
1196
|
for d in datasets:
|
|
1170
1197
|
if not d.is_bucket_listing or include_listing:
|
|
1171
1198
|
yield d
|
|
@@ -1173,9 +1200,12 @@ class Catalog:
|
|
|
1173
1200
|
def list_datasets_versions(
|
|
1174
1201
|
self,
|
|
1175
1202
|
include_listing: bool = False,
|
|
1203
|
+
studio: bool = False,
|
|
1176
1204
|
) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
|
|
1177
1205
|
"""Iterate over all dataset versions with related jobs."""
|
|
1178
|
-
datasets = list(
|
|
1206
|
+
datasets = list(
|
|
1207
|
+
self.ls_datasets(include_listing=include_listing, studio=studio)
|
|
1208
|
+
)
|
|
1179
1209
|
|
|
1180
1210
|
# preselect dataset versions jobs from db to avoid multiple queries
|
|
1181
1211
|
jobs_ids: set[str] = {
|
|
@@ -1345,6 +1375,8 @@ class Catalog:
|
|
|
1345
1375
|
if cp and not output:
|
|
1346
1376
|
raise ValueError("Please provide output directory for instantiation")
|
|
1347
1377
|
|
|
1378
|
+
from datachain.remote.studio import StudioClient
|
|
1379
|
+
|
|
1348
1380
|
studio_client = StudioClient()
|
|
1349
1381
|
|
|
1350
1382
|
try:
|
|
@@ -1580,7 +1612,7 @@ class Catalog:
|
|
|
1580
1612
|
except TerminationSignal as exc:
|
|
1581
1613
|
signal.signal(signal.SIGTERM, orig_sigterm_handler)
|
|
1582
1614
|
signal.signal(signal.SIGINT, orig_sigint_handler)
|
|
1583
|
-
|
|
1615
|
+
logger.info("Shutting down process %s, received %r", proc.pid, exc)
|
|
1584
1616
|
# Rather than forwarding the signal to the child, we try to shut it down
|
|
1585
1617
|
# gracefully. This is because we consider the script to be interactive
|
|
1586
1618
|
# and special, so we give it time to cleanup before exiting.
|
|
@@ -1595,7 +1627,7 @@ class Catalog:
|
|
|
1595
1627
|
if thread:
|
|
1596
1628
|
thread.join() # wait for the reader thread
|
|
1597
1629
|
|
|
1598
|
-
|
|
1630
|
+
logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
|
|
1599
1631
|
if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
|
|
1600
1632
|
raise QueryScriptCancelError(
|
|
1601
1633
|
"Query script was canceled by user",
|
|
@@ -1,19 +1,13 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from importlib import import_module
|
|
3
|
-
from typing import Any, Optional
|
|
4
|
-
|
|
5
|
-
from datachain.catalog import Catalog
|
|
6
|
-
from datachain.data_storage import (
|
|
7
|
-
AbstractMetastore,
|
|
8
|
-
AbstractWarehouse,
|
|
9
|
-
)
|
|
10
|
-
from datachain.data_storage.serializer import deserialize
|
|
11
|
-
from datachain.data_storage.sqlite import (
|
|
12
|
-
SQLiteMetastore,
|
|
13
|
-
SQLiteWarehouse,
|
|
14
|
-
)
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
4
|
+
|
|
15
5
|
from datachain.utils import get_envs_by_prefix
|
|
16
6
|
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from datachain.catalog import Catalog
|
|
9
|
+
from datachain.data_storage import AbstractMetastore, AbstractWarehouse
|
|
10
|
+
|
|
17
11
|
METASTORE_SERIALIZED = "DATACHAIN__METASTORE"
|
|
18
12
|
METASTORE_IMPORT_PATH = "DATACHAIN_METASTORE"
|
|
19
13
|
METASTORE_ARG_PREFIX = "DATACHAIN_METASTORE_ARG_"
|
|
@@ -27,6 +21,9 @@ IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
|
|
|
27
21
|
|
|
28
22
|
|
|
29
23
|
def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
|
|
24
|
+
from datachain.data_storage import AbstractMetastore
|
|
25
|
+
from datachain.data_storage.serializer import deserialize
|
|
26
|
+
|
|
30
27
|
metastore_serialized = os.environ.get(METASTORE_SERIALIZED)
|
|
31
28
|
if metastore_serialized:
|
|
32
29
|
metastore_obj = deserialize(metastore_serialized)
|
|
@@ -45,6 +42,8 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
|
|
|
45
42
|
}
|
|
46
43
|
|
|
47
44
|
if not metastore_import_path:
|
|
45
|
+
from datachain.data_storage.sqlite import SQLiteMetastore
|
|
46
|
+
|
|
48
47
|
metastore_args["in_memory"] = in_memory
|
|
49
48
|
return SQLiteMetastore(**metastore_args)
|
|
50
49
|
if in_memory:
|
|
@@ -62,6 +61,9 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
|
|
|
62
61
|
|
|
63
62
|
|
|
64
63
|
def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
|
|
64
|
+
from datachain.data_storage import AbstractWarehouse
|
|
65
|
+
from datachain.data_storage.serializer import deserialize
|
|
66
|
+
|
|
65
67
|
warehouse_serialized = os.environ.get(WAREHOUSE_SERIALIZED)
|
|
66
68
|
if warehouse_serialized:
|
|
67
69
|
warehouse_obj = deserialize(warehouse_serialized)
|
|
@@ -80,6 +82,8 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
|
|
|
80
82
|
}
|
|
81
83
|
|
|
82
84
|
if not warehouse_import_path:
|
|
85
|
+
from datachain.data_storage.sqlite import SQLiteWarehouse
|
|
86
|
+
|
|
83
87
|
warehouse_args["in_memory"] = in_memory
|
|
84
88
|
return SQLiteWarehouse(**warehouse_args)
|
|
85
89
|
if in_memory:
|
|
@@ -121,7 +125,7 @@ def get_distributed_class(**kwargs):
|
|
|
121
125
|
|
|
122
126
|
def get_catalog(
|
|
123
127
|
client_config: Optional[dict[str, Any]] = None, in_memory: bool = False
|
|
124
|
-
) -> Catalog:
|
|
128
|
+
) -> "Catalog":
|
|
125
129
|
"""
|
|
126
130
|
Function that creates Catalog instance with appropriate metastore
|
|
127
131
|
and warehouse classes. Metastore class can be provided with env variable
|
|
@@ -133,6 +137,8 @@ def get_catalog(
|
|
|
133
137
|
and name of variable after, e.g. if it accepts team_id as kwargs
|
|
134
138
|
we can provide DATACHAIN_METASTORE_ARG_TEAM_ID=12345 env variable.
|
|
135
139
|
"""
|
|
140
|
+
from datachain.catalog import Catalog
|
|
141
|
+
|
|
136
142
|
return Catalog(
|
|
137
143
|
metastore=get_metastore(in_memory=in_memory),
|
|
138
144
|
warehouse=get_warehouse(in_memory=in_memory),
|
|
@@ -6,7 +6,6 @@ from multiprocessing import freeze_support
|
|
|
6
6
|
from typing import Optional
|
|
7
7
|
|
|
8
8
|
from datachain.cli.utils import get_logging_level
|
|
9
|
-
from datachain.telemetry import telemetry
|
|
10
9
|
|
|
11
10
|
from .commands import (
|
|
12
11
|
clear_cache,
|
|
@@ -70,6 +69,8 @@ def main(argv: Optional[list[str]] = None) -> int:
|
|
|
70
69
|
error, return_code = handle_general_exception(exc, args, logging_level)
|
|
71
70
|
return return_code
|
|
72
71
|
finally:
|
|
72
|
+
from datachain.telemetry import telemetry
|
|
73
|
+
|
|
73
74
|
telemetry.send_cli_call(args.command, error=error)
|
|
74
75
|
|
|
75
76
|
|
|
@@ -38,11 +38,12 @@ def ls_local(
|
|
|
38
38
|
):
|
|
39
39
|
from datachain import DataChain
|
|
40
40
|
|
|
41
|
-
if catalog is None:
|
|
42
|
-
from datachain.catalog import get_catalog
|
|
43
|
-
|
|
44
|
-
catalog = get_catalog(client_config=client_config)
|
|
45
41
|
if sources:
|
|
42
|
+
if catalog is None:
|
|
43
|
+
from datachain.catalog import get_catalog
|
|
44
|
+
|
|
45
|
+
catalog = get_catalog(client_config=client_config)
|
|
46
|
+
|
|
46
47
|
actual_sources = list(ls_urls(sources, catalog=catalog, long=long, **kwargs))
|
|
47
48
|
if len(actual_sources) == 1:
|
|
48
49
|
for _, entries in actual_sources:
|
|
@@ -61,8 +62,9 @@ def ls_local(
|
|
|
61
62
|
for entry in entries:
|
|
62
63
|
print(format_ls_entry(entry))
|
|
63
64
|
else:
|
|
64
|
-
|
|
65
|
-
|
|
65
|
+
# Collect results in a list here to prevent interference from `tqdm` and `print`
|
|
66
|
+
listing = list(DataChain.listings().collect("listing"))
|
|
67
|
+
for ls in listing:
|
|
66
68
|
print(format_ls_entry(f"{ls.uri}@v{ls.version}")) # type: ignore[union-attr]
|
|
67
69
|
|
|
68
70
|
|
|
@@ -40,6 +40,13 @@ def show(
|
|
|
40
40
|
.offset(offset)
|
|
41
41
|
)
|
|
42
42
|
records = query.to_db_records()
|
|
43
|
+
print("Name: ", name)
|
|
44
|
+
if dataset.description:
|
|
45
|
+
print("Description: ", dataset.description)
|
|
46
|
+
if dataset.labels:
|
|
47
|
+
print("Labels: ", ",".join(dataset.labels))
|
|
48
|
+
print("\n")
|
|
49
|
+
|
|
43
50
|
show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
|
|
44
51
|
|
|
45
52
|
if schema and dataset_version.feature_schema:
|
|
@@ -63,19 +63,31 @@ def add_auth_parser(subparsers, parent_parser) -> None:
|
|
|
63
63
|
default=False,
|
|
64
64
|
help="Use code-based authentication without browser",
|
|
65
65
|
)
|
|
66
|
+
login_parser.add_argument(
|
|
67
|
+
"--local",
|
|
68
|
+
action="store_true",
|
|
69
|
+
default=False,
|
|
70
|
+
help="Save the token in the local project config",
|
|
71
|
+
)
|
|
66
72
|
|
|
67
73
|
auth_logout_help = "Log out from Studio"
|
|
68
74
|
auth_logout_description = (
|
|
69
75
|
"Remove the Studio authentication token from global config."
|
|
70
76
|
)
|
|
71
77
|
|
|
72
|
-
auth_subparser.add_parser(
|
|
78
|
+
logout_parser = auth_subparser.add_parser(
|
|
73
79
|
"logout",
|
|
74
80
|
parents=[parent_parser],
|
|
75
81
|
description=auth_logout_description,
|
|
76
82
|
help=auth_logout_help,
|
|
77
83
|
formatter_class=CustomHelpFormatter,
|
|
78
84
|
)
|
|
85
|
+
logout_parser.add_argument(
|
|
86
|
+
"--local",
|
|
87
|
+
action="store_true",
|
|
88
|
+
default=False,
|
|
89
|
+
help="Remove the token from the local project config",
|
|
90
|
+
)
|
|
79
91
|
|
|
80
92
|
auth_team_help = "Set default team for Studio operations"
|
|
81
93
|
auth_team_description = "Set the default team for Studio operations."
|
|
@@ -17,10 +17,10 @@ from typing import (
|
|
|
17
17
|
ClassVar,
|
|
18
18
|
NamedTuple,
|
|
19
19
|
Optional,
|
|
20
|
+
Union,
|
|
20
21
|
)
|
|
21
22
|
from urllib.parse import urlparse
|
|
22
23
|
|
|
23
|
-
from botocore.exceptions import ClientError
|
|
24
24
|
from dvc_objects.fs.system import reflink
|
|
25
25
|
from fsspec.asyn import get_loop, sync
|
|
26
26
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
@@ -28,7 +28,6 @@ from tqdm.auto import tqdm
|
|
|
28
28
|
|
|
29
29
|
from datachain.cache import Cache
|
|
30
30
|
from datachain.client.fileslice import FileWrapper
|
|
31
|
-
from datachain.error import ClientError as DataChainClientError
|
|
32
31
|
from datachain.nodes_fetcher import NodesFetcher
|
|
33
32
|
from datachain.nodes_thread_pool import NodeChunk
|
|
34
33
|
|
|
@@ -83,19 +82,17 @@ class Client(ABC):
|
|
|
83
82
|
self.uri = self.get_uri(self.name)
|
|
84
83
|
|
|
85
84
|
@staticmethod
|
|
86
|
-
def get_implementation(url: str) -> type["Client"]:
|
|
85
|
+
def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:
|
|
87
86
|
from .azure import AzureClient
|
|
88
87
|
from .gcs import GCSClient
|
|
89
88
|
from .hf import HfClient
|
|
90
89
|
from .local import FileClient
|
|
91
90
|
from .s3 import ClientS3
|
|
92
91
|
|
|
93
|
-
protocol = urlparse(url).scheme
|
|
92
|
+
protocol = urlparse(str(url)).scheme
|
|
94
93
|
|
|
95
|
-
if not protocol or _is_win_local_path(url):
|
|
94
|
+
if not protocol or _is_win_local_path(str(url)):
|
|
96
95
|
return FileClient
|
|
97
|
-
|
|
98
|
-
protocol = protocol.lower()
|
|
99
96
|
if protocol == ClientS3.protocol:
|
|
100
97
|
return ClientS3
|
|
101
98
|
if protocol == GCSClient.protocol:
|
|
@@ -121,9 +118,11 @@ class Client(ABC):
|
|
|
121
118
|
return cls.get_uri(storage_name), rel_path
|
|
122
119
|
|
|
123
120
|
@staticmethod
|
|
124
|
-
def get_client(
|
|
121
|
+
def get_client(
|
|
122
|
+
source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
|
|
123
|
+
) -> "Client":
|
|
125
124
|
cls = Client.get_implementation(source)
|
|
126
|
-
storage_url, _ = cls.split_url(source)
|
|
125
|
+
storage_url, _ = cls.split_url(str(source))
|
|
127
126
|
if os.name == "nt":
|
|
128
127
|
storage_url = storage_url.removeprefix("/")
|
|
129
128
|
|
|
@@ -209,7 +208,7 @@ class Client(ABC):
|
|
|
209
208
|
|
|
210
209
|
async def get_current_etag(self, file: "File") -> str:
|
|
211
210
|
kwargs = {}
|
|
212
|
-
if self.fs
|
|
211
|
+
if getattr(self.fs, "version_aware", False):
|
|
213
212
|
kwargs["version_id"] = file.version
|
|
214
213
|
info = await self.fs._info(
|
|
215
214
|
self.get_full_path(file.path, file.version), **kwargs
|
|
@@ -286,11 +285,6 @@ class Client(ABC):
|
|
|
286
285
|
worker.cancel()
|
|
287
286
|
if excs:
|
|
288
287
|
raise excs[0]
|
|
289
|
-
except ClientError as exc:
|
|
290
|
-
raise DataChainClientError(
|
|
291
|
-
exc.response.get("Error", {}).get("Message") or exc,
|
|
292
|
-
exc.response.get("Error", {}).get("Code"),
|
|
293
|
-
) from exc
|
|
294
288
|
finally:
|
|
295
289
|
# This ensures the progress bar is closed before any exceptions are raised
|
|
296
290
|
progress_bar.close()
|
|
@@ -333,7 +327,9 @@ class Client(ABC):
|
|
|
333
327
|
return not (key.startswith("/") or key.endswith("/") or "//" in key)
|
|
334
328
|
|
|
335
329
|
async def ls_dir(self, path):
|
|
336
|
-
|
|
330
|
+
if getattr(self.fs, "version_aware", False):
|
|
331
|
+
kwargs = {"versions": True}
|
|
332
|
+
return await self.fs._ls(path, detail=True, **kwargs)
|
|
337
333
|
|
|
338
334
|
def rel_path(self, path: str) -> str:
|
|
339
335
|
return self.fs.split_path(path)[1]
|