datachain 0.8.13__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.8.13 → datachain-0.9.0}/.github/workflows/tests-studio.yml +3 -0
- {datachain-0.8.13 → datachain-0.9.0}/.github/workflows/tests.yml +3 -0
- {datachain-0.8.13 → datachain-0.9.0}/.pre-commit-config.yaml +2 -2
- {datachain-0.8.13 → datachain-0.9.0}/PKG-INFO +13 -5
- datachain-0.9.0/docs/references/data-types/arrowrow.md +3 -0
- datachain-0.9.0/docs/references/data-types/bbox.md +5 -0
- datachain-0.9.0/docs/references/data-types/file.md +35 -0
- datachain-0.9.0/docs/references/data-types/imagefile.md +15 -0
- datachain-0.8.13/docs/references/datatype.md → datachain-0.9.0/docs/references/data-types/index.md +1 -1
- datachain-0.9.0/docs/references/data-types/pose.md +5 -0
- datachain-0.9.0/docs/references/data-types/segment.md +3 -0
- datachain-0.9.0/docs/references/data-types/tarvfile.md +3 -0
- datachain-0.9.0/docs/references/data-types/textfile.md +13 -0
- datachain-0.9.0/docs/references/data-types/videofile.md +29 -0
- datachain-0.9.0/docs/references/index.md +23 -0
- datachain-0.9.0/docs/references/toolkit.md +5 -0
- datachain-0.9.0/examples/get_started/common_sql_functions.py +54 -0
- {datachain-0.8.13 → datachain-0.9.0}/mkdocs.yml +12 -2
- {datachain-0.8.13 → datachain-0.9.0}/noxfile.py +7 -1
- {datachain-0.8.13 → datachain-0.9.0}/pyproject.toml +16 -5
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/__init__.py +10 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/catalog/catalog.py +32 -9
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/__init__.py +2 -0
- datachain-0.9.0/src/datachain/cli/commands/datasets.py +175 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/parser/__init__.py +62 -12
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/parser/job.py +14 -4
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/parser/studio.py +8 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/parser/utils.py +20 -1
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/dataset.py +7 -4
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/diff/__init__.py +78 -128
- datachain-0.9.0/src/datachain/fs/reference.py +21 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/__init__.py +3 -1
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/conditional.py +66 -2
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/job.py +1 -1
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/arrow.py +1 -11
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/dc.py +2 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/file.py +292 -5
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/hf.py +1 -1
- datachain-0.9.0/src/datachain/lib/video.py +223 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/dataset.py +28 -3
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/remote/studio.py +13 -6
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/studio.py +34 -12
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/utils.py +12 -2
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/PKG-INFO +13 -5
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/SOURCES.txt +16 -3
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/requires.txt +13 -4
- {datachain-0.8.13 → datachain-0.9.0}/tests/conftest.py +11 -5
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_catalog.py +44 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_pull.py +42 -4
- {datachain-0.8.13 → datachain-0.9.0}/tests/test_cli_e2e.py +1 -1
- {datachain-0.8.13 → datachain-0.9.0}/tests/test_cli_studio.py +33 -12
- datachain-0.9.0/tests/unit/lib/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_diff.py +16 -5
- datachain-0.9.0/tests/unit/lib/test_video.py +229 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_conditional.py +32 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_cli_parsing.py +2 -1
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_func.py +26 -0
- datachain-0.8.13/docs/references/file.md +0 -22
- datachain-0.8.13/docs/references/index.md +0 -14
- datachain-0.8.13/examples/get_started/common_sql_functions.py +0 -113
- datachain-0.8.13/src/datachain/cli/commands/datasets.py +0 -109
- {datachain-0.8.13 → datachain-0.9.0}/.cruft.json +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/.gitattributes +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/.github/codecov.yaml +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/.github/dependabot.yml +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/.github/workflows/release.yml +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/.gitignore +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/LICENSE +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/README.rst +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/contributing.md +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/examples.md +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/index.md +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/overrides/main.html +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/quick-start.md +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/references/datachain.md +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/references/func.md +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/references/torch.md +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/references/udf.md +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/docs/tutorials.md +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/setup.cfg +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/__main__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/asyn.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cache.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/local.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/config.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/error.py +0 -0
- {datachain-0.8.13/src/datachain/lib → datachain-0.9.0/src/datachain/fs}/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/array.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/base.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/func.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/path.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/random.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/string.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/window.py +0 -0
- {datachain-0.8.13/src/datachain/lib/convert → datachain-0.9.0/src/datachain/lib}/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.8.13/src/datachain/remote → datachain-0.9.0/src/datachain/lib/convert}/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/listing.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/node.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/progress.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/py.typed +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/params.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/session.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.8.13/src/datachain/sql/functions → datachain-0.9.0/src/datachain/remote}/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.8.13/tests/benchmarks → datachain-0.9.0/src/datachain/sql/functions}/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/__init__.py +0 -0
- {datachain-0.8.13/tests/examples → datachain-0.9.0/tests/benchmarks}/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/data.py +0 -0
- {datachain-0.8.13/tests/func → datachain-0.9.0/tests/examples}/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.8.13/tests/unit → datachain-0.9.0/tests/func}/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_client.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_datachain.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_file.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_hf.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_listing.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_ls.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_query.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_session.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/test_atomicity.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/test_telemetry.py +0 -0
- {datachain-0.8.13/tests/unit/lib → datachain-0.9.0/tests/unit}/__init__.py +0 -0
- {datachain-0.8.13/tests/unit/sql → datachain-0.9.0/tests/unit/lib}/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.8.13/tests/unit/sql/sqlite → datachain-0.9.0/tests/unit/sql}/__init__.py +0 -0
- /datachain-0.8.13/src/datachain/lib/vfile.py → /datachain-0.9.0/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_client.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_config.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_query.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_session.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.8.13 → datachain-0.9.0}/tests/utils.py +0 -0
|
@@ -24,13 +24,13 @@ repos:
|
|
|
24
24
|
- id: trailing-whitespace
|
|
25
25
|
exclude: '^LICENSES/'
|
|
26
26
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
27
|
-
rev: 'v0.9.
|
|
27
|
+
rev: 'v0.9.6'
|
|
28
28
|
hooks:
|
|
29
29
|
- id: ruff
|
|
30
30
|
args: [--fix, --exit-non-zero-on-fix]
|
|
31
31
|
- id: ruff-format
|
|
32
32
|
- repo: https://github.com/codespell-project/codespell
|
|
33
|
-
rev: v2.4.
|
|
33
|
+
rev: v2.4.1
|
|
34
34
|
hooks:
|
|
35
35
|
- id: codespell
|
|
36
36
|
additional_dependencies: ["tomli"]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.8.13
|
|
3
|
+
Version: 0.9.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -21,10 +21,12 @@ Requires-Dist: tomlkit
|
|
|
21
21
|
Requires-Dist: tqdm
|
|
22
22
|
Requires-Dist: numpy<3,>=1
|
|
23
23
|
Requires-Dist: pandas>=2.0.0
|
|
24
|
+
Requires-Dist: packaging
|
|
24
25
|
Requires-Dist: pyarrow
|
|
25
26
|
Requires-Dist: typing-extensions
|
|
26
27
|
Requires-Dist: python-dateutil>=2
|
|
27
28
|
Requires-Dist: attrs>=21.3.0
|
|
29
|
+
Requires-Dist: fsspec>=2024.2.0
|
|
28
30
|
Requires-Dist: s3fs>=2024.2.0
|
|
29
31
|
Requires-Dist: gcsfs>=2024.2.0
|
|
30
32
|
Requires-Dist: adlfs>=2024.2.0
|
|
@@ -42,7 +44,7 @@ Requires-Dist: Pillow<12,>=10.0.0
|
|
|
42
44
|
Requires-Dist: msgpack<2,>=1.0.4
|
|
43
45
|
Requires-Dist: psutil
|
|
44
46
|
Requires-Dist: huggingface_hub
|
|
45
|
-
Requires-Dist: iterative-telemetry>=0.0.
|
|
47
|
+
Requires-Dist: iterative-telemetry>=0.0.10
|
|
46
48
|
Requires-Dist: platformdirs
|
|
47
49
|
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
48
50
|
Requires-Dist: tabulate
|
|
@@ -54,6 +56,7 @@ Requires-Dist: mkdocs-material==9.5.22; extra == "docs"
|
|
|
54
56
|
Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
|
|
55
57
|
Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
|
|
56
58
|
Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
|
|
59
|
+
Requires-Dist: eval-type-backport; extra == "docs"
|
|
57
60
|
Provides-Extra: torch
|
|
58
61
|
Requires-Dist: torch>=2.1.0; extra == "torch"
|
|
59
62
|
Requires-Dist: torchvision; extra == "torch"
|
|
@@ -66,8 +69,13 @@ Requires-Dist: usearch; extra == "vector"
|
|
|
66
69
|
Provides-Extra: hf
|
|
67
70
|
Requires-Dist: numba>=0.60.0; extra == "hf"
|
|
68
71
|
Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
|
|
72
|
+
Provides-Extra: video
|
|
73
|
+
Requires-Dist: av<14; extra == "video"
|
|
74
|
+
Requires-Dist: ffmpeg-python; extra == "video"
|
|
75
|
+
Requires-Dist: imageio[ffmpeg]; extra == "video"
|
|
76
|
+
Requires-Dist: opencv-python; extra == "video"
|
|
69
77
|
Provides-Extra: tests
|
|
70
|
-
Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
|
|
78
|
+
Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
|
|
71
79
|
Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
72
80
|
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
73
81
|
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
@@ -83,7 +91,7 @@ Requires-Dist: requests-mock; extra == "tests"
|
|
|
83
91
|
Requires-Dist: scipy; extra == "tests"
|
|
84
92
|
Provides-Extra: dev
|
|
85
93
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
86
|
-
Requires-Dist: mypy==1.
|
|
94
|
+
Requires-Dist: mypy==1.15.0; extra == "dev"
|
|
87
95
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
88
96
|
Requires-Dist: types-pytz; extra == "dev"
|
|
89
97
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -94,7 +102,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
|
|
|
94
102
|
Requires-Dist: defusedxml; extra == "examples"
|
|
95
103
|
Requires-Dist: accelerate; extra == "examples"
|
|
96
104
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
97
|
-
Requires-Dist: ultralytics==8.3.
|
|
105
|
+
Requires-Dist: ultralytics==8.3.74; extra == "examples"
|
|
98
106
|
Requires-Dist: open_clip_torch; extra == "examples"
|
|
99
107
|
|
|
100
108
|
================
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# File
|
|
2
|
+
|
|
3
|
+
`File` is a special [`DataModel`](index.md#datachain.lib.data_model.DataModel),
|
|
4
|
+
which is automatically generated when a `DataChain` is created from files,
|
|
5
|
+
such as in [`DataChain.from_storage`](../datachain.md#datachain.lib.dc.DataChain.from_storage):
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from datachain import DataChain
|
|
9
|
+
|
|
10
|
+
dc = DataChain.from_storage("gs://datachain-demo/dogs-and-cats")
|
|
11
|
+
dc.print_schema()
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Output:
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
file: File@v1
|
|
18
|
+
source: str
|
|
19
|
+
path: str
|
|
20
|
+
size: int
|
|
21
|
+
version: str
|
|
22
|
+
etag: str
|
|
23
|
+
is_latest: bool
|
|
24
|
+
last_modified: datetime
|
|
25
|
+
location: Union[dict, list[dict], NoneType]
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
`File` classes include various metadata fields describing the underlying file,
|
|
29
|
+
along with methods to read and manipulate file contents.
|
|
30
|
+
|
|
31
|
+
::: datachain.lib.file.File
|
|
32
|
+
|
|
33
|
+
::: datachain.lib.file.FileError
|
|
34
|
+
|
|
35
|
+
::: datachain.lib.file.TarVFile
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# ImageFile
|
|
2
|
+
|
|
3
|
+
`ImageFile` inherits from [`File`](file.md) and adds methods for working with image files.
|
|
4
|
+
|
|
5
|
+
`ImageFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.DataChain.from_storage), using `type="image"` param:
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from datachain import DataChain
|
|
9
|
+
|
|
10
|
+
dc = DataChain.from_storage("s3://bucket-name/", type="image")
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
::: datachain.lib.file.ImageFile
|
|
14
|
+
|
|
15
|
+
::: datachain.lib.file.Image
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# TextFile
|
|
2
|
+
|
|
3
|
+
`TextFile` inherits from [`File`](file.md) and adds methods for working with text files.
|
|
4
|
+
|
|
5
|
+
`TextFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.DataChain.from_storage), using `type="text"` param:
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from datachain import DataChain
|
|
9
|
+
|
|
10
|
+
dc = DataChain.from_storage("s3://bucket-name/", type="text")
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
::: datachain.lib.file.TextFile
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# VideoFile
|
|
2
|
+
|
|
3
|
+
`VideoFile` extends [`File`](file.md) and provides additional methods for working with video files.
|
|
4
|
+
|
|
5
|
+
`VideoFile` instances are created when a `DataChain` is initialized [from storage](../datachain.md#datachain.lib.dc.DataChain.from_storage) with the `type="video"` parameter:
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from datachain import DataChain
|
|
9
|
+
|
|
10
|
+
dc = DataChain.from_storage("s3://bucket-name/", type="video")
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
There are additional models for working with video files:
|
|
14
|
+
|
|
15
|
+
- `VideoFrame` - represents a single frame of a video file.
|
|
16
|
+
- `VideoFragment` - represents a fragment of a video file.
|
|
17
|
+
|
|
18
|
+
These are virtual models that do not create physical files.
|
|
19
|
+
Instead, they represent portions of the data in the `VideoFile` they refer to.
|
|
20
|
+
If you need to save the data, you can use the `save` method of these models,
|
|
21
|
+
allowing you to save data locally or upload it to a storage service.
|
|
22
|
+
|
|
23
|
+
::: datachain.lib.file.VideoFile
|
|
24
|
+
|
|
25
|
+
::: datachain.lib.file.VideoFrame
|
|
26
|
+
|
|
27
|
+
::: datachain.lib.file.VideoFragment
|
|
28
|
+
|
|
29
|
+
::: datachain.lib.file.Video
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: API Reference
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
# API Reference
|
|
6
|
+
|
|
7
|
+
DataChain's API is organized into several modules:
|
|
8
|
+
|
|
9
|
+
- [DataChain](./datachain.md) - Core chain operations and dataset management
|
|
10
|
+
- [Data Types](./data-types/index.md) - Supported data types and schema definitions
|
|
11
|
+
- [File](./data-types/file.md) - File handling and storage operations
|
|
12
|
+
- [TextFile](./data-types/textfile.md) - Text file
|
|
13
|
+
- [ImageFile](./data-types/imagefile.md) - Image file
|
|
14
|
+
- [VideoFile](./data-types/videofile.md) - Video file
|
|
15
|
+
- [TarVFile](./data-types/tarvfile.md) - Virtual file model for files extracted from tar archives
|
|
16
|
+
- [ArrowRow](./data-types/arrowrow.md) - Working with Arrow-supported files
|
|
17
|
+
- [BBox](./data-types/bbox.md) - Bounding box data type
|
|
18
|
+
- [Pose](./data-types/pose.md) - Pose data type
|
|
19
|
+
- [Segment](./data-types/segment.md) - Segment data type
|
|
20
|
+
- [UDF](./udf.md) - User-defined functions and transformations
|
|
21
|
+
- [Functions](./func.md) - Built-in functions for data manipulation and analysis
|
|
22
|
+
- [Torch](./torch.md) - PyTorch data loading utilities
|
|
23
|
+
- [Toolkit](./toolkit.md) - Functions for common DS/ML operations
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from datachain import C, DataChain
|
|
2
|
+
from datachain.func import array, greatest, least, path, string
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def num_chars_udf(file):
|
|
6
|
+
parts = file.name.split(".")
|
|
7
|
+
if len(parts) > 1:
|
|
8
|
+
return (list(parts[1]),)
|
|
9
|
+
return ([],)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
dc = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", anon=True)
|
|
13
|
+
dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
|
|
14
|
+
"file.path", "num_chars"
|
|
15
|
+
).show(5)
|
|
16
|
+
|
|
17
|
+
(
|
|
18
|
+
dc.mutate(
|
|
19
|
+
length=string.length(path.name(C("file.path"))),
|
|
20
|
+
parts=string.split(path.name(C("file.path")), "."),
|
|
21
|
+
)
|
|
22
|
+
.select("file.path", "length", "parts")
|
|
23
|
+
.show(5)
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
(
|
|
27
|
+
dc.mutate(
|
|
28
|
+
stem=path.file_stem(C("file.path")),
|
|
29
|
+
ext=path.file_ext(C("file.path")),
|
|
30
|
+
)
|
|
31
|
+
.select("file.path", "stem", "ext")
|
|
32
|
+
.show(5)
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
parts = string.split(path.name(C("file.path")), ".")
|
|
36
|
+
chain = dc.mutate(
|
|
37
|
+
isdog=array.contains(parts, "dog"),
|
|
38
|
+
iscat=array.contains(parts, "cat"),
|
|
39
|
+
)
|
|
40
|
+
chain.select("file.path", "isdog", "iscat").show(5)
|
|
41
|
+
|
|
42
|
+
chain = dc.mutate(
|
|
43
|
+
a=array.length(string.split("file.path", "/")),
|
|
44
|
+
b=array.length(string.split(path.name("file.path"), "0")),
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
(
|
|
48
|
+
chain.mutate(
|
|
49
|
+
greatest=greatest(chain.column("a"), C("b")),
|
|
50
|
+
least=least(chain.column("a"), C("b")),
|
|
51
|
+
)
|
|
52
|
+
.select("a", "b", "greatest", "least")
|
|
53
|
+
.show(10)
|
|
54
|
+
)
|
|
@@ -69,11 +69,21 @@ nav:
|
|
|
69
69
|
- 🐍 API Reference:
|
|
70
70
|
- Overview: references/index.md
|
|
71
71
|
- DataChain: references/datachain.md
|
|
72
|
-
-
|
|
73
|
-
|
|
72
|
+
- Data Types:
|
|
73
|
+
- Overview: references/data-types/index.md
|
|
74
|
+
- File: references/data-types/file.md
|
|
75
|
+
- TextFile: references/data-types/textfile.md
|
|
76
|
+
- ImageFile: references/data-types/imagefile.md
|
|
77
|
+
- VideoFile: references/data-types/videofile.md
|
|
78
|
+
- TarVFile: references/data-types/tarvfile.md
|
|
79
|
+
- ArrowRow: references/data-types/arrowrow.md
|
|
80
|
+
- BBox: references/data-types/bbox.md
|
|
81
|
+
- Pose: references/data-types/pose.md
|
|
82
|
+
- Segment: references/data-types/segment.md
|
|
74
83
|
- UDF: references/udf.md
|
|
75
84
|
- Torch: references/torch.md
|
|
76
85
|
- Functions: references/func.md
|
|
86
|
+
- Toolkit: references/toolkit.md
|
|
77
87
|
- 🤝 Contributing: contributing.md
|
|
78
88
|
|
|
79
89
|
- DataChain Website ↗: https://datachain.ai" target="_blank"
|
|
@@ -32,6 +32,12 @@ def bench(session: nox.Session) -> None:
|
|
|
32
32
|
@nox.session(python=["3.9", "3.10", "3.11", "3.12", "pypy3.9", "pypy3.10"])
|
|
33
33
|
def tests(session: nox.Session) -> None:
|
|
34
34
|
session.install(".[tests]")
|
|
35
|
+
env = {"COVERAGE_FILE": f".coverage.{session.python}"}
|
|
36
|
+
if session.python == "3.12":
|
|
37
|
+
# improve performance of tests in Python 3.12 when used with coverage
|
|
38
|
+
# https://github.com/nedbat/coveragepy/issues/1665
|
|
39
|
+
# https://github.com/python/cpython/issues/107674
|
|
40
|
+
env["COVERAGE_CORE"] = "sysmon"
|
|
35
41
|
session.run(
|
|
36
42
|
"pytest",
|
|
37
43
|
"--cov",
|
|
@@ -41,7 +47,7 @@ def tests(session: nox.Session) -> None:
|
|
|
41
47
|
"--numprocesses=logical",
|
|
42
48
|
"--dist=loadgroup",
|
|
43
49
|
*session.posargs,
|
|
44
|
-
env=
|
|
50
|
+
env=env,
|
|
45
51
|
)
|
|
46
52
|
|
|
47
53
|
|
|
@@ -24,10 +24,12 @@ dependencies = [
|
|
|
24
24
|
"tqdm",
|
|
25
25
|
"numpy>=1,<3",
|
|
26
26
|
"pandas>=2.0.0",
|
|
27
|
+
"packaging",
|
|
27
28
|
"pyarrow",
|
|
28
29
|
"typing-extensions",
|
|
29
30
|
"python-dateutil>=2",
|
|
30
31
|
"attrs>=21.3.0",
|
|
32
|
+
"fsspec>=2024.2.0",
|
|
31
33
|
"s3fs>=2024.2.0",
|
|
32
34
|
"gcsfs>=2024.2.0",
|
|
33
35
|
"adlfs>=2024.2.0",
|
|
@@ -45,7 +47,7 @@ dependencies = [
|
|
|
45
47
|
"msgpack>=1.0.4,<2",
|
|
46
48
|
"psutil",
|
|
47
49
|
"huggingface_hub",
|
|
48
|
-
"iterative-telemetry>=0.0.
|
|
50
|
+
"iterative-telemetry>=0.0.10",
|
|
49
51
|
"platformdirs",
|
|
50
52
|
"dvc-studio-client>=0.21,<1",
|
|
51
53
|
"tabulate",
|
|
@@ -59,7 +61,8 @@ docs = [
|
|
|
59
61
|
"mkdocs-material==9.5.22",
|
|
60
62
|
"mkdocs-section-index>=0.3.6",
|
|
61
63
|
"mkdocstrings-python>=1.6.3",
|
|
62
|
-
"mkdocs-literate-nav>=0.6.1"
|
|
64
|
+
"mkdocs-literate-nav>=0.6.1",
|
|
65
|
+
"eval-type-backport"
|
|
63
66
|
]
|
|
64
67
|
torch = [
|
|
65
68
|
"torch>=2.1.0",
|
|
@@ -77,8 +80,16 @@ hf = [
|
|
|
77
80
|
"numba>=0.60.0",
|
|
78
81
|
"datasets[audio,vision]>=2.21.0"
|
|
79
82
|
]
|
|
83
|
+
video = [
|
|
84
|
+
# Use 'av<14' because of incompatibility with imageio
|
|
85
|
+
# See https://github.com/PyAV-Org/PyAV/discussions/1700
|
|
86
|
+
"av<14",
|
|
87
|
+
"ffmpeg-python",
|
|
88
|
+
"imageio[ffmpeg]",
|
|
89
|
+
"opencv-python"
|
|
90
|
+
]
|
|
80
91
|
tests = [
|
|
81
|
-
"datachain[torch,remote,vector,hf]",
|
|
92
|
+
"datachain[torch,remote,vector,hf,video]",
|
|
82
93
|
"pytest>=8,<9",
|
|
83
94
|
"pytest-sugar>=0.9.6",
|
|
84
95
|
"pytest-cov>=4.1.0",
|
|
@@ -95,7 +106,7 @@ tests = [
|
|
|
95
106
|
]
|
|
96
107
|
dev = [
|
|
97
108
|
"datachain[docs,tests]",
|
|
98
|
-
"mypy==1.
|
|
109
|
+
"mypy==1.15.0",
|
|
99
110
|
"types-python-dateutil",
|
|
100
111
|
"types-pytz",
|
|
101
112
|
"types-PyYAML",
|
|
@@ -107,7 +118,7 @@ examples = [
|
|
|
107
118
|
"defusedxml",
|
|
108
119
|
"accelerate",
|
|
109
120
|
"huggingface_hub[hf_transfer]",
|
|
110
|
-
"ultralytics==8.3.
|
|
121
|
+
"ultralytics==8.3.74",
|
|
111
122
|
"open_clip_torch"
|
|
112
123
|
]
|
|
113
124
|
|
|
@@ -4,9 +4,14 @@ from datachain.lib.file import (
|
|
|
4
4
|
ArrowRow,
|
|
5
5
|
File,
|
|
6
6
|
FileError,
|
|
7
|
+
Image,
|
|
7
8
|
ImageFile,
|
|
8
9
|
TarVFile,
|
|
9
10
|
TextFile,
|
|
11
|
+
Video,
|
|
12
|
+
VideoFile,
|
|
13
|
+
VideoFragment,
|
|
14
|
+
VideoFrame,
|
|
10
15
|
)
|
|
11
16
|
from datachain.lib.model_store import ModelStore
|
|
12
17
|
from datachain.lib.udf import Aggregator, Generator, Mapper
|
|
@@ -27,6 +32,7 @@ __all__ = [
|
|
|
27
32
|
"File",
|
|
28
33
|
"FileError",
|
|
29
34
|
"Generator",
|
|
35
|
+
"Image",
|
|
30
36
|
"ImageFile",
|
|
31
37
|
"Mapper",
|
|
32
38
|
"ModelStore",
|
|
@@ -34,6 +40,10 @@ __all__ = [
|
|
|
34
40
|
"Sys",
|
|
35
41
|
"TarVFile",
|
|
36
42
|
"TextFile",
|
|
43
|
+
"Video",
|
|
44
|
+
"VideoFile",
|
|
45
|
+
"VideoFragment",
|
|
46
|
+
"VideoFrame",
|
|
37
47
|
"is_chain_type",
|
|
38
48
|
"metrics",
|
|
39
49
|
"param",
|
|
@@ -89,10 +89,6 @@ PULL_DATASET_SLEEP_INTERVAL = 0.1 # sleep time while waiting for chunk to be av
|
|
|
89
89
|
PULL_DATASET_CHECK_STATUS_INTERVAL = 20 # interval to check export status in Studio
|
|
90
90
|
|
|
91
91
|
|
|
92
|
-
def raise_remote_error(error_message: str) -> NoReturn:
|
|
93
|
-
raise DataChainError(f"Error from server: {error_message}")
|
|
94
|
-
|
|
95
|
-
|
|
96
92
|
def noop(_: str):
|
|
97
93
|
pass
|
|
98
94
|
|
|
@@ -211,14 +207,14 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
211
207
|
self.remote_ds_name, self.remote_ds_version
|
|
212
208
|
)
|
|
213
209
|
if not export_status_response.ok:
|
|
214
|
-
|
|
210
|
+
raise DataChainError(export_status_response.message)
|
|
215
211
|
|
|
216
212
|
export_status = export_status_response.data["status"] # type: ignore [index]
|
|
217
213
|
|
|
218
214
|
if export_status == "failed":
|
|
219
|
-
|
|
215
|
+
raise DataChainError("Dataset export failed in Studio")
|
|
220
216
|
if export_status == "removed":
|
|
221
|
-
|
|
217
|
+
raise DataChainError("Dataset export removed in Studio")
|
|
222
218
|
|
|
223
219
|
self.last_status_check = time.time()
|
|
224
220
|
|
|
@@ -1101,6 +1097,31 @@ class Catalog:
|
|
|
1101
1097
|
def get_dataset(self, name: str) -> DatasetRecord:
|
|
1102
1098
|
return self.metastore.get_dataset(name)
|
|
1103
1099
|
|
|
1100
|
+
def get_dataset_with_remote_fallback(
|
|
1101
|
+
self, name: str, version: Optional[int] = None
|
|
1102
|
+
) -> DatasetRecord:
|
|
1103
|
+
try:
|
|
1104
|
+
ds = self.get_dataset(name)
|
|
1105
|
+
if version and not ds.has_version(version):
|
|
1106
|
+
raise DatasetVersionNotFoundError(
|
|
1107
|
+
f"Dataset {name} does not have version {version}"
|
|
1108
|
+
)
|
|
1109
|
+
return ds
|
|
1110
|
+
|
|
1111
|
+
except (DatasetNotFoundError, DatasetVersionNotFoundError):
|
|
1112
|
+
print("Dataset not found in local catalog, trying to get from studio")
|
|
1113
|
+
|
|
1114
|
+
remote_ds_uri = f"{DATASET_PREFIX}{name}"
|
|
1115
|
+
if version:
|
|
1116
|
+
remote_ds_uri += f"@v{version}"
|
|
1117
|
+
|
|
1118
|
+
self.pull_dataset(
|
|
1119
|
+
remote_ds_uri=remote_ds_uri,
|
|
1120
|
+
local_ds_name=name,
|
|
1121
|
+
local_ds_version=version,
|
|
1122
|
+
)
|
|
1123
|
+
return self.get_dataset(name)
|
|
1124
|
+
|
|
1104
1125
|
def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
|
|
1105
1126
|
"""Returns dataset that contains version with specific uuid"""
|
|
1106
1127
|
for dataset in self.ls_datasets():
|
|
@@ -1113,7 +1134,7 @@ class Catalog:
|
|
|
1113
1134
|
|
|
1114
1135
|
info_response = studio_client.dataset_info(name)
|
|
1115
1136
|
if not info_response.ok:
|
|
1116
|
-
|
|
1137
|
+
raise DataChainError(info_response.message)
|
|
1117
1138
|
|
|
1118
1139
|
dataset_info = info_response.data
|
|
1119
1140
|
assert isinstance(dataset_info, dict)
|
|
@@ -1209,6 +1230,8 @@ class Catalog:
|
|
|
1209
1230
|
**kwargs,
|
|
1210
1231
|
) -> str:
|
|
1211
1232
|
client_config = client_config or self.client_config
|
|
1233
|
+
if client_config.get("anon"):
|
|
1234
|
+
content_disposition = None
|
|
1212
1235
|
client = Client.get_client(source, self.cache, **client_config)
|
|
1213
1236
|
return client.url(
|
|
1214
1237
|
path,
|
|
@@ -1407,7 +1430,7 @@ class Catalog:
|
|
|
1407
1430
|
remote_ds_name, remote_ds_version.version
|
|
1408
1431
|
)
|
|
1409
1432
|
if not export_response.ok:
|
|
1410
|
-
|
|
1433
|
+
raise DataChainError(export_response.message)
|
|
1411
1434
|
|
|
1412
1435
|
signed_urls = export_response.data
|
|
1413
1436
|
|