datachain 0.26.3__tar.gz → 0.27.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.26.3 → datachain-0.27.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.26.3 → datachain-0.27.0}/PKG-INFO +7 -2
- {datachain-0.26.3 → datachain-0.27.0}/README.rst +4 -1
- {datachain-0.26.3 → datachain-0.27.0}/docs/commands/job/run.md +62 -13
- {datachain-0.26.3 → datachain-0.27.0}/docs/examples.md +21 -31
- {datachain-0.26.3 → datachain-0.27.0}/pyproject.toml +2 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/parser/job.py +14 -1
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/arrow.py +1 -1
- datachain-0.27.0/src/datachain/lib/audio.py +244 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/data_model.py +9 -1
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/datachain.py +8 -4
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/hf.py +20 -4
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/storage.py +3 -3
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/file.py +60 -8
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/hf.py +17 -7
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/video.py +4 -1
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/remote/studio.py +4 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/studio.py +36 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/PKG-INFO +7 -2
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/requires.txt +2 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_audio.py +3 -2
- datachain-0.27.0/tests/func/test_hf.py +142 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_image.py +28 -0
- datachain-0.27.0/tests/func/test_studio_datetime_parsing.py +107 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/test_cli_studio.py +47 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_audio.py +153 -34
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_datachain.py +0 -18
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_hf.py +3 -1
- datachain-0.26.3/src/datachain/lib/audio.py +0 -151
- datachain-0.26.3/tests/func/test_hf.py +0 -67
- {datachain-0.26.3 → datachain-0.27.0}/.cruft.json +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.gitattributes +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.github/codecov.yaml +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.github/dependabot.yml +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.github/workflows/release.yml +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/.gitignore +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/LICENSE +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/commands/auth/login.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/commands/auth/logout.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/commands/auth/team.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/commands/auth/token.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/commands/index.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/commands/job/cancel.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/commands/job/clusters.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/commands/job/logs.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/commands/job/ls.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/contributing.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/guide/db_migrations.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/guide/delta.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/guide/env.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/guide/index.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/guide/namespaces.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/guide/processing.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/guide/remotes.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/guide/retry.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/index.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/overrides/main.html +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/quick-start.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/datachain.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/func.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/index.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/toolkit.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/torch.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/references/udf.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/docs/tutorials.md +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/mkdocs.yml +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/noxfile.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/setup.cfg +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/__main__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/asyn.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cache.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/local.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/config.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/dataset.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/delta.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/error.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/array.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/base.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/func.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/path.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/random.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/string.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/window.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/job.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/projects.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/listing.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/namespace.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/node.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/progress.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/project.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/py.typed +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/dataset.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/params.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/session.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/semver.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain/utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/conftest.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/data.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/examples/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_array.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_path.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_random.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_string.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_batching.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_client.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_datachain.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_delta.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_file.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_listing.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_ls.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_metastore.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_pull.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_query.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_read_database.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_retry.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_session.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_video.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/test_atomicity.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/test_import_time.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/test_telemetry.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_client.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_config.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_func.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_query.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_semver.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_session.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.26.3 → datachain-0.27.0}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.26.3
|
|
3
|
+
Version: 0.27.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -26,6 +26,7 @@ Requires-Dist: packaging
|
|
|
26
26
|
Requires-Dist: pyarrow
|
|
27
27
|
Requires-Dist: typing-extensions
|
|
28
28
|
Requires-Dist: python-dateutil>=2
|
|
29
|
+
Requires-Dist: dateparser>=1.0.0
|
|
29
30
|
Requires-Dist: attrs>=21.3.0
|
|
30
31
|
Requires-Dist: fsspec>=2024.2.0
|
|
31
32
|
Requires-Dist: s3fs>=2024.2.0
|
|
@@ -100,6 +101,7 @@ Provides-Extra: dev
|
|
|
100
101
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
101
102
|
Requires-Dist: mypy==1.17.0; extra == "dev"
|
|
102
103
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
104
|
+
Requires-Dist: types-dateparser; extra == "dev"
|
|
103
105
|
Requires-Dist: types-pytz; extra == "dev"
|
|
104
106
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
105
107
|
Requires-Dist: types-requests; extra == "dev"
|
|
@@ -118,7 +120,7 @@ Dynamic: license-file
|
|
|
118
120
|
|logo| DataChain
|
|
119
121
|
================
|
|
120
122
|
|
|
121
|
-
|PyPI| |Python Version| |Codecov| |Tests|
|
|
123
|
+
|PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
|
|
122
124
|
|
|
123
125
|
.. |logo| image:: docs/assets/datachain.svg
|
|
124
126
|
:height: 24
|
|
@@ -134,6 +136,9 @@ Dynamic: license-file
|
|
|
134
136
|
.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
|
|
135
137
|
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
136
138
|
:alt: Tests
|
|
139
|
+
.. |DeepWiki| image:: https://deepwiki.com/badge.svg
|
|
140
|
+
:target: https://deepwiki.com/iterative/datachain
|
|
141
|
+
:alt: DeepWiki
|
|
137
142
|
|
|
138
143
|
DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
|
|
139
144
|
data like images, audio, videos, text and PDFs. It integrates with external storage
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|logo| DataChain
|
|
3
3
|
================
|
|
4
4
|
|
|
5
|
-
|PyPI| |Python Version| |Codecov| |Tests|
|
|
5
|
+
|PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
|
|
6
6
|
|
|
7
7
|
.. |logo| image:: docs/assets/datachain.svg
|
|
8
8
|
:height: 24
|
|
@@ -18,6 +18,9 @@
|
|
|
18
18
|
.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
|
|
19
19
|
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
20
20
|
:alt: Tests
|
|
21
|
+
.. |DeepWiki| image:: https://deepwiki.com/badge.svg
|
|
22
|
+
:target: https://deepwiki.com/iterative/datachain
|
|
23
|
+
:alt: DeepWiki
|
|
21
24
|
|
|
22
25
|
DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
|
|
23
26
|
data like images, audio, videos, text and PDFs. It integrates with external storage
|
|
@@ -5,15 +5,21 @@ Run a job in Studio.
|
|
|
5
5
|
## Synopsis
|
|
6
6
|
|
|
7
7
|
```usage
|
|
8
|
-
usage: datachain job run [-h] [-v] [-q] [--team TEAM] [--env-file ENV_FILE]
|
|
9
|
-
[--
|
|
8
|
+
usage: datachain job run [-h] [-v] [-q] [--team TEAM] [--env-file ENV_FILE]
|
|
9
|
+
[--env ENV [ENV ...]]
|
|
10
|
+
[--cluster CLUSTER] [--workers WORKERS]
|
|
11
|
+
[--files FILES [FILES ...]]
|
|
12
|
+
[--python-version PYTHON_VERSION]
|
|
13
|
+
[--repository REPOSITORY]
|
|
10
14
|
[--req-file REQ_FILE] [--req REQ [REQ ...]]
|
|
15
|
+
[--priority PRIORITY]
|
|
16
|
+
[--start-time START_TIME] [--cron CRON]
|
|
11
17
|
file
|
|
12
18
|
```
|
|
13
19
|
|
|
14
20
|
## Description
|
|
15
21
|
|
|
16
|
-
This command runs a job in Studio using the specified query file. You can configure various aspects of the job including environment variables, Python version, dependencies, and more.
|
|
22
|
+
This command runs a job in Studio using the specified query file. You can configure various aspects of the job including environment variables, Python version, dependencies, and more. When using --start-time or --cron, the job is scheduled to run but won't start immediately. (can be seen in the Tasks tab in UI)
|
|
17
23
|
|
|
18
24
|
## Arguments
|
|
19
25
|
|
|
@@ -28,10 +34,12 @@ This command runs a job in Studio using the specified query file. You can config
|
|
|
28
34
|
* `--workers WORKERS` - Number of workers for the job
|
|
29
35
|
* `--files FILES` - Additional files to include in the job
|
|
30
36
|
* `--python-version PYTHON_VERSION` - Python version for the job (e.g., 3.9, 3.10, 3.11)
|
|
37
|
+
* `--repository REPOSITORY` - Repository URL to clone before running the job
|
|
31
38
|
* `--req-file REQ_FILE` - Python requirements file
|
|
32
39
|
* `--req REQ` - Python package requirements
|
|
33
40
|
* `--priority PRIORITY` - Priority for the job in range 0-5. Lower value is higher priority (default: 5)
|
|
34
|
-
* `--
|
|
41
|
+
* `--start-time START_TIME` - Time to schedule the task in YYYY-MM-DDTHH:mm format or natural language.
|
|
42
|
+
* `--cron CRON` - Cron expression for the cron task.
|
|
35
43
|
* `-h`, `--help` - Show the help message and exit.
|
|
36
44
|
* `-v`, `--verbose` - Be verbose.
|
|
37
45
|
* `-q`, `--quiet` - Be quiet.
|
|
@@ -66,17 +74,11 @@ datachain job run --env API_KEY=123 --req pandas numpy query.py
|
|
|
66
74
|
6. Run a job with a repository (will be cloned in the job working directory):
|
|
67
75
|
```bash
|
|
68
76
|
datachain job run --repository https://github.com/iterative/datachain query.py
|
|
69
|
-
```
|
|
70
|
-
|
|
71
|
-
To specify a branch / revision:
|
|
72
77
|
|
|
73
|
-
|
|
78
|
+
# To specify a branch / revision:
|
|
74
79
|
datachain job run --repository https://github.com/iterative/datachain@main query.py
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
Git URLs are also supported:
|
|
78
80
|
|
|
79
|
-
|
|
81
|
+
# Git URLs are also supported:
|
|
80
82
|
datachain job run --repository git@github.com:iterative/datachain.git@main query.py
|
|
81
83
|
```
|
|
82
84
|
|
|
@@ -90,7 +92,43 @@ datachain job run --priority 2 query.py
|
|
|
90
92
|
# Get the cluster id using following command
|
|
91
93
|
datachain job clusters
|
|
92
94
|
# Use the id of an active clusters from above
|
|
93
|
-
datachain job run --cluster
|
|
95
|
+
datachain job run --cluster 1 query.py
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
9. Schedule a job to run once at a specific time
|
|
99
|
+
```bash
|
|
100
|
+
# Run job tomorrow at 3pm
|
|
101
|
+
datachain job run --start-time "tomorrow 3pm" query.py
|
|
102
|
+
|
|
103
|
+
# Run job in 2 hours
|
|
104
|
+
datachain job run --start-time "in 2 hours" query.py
|
|
105
|
+
|
|
106
|
+
# Run job on Monday at 9am
|
|
107
|
+
datachain job run --start-time "monday 9am" query.py
|
|
108
|
+
|
|
109
|
+
# Run job at a specific date and time
|
|
110
|
+
datachain job run --start-time "2024-01-15 14:30:00" query.py
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
10. Schedule a recurring job using cron expression
|
|
114
|
+
```bash
|
|
115
|
+
# Run job daily at midnight
|
|
116
|
+
datachain job run --cron "0 0 * * *" query.py
|
|
117
|
+
|
|
118
|
+
# Run job every Monday at 9am
|
|
119
|
+
datachain job run --cron "0 9 * * 1" query.py
|
|
120
|
+
|
|
121
|
+
# Run job every hour
|
|
122
|
+
datachain job run --cron "0 * * * *" query.py
|
|
123
|
+
|
|
124
|
+
# Run job every month
|
|
125
|
+
datachain job run --cron "@monthly" query.py
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
11. Schedule a recurring job with a start time
|
|
129
|
+
```bash
|
|
130
|
+
# Start the cron job after tomorrow 3pm
|
|
131
|
+
datachain job run --start-time "tomorrow 3pm" --cron "0 0 * * *" query.py
|
|
94
132
|
```
|
|
95
133
|
|
|
96
134
|
## Notes
|
|
@@ -99,3 +137,14 @@ datachain job run --cluster-id 1 query.py
|
|
|
99
137
|
* To cancel a running job, use the `datachain job cancel` command
|
|
100
138
|
* The job will continue running in Studio even after you stop viewing the logs
|
|
101
139
|
* You can get the list of compute clusters using `datachain job clusters` command.
|
|
140
|
+
* When using `--start-time` or `--cron` options, the job is scheduled as a task and will not show logs immediately. The job will be executed according to the schedule.
|
|
141
|
+
* The `--start-time` option supports natural language parsing using the [dateparser](https://dateparser.readthedocs.io/en/latest/) library, allowing flexible time expressions like "tomorrow 3pm", "in 2 hours", "monday 9am", etc.
|
|
142
|
+
* Cron expressions follow the standard format: minute hour day-of-month month day-of-week (e.g., "0 0 * * *" for daily at midnight) or Vixie cron-style “@” keyword expressions.
|
|
143
|
+
* Following options for Vixie cron-style expressions are supported:
|
|
144
|
+
* @midnight
|
|
145
|
+
* @hourly
|
|
146
|
+
* @daily
|
|
147
|
+
* @weekly
|
|
148
|
+
* @monthly
|
|
149
|
+
* @yearly
|
|
150
|
+
* @annually
|
|
@@ -10,55 +10,45 @@ title: Examples
|
|
|
10
10
|
|
|
11
11
|
Datachain is built by composing wrangling operations.
|
|
12
12
|
|
|
13
|
-
For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The below code takes images from the cloud, and applies
|
|
13
|
+
For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The below code takes images from the cloud, and applies BLIP Large model to caption the first five of them and put the results in the column "scene":
|
|
14
14
|
|
|
15
15
|
```python
|
|
16
16
|
import datachain as dc # (1)!
|
|
17
|
-
from transformers import
|
|
17
|
+
from transformers import Pipeline, pipeline
|
|
18
|
+
from datachain import File
|
|
18
19
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
processor = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
|
|
23
|
-
|
|
24
|
-
def process(file: File) -> str:
|
|
25
|
-
image=file.read().convert("RGB")
|
|
26
|
-
inputs = processor(text="caption", images=image, return_tensors="pt")
|
|
27
|
-
generate_ids = model.generate(**inputs, max_new_tokens=100)
|
|
28
|
-
return processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
|
20
|
+
def process(file: File, pipeline: Pipeline) -> str:
|
|
21
|
+
image = file.read().convert("RGB")
|
|
22
|
+
return pipeline(image)[0]["generated_text"]
|
|
29
23
|
|
|
30
24
|
chain = (
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
25
|
+
dc.read_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image", anon=True)
|
|
26
|
+
.limit(5)
|
|
27
|
+
.settings(cache=True)
|
|
28
|
+
.setup(pipeline=lambda: pipeline("image-to-text", model="Salesforce/blip-image-captioning-large"))
|
|
29
|
+
.map(scene=process)
|
|
30
|
+
.persist()
|
|
35
31
|
)
|
|
36
32
|
```
|
|
37
33
|
|
|
38
|
-
1. `pip install datachain`
|
|
39
|
-
2. `pip install transformers`
|
|
34
|
+
1. `pip install datachain[hf]`
|
|
40
35
|
|
|
41
36
|
Here is how we can view the results in a plot:
|
|
42
37
|
|
|
43
38
|
```python
|
|
44
39
|
import matplotlib.pyplot as plt
|
|
45
|
-
import re
|
|
46
40
|
from textwrap import wrap
|
|
47
41
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
return match.group(0) if match else ''
|
|
51
|
-
|
|
52
|
-
images = chain.collect("file")
|
|
53
|
-
captions = chain.collect("scene")
|
|
54
|
-
_ , axes = plt.subplots(1, len(captions), figsize=(15, 5))
|
|
42
|
+
count = chain.count()
|
|
43
|
+
_, axes = plt.subplots(1, count, figsize=(15, 5))
|
|
55
44
|
|
|
56
|
-
for ax,
|
|
57
|
-
ax.imshow(
|
|
58
|
-
ax.axis(
|
|
59
|
-
wrapped_caption = "\n".join(wrap(
|
|
60
|
-
ax.set_title(wrapped_caption, fontsize=
|
|
45
|
+
for ax, (img_file, caption) in zip(axes, chain.to_iter("file", "scene")):
|
|
46
|
+
ax.imshow(img_file.read(), cmap="gray")
|
|
47
|
+
ax.axis("off")
|
|
48
|
+
wrapped_caption = "\n".join(wrap(caption.strip(), 40))
|
|
49
|
+
ax.set_title(wrapped_caption, fontsize=10, pad=20)
|
|
61
50
|
|
|
51
|
+
plt.tight_layout()
|
|
62
52
|
plt.show()
|
|
63
53
|
```
|
|
64
54
|
|
|
@@ -30,6 +30,7 @@ dependencies = [
|
|
|
30
30
|
"pyarrow",
|
|
31
31
|
"typing-extensions",
|
|
32
32
|
"python-dateutil>=2",
|
|
33
|
+
"dateparser>=1.0.0",
|
|
33
34
|
"attrs>=21.3.0",
|
|
34
35
|
"fsspec>=2024.2.0",
|
|
35
36
|
"s3fs>=2024.2.0",
|
|
@@ -116,6 +117,7 @@ dev = [
|
|
|
116
117
|
"datachain[docs,tests]",
|
|
117
118
|
"mypy==1.17.0",
|
|
118
119
|
"types-python-dateutil",
|
|
120
|
+
"types-dateparser",
|
|
119
121
|
"types-pytz",
|
|
120
122
|
"types-PyYAML",
|
|
121
123
|
"types-requests",
|
|
@@ -17,7 +17,12 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
|
|
|
17
17
|
)
|
|
18
18
|
|
|
19
19
|
studio_run_help = "Run a job in Studio"
|
|
20
|
-
studio_run_description = "Run a job in Studio."
|
|
20
|
+
studio_run_description = "Run a job in Studio. \n"
|
|
21
|
+
studio_run_description += (
|
|
22
|
+
"When using --start-time or --cron,"
|
|
23
|
+
" the job is scheduled to run but won't start immediately"
|
|
24
|
+
" (can be seen in the Tasks tab in UI)"
|
|
25
|
+
)
|
|
21
26
|
|
|
22
27
|
studio_run_parser = jobs_subparser.add_parser(
|
|
23
28
|
"run",
|
|
@@ -96,6 +101,14 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
|
|
|
96
101
|
help="Priority for the job in range 0-5. "
|
|
97
102
|
"Lower value is higher priority (default: 5)",
|
|
98
103
|
)
|
|
104
|
+
studio_run_parser.add_argument(
|
|
105
|
+
"--start-time",
|
|
106
|
+
action="store",
|
|
107
|
+
help="Time to schedule a task in YYYY-MM-DDTHH:mm format or natural language.",
|
|
108
|
+
)
|
|
109
|
+
studio_run_parser.add_argument(
|
|
110
|
+
"--cron", action="store", help="Cron expression for the cron task."
|
|
111
|
+
)
|
|
99
112
|
|
|
100
113
|
studio_ls_help = "List jobs in Studio"
|
|
101
114
|
studio_ls_description = "List jobs in Studio."
|
|
@@ -245,7 +245,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
|
|
|
245
245
|
if field.nullable and not ModelStore.is_pydantic(dtype):
|
|
246
246
|
dtype = Optional[dtype] # type: ignore[assignment]
|
|
247
247
|
type_dict[field.name] = dtype
|
|
248
|
-
return dict_to_data_model(column, type_dict)
|
|
248
|
+
return dict_to_data_model(f"ArrowDataModel_{column}", type_dict)
|
|
249
249
|
if pa.types.is_map(col_type):
|
|
250
250
|
return dict
|
|
251
251
|
if isinstance(col_type, pa.lib.DictionaryType):
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
import posixpath
|
|
2
|
+
from typing import TYPE_CHECKING, Optional, Union
|
|
3
|
+
|
|
4
|
+
from datachain.lib.file import FileError
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from numpy import ndarray
|
|
8
|
+
|
|
9
|
+
from datachain.lib.file import Audio, AudioFile, File
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import torchaudio
|
|
13
|
+
except ImportError as exc:
|
|
14
|
+
raise ImportError(
|
|
15
|
+
"Missing dependencies for processing audio.\n"
|
|
16
|
+
"To install run:\n\n"
|
|
17
|
+
" pip install 'datachain[audio]'\n"
|
|
18
|
+
) from exc
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def audio_info(file: "Union[File, AudioFile]") -> "Audio":
|
|
22
|
+
"""Extract metadata like sample rate, channels, duration, and format."""
|
|
23
|
+
from datachain.lib.file import Audio
|
|
24
|
+
|
|
25
|
+
file = file.as_audio_file()
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
with file.open() as f:
|
|
29
|
+
info = torchaudio.info(f)
|
|
30
|
+
|
|
31
|
+
sample_rate = int(info.sample_rate)
|
|
32
|
+
channels = int(info.num_channels)
|
|
33
|
+
frames = int(info.num_frames)
|
|
34
|
+
duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
|
|
35
|
+
|
|
36
|
+
codec_name = getattr(info, "encoding", "")
|
|
37
|
+
file_ext = file.get_file_ext().lower()
|
|
38
|
+
format_name = _encoding_to_format(codec_name, file_ext)
|
|
39
|
+
|
|
40
|
+
bits_per_sample = getattr(info, "bits_per_sample", 0)
|
|
41
|
+
bit_rate = (
|
|
42
|
+
bits_per_sample * sample_rate * channels if bits_per_sample > 0 else -1
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
except Exception as exc:
|
|
46
|
+
raise FileError(
|
|
47
|
+
"unable to extract metadata from audio file", file.source, file.path
|
|
48
|
+
) from exc
|
|
49
|
+
|
|
50
|
+
return Audio(
|
|
51
|
+
sample_rate=sample_rate,
|
|
52
|
+
channels=channels,
|
|
53
|
+
duration=duration,
|
|
54
|
+
samples=frames,
|
|
55
|
+
format=format_name,
|
|
56
|
+
codec=codec_name,
|
|
57
|
+
bit_rate=bit_rate,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _encoding_to_format(encoding: str, file_ext: str) -> str:
|
|
62
|
+
"""
|
|
63
|
+
Map torchaudio encoding to a format name.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
encoding: The encoding string from torchaudio.info()
|
|
67
|
+
file_ext: The file extension as a fallback
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
Format name as a string
|
|
71
|
+
"""
|
|
72
|
+
# Direct mapping for formats that match exactly
|
|
73
|
+
encoding_map = {
|
|
74
|
+
"FLAC": "flac",
|
|
75
|
+
"MP3": "mp3",
|
|
76
|
+
"VORBIS": "ogg",
|
|
77
|
+
"AMR_WB": "amr",
|
|
78
|
+
"AMR_NB": "amr",
|
|
79
|
+
"OPUS": "opus",
|
|
80
|
+
"GSM": "gsm",
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
if encoding in encoding_map:
|
|
84
|
+
return encoding_map[encoding]
|
|
85
|
+
|
|
86
|
+
# For PCM variants, use file extension to determine format
|
|
87
|
+
if encoding.startswith("PCM_"):
|
|
88
|
+
# Common PCM formats by extension
|
|
89
|
+
pcm_formats = {
|
|
90
|
+
"wav": "wav",
|
|
91
|
+
"aiff": "aiff",
|
|
92
|
+
"au": "au",
|
|
93
|
+
"raw": "raw",
|
|
94
|
+
}
|
|
95
|
+
return pcm_formats.get(file_ext, "wav") # Default to wav for PCM
|
|
96
|
+
|
|
97
|
+
# Fallback to file extension if encoding is unknown
|
|
98
|
+
return file_ext if file_ext else "unknown"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def audio_to_np(
|
|
102
|
+
audio: "AudioFile", start: float = 0, duration: Optional[float] = None
|
|
103
|
+
) -> "tuple[ndarray, int]":
|
|
104
|
+
"""Load audio fragment as numpy array.
|
|
105
|
+
Multi-channel audio is transposed to (samples, channels)."""
|
|
106
|
+
if start < 0:
|
|
107
|
+
raise ValueError("start must be a non-negative float")
|
|
108
|
+
|
|
109
|
+
if duration is not None and duration <= 0:
|
|
110
|
+
raise ValueError("duration must be a positive float")
|
|
111
|
+
|
|
112
|
+
if hasattr(audio, "as_audio_file"):
|
|
113
|
+
audio = audio.as_audio_file()
|
|
114
|
+
|
|
115
|
+
try:
|
|
116
|
+
with audio.open() as f:
|
|
117
|
+
info = torchaudio.info(f)
|
|
118
|
+
sample_rate = info.sample_rate
|
|
119
|
+
|
|
120
|
+
frame_offset = int(start * sample_rate)
|
|
121
|
+
num_frames = int(duration * sample_rate) if duration is not None else -1
|
|
122
|
+
|
|
123
|
+
# Reset file pointer to the beginning
|
|
124
|
+
# This is important to ensure we read from the correct position later
|
|
125
|
+
f.seek(0)
|
|
126
|
+
|
|
127
|
+
waveform, sr = torchaudio.load(
|
|
128
|
+
f, frame_offset=frame_offset, num_frames=num_frames
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
audio_np = waveform.numpy()
|
|
132
|
+
|
|
133
|
+
if audio_np.shape[0] > 1:
|
|
134
|
+
audio_np = audio_np.T
|
|
135
|
+
else:
|
|
136
|
+
audio_np = audio_np.squeeze()
|
|
137
|
+
|
|
138
|
+
return audio_np, int(sr)
|
|
139
|
+
except Exception as exc:
|
|
140
|
+
raise FileError(
|
|
141
|
+
"unable to read audio fragment", audio.source, audio.path
|
|
142
|
+
) from exc
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def audio_to_bytes(
|
|
146
|
+
audio: "AudioFile",
|
|
147
|
+
format: str = "wav",
|
|
148
|
+
start: float = 0,
|
|
149
|
+
duration: Optional[float] = None,
|
|
150
|
+
) -> bytes:
|
|
151
|
+
"""Convert audio to bytes using soundfile.
|
|
152
|
+
|
|
153
|
+
If duration is None, converts from start to end of file.
|
|
154
|
+
If start is 0 and duration is None, converts entire file."""
|
|
155
|
+
y, sr = audio_to_np(audio, start, duration)
|
|
156
|
+
|
|
157
|
+
import io
|
|
158
|
+
|
|
159
|
+
import soundfile as sf
|
|
160
|
+
|
|
161
|
+
buffer = io.BytesIO()
|
|
162
|
+
sf.write(buffer, y, sr, format=format)
|
|
163
|
+
return buffer.getvalue()
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def save_audio(
|
|
167
|
+
audio: "AudioFile",
|
|
168
|
+
output: str,
|
|
169
|
+
format: Optional[str] = None,
|
|
170
|
+
start: float = 0,
|
|
171
|
+
end: Optional[float] = None,
|
|
172
|
+
) -> "AudioFile":
|
|
173
|
+
"""Save audio file or extract fragment to specified format.
|
|
174
|
+
|
|
175
|
+
Args:
|
|
176
|
+
audio: Source AudioFile object
|
|
177
|
+
output: Output directory path
|
|
178
|
+
format: Output format ('wav', 'mp3', etc). Defaults to source format
|
|
179
|
+
start: Start time in seconds (>= 0). Defaults to 0
|
|
180
|
+
end: End time in seconds. If None, extracts to end of file
|
|
181
|
+
|
|
182
|
+
Returns:
|
|
183
|
+
AudioFile: New audio file with format conversion/extraction applied
|
|
184
|
+
|
|
185
|
+
Examples:
|
|
186
|
+
save_audio(audio, "/path", "mp3") # Entire file to MP3
|
|
187
|
+
save_audio(audio, "s3://bucket/path", "wav", start=2.5) # From 2.5s to end
|
|
188
|
+
save_audio(audio, "/path", "flac", start=1, end=3) # Extract 1-3s fragment
|
|
189
|
+
"""
|
|
190
|
+
if format is None:
|
|
191
|
+
format = audio.get_file_ext()
|
|
192
|
+
|
|
193
|
+
# Validate start time
|
|
194
|
+
if start < 0:
|
|
195
|
+
raise ValueError(
|
|
196
|
+
f"Can't save audio for '{audio.path}', "
|
|
197
|
+
f"start time must be non-negative: {start:.3f}"
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
# Handle full file conversion when end is None and start is 0
|
|
201
|
+
if end is None and start == 0:
|
|
202
|
+
output_file = posixpath.join(output, f"{audio.get_file_stem()}.{format}")
|
|
203
|
+
try:
|
|
204
|
+
audio_bytes = audio_to_bytes(audio, format, start=0, duration=None)
|
|
205
|
+
except Exception as exc:
|
|
206
|
+
raise FileError(
|
|
207
|
+
"unable to convert audio file", audio.source, audio.path
|
|
208
|
+
) from exc
|
|
209
|
+
elif end is None:
|
|
210
|
+
# Extract from start to end of file
|
|
211
|
+
output_file = posixpath.join(
|
|
212
|
+
output, f"{audio.get_file_stem()}_{int(start * 1000):06d}_end.{format}"
|
|
213
|
+
)
|
|
214
|
+
try:
|
|
215
|
+
audio_bytes = audio_to_bytes(audio, format, start=start, duration=None)
|
|
216
|
+
except Exception as exc:
|
|
217
|
+
raise FileError(
|
|
218
|
+
"unable to save audio fragment", audio.source, audio.path
|
|
219
|
+
) from exc
|
|
220
|
+
else:
|
|
221
|
+
# Fragment extraction mode with specific end time
|
|
222
|
+
if end < 0 or start >= end:
|
|
223
|
+
raise ValueError(
|
|
224
|
+
f"Can't save audio for '{audio.path}', "
|
|
225
|
+
f"invalid time range: ({start:.3f}, {end:.3f})"
|
|
226
|
+
)
|
|
227
|
+
|
|
228
|
+
duration = end - start
|
|
229
|
+
start_ms = int(start * 1000)
|
|
230
|
+
end_ms = int(end * 1000)
|
|
231
|
+
output_file = posixpath.join(
|
|
232
|
+
output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
|
|
233
|
+
)
|
|
234
|
+
|
|
235
|
+
try:
|
|
236
|
+
audio_bytes = audio_to_bytes(audio, format, start, duration)
|
|
237
|
+
except Exception as exc:
|
|
238
|
+
raise FileError(
|
|
239
|
+
"unable to save audio fragment", audio.source, audio.path
|
|
240
|
+
) from exc
|
|
241
|
+
|
|
242
|
+
from datachain.lib.file import AudioFile
|
|
243
|
+
|
|
244
|
+
return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
import uuid
|
|
1
3
|
from collections.abc import Sequence
|
|
2
4
|
from datetime import datetime
|
|
3
5
|
from typing import ClassVar, Optional, Union, get_args, get_origin
|
|
@@ -80,7 +82,9 @@ def dict_to_data_model(
|
|
|
80
82
|
|
|
81
83
|
fields = {
|
|
82
84
|
name: (
|
|
83
|
-
anno
|
|
85
|
+
anno
|
|
86
|
+
if inspect.isclass(anno) and issubclass(anno, BaseModel)
|
|
87
|
+
else Optional[anno],
|
|
84
88
|
Field(
|
|
85
89
|
validation_alias=AliasChoices(name, original_names[idx] or name),
|
|
86
90
|
default=None,
|
|
@@ -101,6 +105,10 @@ def dict_to_data_model(
|
|
|
101
105
|
field_info[str(alias)] = (_name, field)
|
|
102
106
|
return field_info
|
|
103
107
|
|
|
108
|
+
# Generate random unique name if not provided
|
|
109
|
+
if not name:
|
|
110
|
+
name = f"DataModel_{uuid.uuid4().hex[:8]}"
|
|
111
|
+
|
|
104
112
|
return create_model(
|
|
105
113
|
name,
|
|
106
114
|
__base__=_DataModelStrict,
|
|
@@ -2388,7 +2388,7 @@ class DataChain:
|
|
|
2388
2388
|
placement: FileExportPlacement = "fullpath",
|
|
2389
2389
|
link_type: Literal["copy", "symlink"] = "copy",
|
|
2390
2390
|
num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
|
|
2391
|
-
anon: bool =
|
|
2391
|
+
anon: Optional[bool] = None,
|
|
2392
2392
|
client_config: Optional[dict] = None,
|
|
2393
2393
|
) -> None:
|
|
2394
2394
|
"""Export files from a specified signal to a directory. Files can be
|
|
@@ -2403,7 +2403,11 @@ class DataChain:
|
|
|
2403
2403
|
Falls back to `'copy'` if symlinking fails.
|
|
2404
2404
|
num_threads : number of threads to use for exporting files.
|
|
2405
2405
|
By default it uses 5 threads.
|
|
2406
|
-
anon: If
|
|
2406
|
+
anon: If True, we will treat cloud bucket as public one. Default behavior
|
|
2407
|
+
depends on the previous session configuration (e.g. happens in the
|
|
2408
|
+
initial `read_storage`) and particular cloud storage client
|
|
2409
|
+
implementation (e.g. S3 fallbacks to anonymous access if no credentials
|
|
2410
|
+
were found).
|
|
2407
2411
|
client_config: Optional configuration for the destination storage client
|
|
2408
2412
|
|
|
2409
2413
|
Example:
|
|
@@ -2421,8 +2425,8 @@ class DataChain:
|
|
|
2421
2425
|
):
|
|
2422
2426
|
raise ValueError("Files with the same name found")
|
|
2423
2427
|
|
|
2424
|
-
if anon:
|
|
2425
|
-
client_config = (client_config or {}) | {"anon":
|
|
2428
|
+
if anon is not None:
|
|
2429
|
+
client_config = (client_config or {}) | {"anon": anon}
|
|
2426
2430
|
|
|
2427
2431
|
progress_bar = tqdm(
|
|
2428
2432
|
desc=f"Exporting files to {output}: ",
|
|
@@ -25,19 +25,23 @@ def read_hf(
|
|
|
25
25
|
settings: Optional[dict] = None,
|
|
26
26
|
column: str = "",
|
|
27
27
|
model_name: str = "",
|
|
28
|
+
limit: int = 0,
|
|
28
29
|
**kwargs,
|
|
29
30
|
) -> "DataChain":
|
|
30
|
-
"""Generate chain from
|
|
31
|
+
"""Generate chain from Hugging Face Hub dataset.
|
|
31
32
|
|
|
32
33
|
Parameters:
|
|
33
34
|
dataset : Path or name of the dataset to read from Hugging Face Hub,
|
|
34
35
|
or an instance of `datasets.Dataset`-like object.
|
|
35
|
-
args : Additional positional arguments to pass to datasets.load_dataset
|
|
36
|
+
args : Additional positional arguments to pass to `datasets.load_dataset`.
|
|
36
37
|
session : Session to use for the chain.
|
|
37
38
|
settings : Settings to use for the chain.
|
|
38
39
|
column : Generated object column name.
|
|
39
40
|
model_name : Generated model name.
|
|
40
|
-
|
|
41
|
+
limit : Limit the number of items to read from the HF dataset.
|
|
42
|
+
Adds `take(limit)` to the `datasets.load_dataset`.
|
|
43
|
+
Defaults to 0 (no limit).
|
|
44
|
+
kwargs : Parameters to pass to `datasets.load_dataset`.
|
|
41
45
|
|
|
42
46
|
Example:
|
|
43
47
|
Load from Hugging Face Hub:
|
|
@@ -53,6 +57,18 @@ def read_hf(
|
|
|
53
57
|
import datachain as dc
|
|
54
58
|
chain = dc.read_hf(ds)
|
|
55
59
|
```
|
|
60
|
+
|
|
61
|
+
Streaming with limit, for large datasets:
|
|
62
|
+
```py
|
|
63
|
+
import datachain as dc
|
|
64
|
+
ds = dc.read_hf("beans", split="train", streaming=True, limit=10)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
or use HF split syntax (not supported if streaming is enabled):
|
|
68
|
+
```py
|
|
69
|
+
import datachain as dc
|
|
70
|
+
ds = dc.read_hf("beans", split="train[%10]")
|
|
71
|
+
```
|
|
56
72
|
"""
|
|
57
73
|
from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
|
|
58
74
|
|
|
@@ -72,4 +88,4 @@ def read_hf(
|
|
|
72
88
|
output = {column: model}
|
|
73
89
|
|
|
74
90
|
chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
|
|
75
|
-
return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
|
|
91
|
+
return chain.gen(HFGenerator(dataset, model, limit, *args, **kwargs), output=output)
|