datachain 0.26.4__tar.gz → 0.28.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.26.4 → datachain-0.28.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.26.4 → datachain-0.28.0}/PKG-INFO +6 -3
- {datachain-0.26.4 → datachain-0.28.0}/README.rst +4 -1
- {datachain-0.26.4 → datachain-0.28.0}/docs/commands/job/run.md +20 -6
- {datachain-0.26.4 → datachain-0.28.0}/docs/examples.md +21 -31
- {datachain-0.26.4 → datachain-0.28.0}/mkdocs.yml +1 -1
- {datachain-0.26.4 → datachain-0.28.0}/pyproject.toml +1 -1
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/parser/job.py +8 -3
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/job.py +2 -1
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/arrow.py +1 -1
- datachain-0.28.0/src/datachain/lib/audio.py +244 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/data_model.py +9 -1
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/hf.py +20 -4
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/file.py +43 -8
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/hf.py +17 -7
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/video.py +4 -1
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/studio.py +42 -27
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/PKG-INFO +6 -3
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_audio.py +3 -2
- datachain-0.28.0/tests/func/test_hf.py +142 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_studio_datetime_parsing.py +1 -1
- {datachain-0.26.4 → datachain-0.28.0}/tests/test_cli_studio.py +1 -1
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_audio.py +153 -34
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_datachain.py +0 -18
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_hf.py +3 -1
- datachain-0.26.4/src/datachain/lib/audio.py +0 -151
- datachain-0.26.4/tests/func/test_hf.py +0 -67
- {datachain-0.26.4 → datachain-0.28.0}/.cruft.json +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.gitattributes +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.github/codecov.yaml +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.github/dependabot.yml +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.github/workflows/release.yml +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/.gitignore +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/LICENSE +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/commands/auth/login.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/commands/auth/logout.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/commands/auth/team.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/commands/auth/token.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/commands/index.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/commands/job/cancel.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/commands/job/clusters.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/commands/job/logs.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/commands/job/ls.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/contributing.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/guide/db_migrations.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/guide/delta.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/guide/env.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/guide/index.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/guide/namespaces.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/guide/processing.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/guide/remotes.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/guide/retry.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/index.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/overrides/main.html +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/quick-start.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/datachain.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/func.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/index.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/toolkit.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/torch.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/references/udf.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/docs/tutorials.md +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/noxfile.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/setup.cfg +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/__main__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/asyn.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cache.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/local.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/config.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/dataset.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/delta.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/error.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/array.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/base.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/func.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/path.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/random.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/string.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/window.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/job.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/datachain.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/projects.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/listing.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/namespace.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/node.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/progress.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/project.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/py.typed +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/dataset.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/params.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/session.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/semver.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain/utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/conftest.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/data.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/examples/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_array.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_path.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_random.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_string.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_batching.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_client.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_datachain.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_delta.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_file.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_image.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_listing.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_ls.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_metastore.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_pull.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_query.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_read_database.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_retry.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_session.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_video.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/test_atomicity.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/test_import_time.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/test_telemetry.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_client.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_config.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_func.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_query.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_semver.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_session.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.26.4 → datachain-0.28.0}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.26.4
|
|
3
|
+
Version: 0.28.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -45,7 +45,7 @@ Requires-Dist: datamodel-code-generator>=0.25
|
|
|
45
45
|
Requires-Dist: Pillow<12,>=10.0.0
|
|
46
46
|
Requires-Dist: msgpack<2,>=1.0.4
|
|
47
47
|
Requires-Dist: psutil
|
|
48
|
-
Requires-Dist: huggingface_hub
|
|
48
|
+
Requires-Dist: huggingface_hub<0.34.0
|
|
49
49
|
Requires-Dist: iterative-telemetry>=0.0.10
|
|
50
50
|
Requires-Dist: platformdirs
|
|
51
51
|
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
@@ -120,7 +120,7 @@ Dynamic: license-file
|
|
|
120
120
|
|logo| DataChain
|
|
121
121
|
================
|
|
122
122
|
|
|
123
|
-
|PyPI| |Python Version| |Codecov| |Tests|
|
|
123
|
+
|PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
|
|
124
124
|
|
|
125
125
|
.. |logo| image:: docs/assets/datachain.svg
|
|
126
126
|
:height: 24
|
|
@@ -136,6 +136,9 @@ Dynamic: license-file
|
|
|
136
136
|
.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
|
|
137
137
|
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
138
138
|
:alt: Tests
|
|
139
|
+
.. |DeepWiki| image:: https://deepwiki.com/badge.svg
|
|
140
|
+
:target: https://deepwiki.com/iterative/datachain
|
|
141
|
+
:alt: DeepWiki
|
|
139
142
|
|
|
140
143
|
DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
|
|
141
144
|
data like images, audio, videos, text and PDFs. It integrates with external storage
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|logo| DataChain
|
|
3
3
|
================
|
|
4
4
|
|
|
5
|
-
|PyPI| |Python Version| |Codecov| |Tests|
|
|
5
|
+
|PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
|
|
6
6
|
|
|
7
7
|
.. |logo| image:: docs/assets/datachain.svg
|
|
8
8
|
:height: 24
|
|
@@ -18,6 +18,9 @@
|
|
|
18
18
|
.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
|
|
19
19
|
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
20
20
|
:alt: Tests
|
|
21
|
+
.. |DeepWiki| image:: https://deepwiki.com/badge.svg
|
|
22
|
+
:target: https://deepwiki.com/iterative/datachain
|
|
23
|
+
:alt: DeepWiki
|
|
21
24
|
|
|
22
25
|
DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
|
|
23
26
|
data like images, audio, videos, text and PDFs. It integrates with external storage
|
|
@@ -5,15 +5,22 @@ Run a job in Studio.
|
|
|
5
5
|
## Synopsis
|
|
6
6
|
|
|
7
7
|
```usage
|
|
8
|
-
usage: datachain job run [-h] [-v] [-q] [--team TEAM] [--env-file ENV_FILE]
|
|
9
|
-
[--
|
|
10
|
-
[--
|
|
8
|
+
usage: datachain job run [-h] [-v] [-q] [--team TEAM] [--env-file ENV_FILE]
|
|
9
|
+
[--env ENV [ENV ...]]
|
|
10
|
+
[--cluster CLUSTER] [--workers WORKERS]
|
|
11
|
+
[--files FILES [FILES ...]]
|
|
12
|
+
[--python-version PYTHON_VERSION]
|
|
13
|
+
[--repository REPOSITORY]
|
|
14
|
+
[--req-file REQ_FILE] [--req REQ [REQ ...]]
|
|
15
|
+
[--priority PRIORITY]
|
|
16
|
+
[--start-time START_TIME] [--cron CRON]
|
|
17
|
+
[--no-wait]
|
|
11
18
|
file
|
|
12
19
|
```
|
|
13
20
|
|
|
14
21
|
## Description
|
|
15
22
|
|
|
16
|
-
This command runs a job in Studio using the specified query file. You can configure various aspects of the job including environment variables, Python version, dependencies, and more. When using --start-time or --cron, the job is scheduled
|
|
23
|
+
This command runs a job in Studio using the specified query file. You can configure various aspects of the job including environment variables, Python version, dependencies, and more. When using --start-time or --cron, the job is scheduled to run but won't start immediately (it can be seen in the Tasks tab in the UI).
|
|
17
24
|
|
|
18
25
|
## Arguments
|
|
19
26
|
|
|
@@ -32,8 +39,9 @@ This command runs a job in Studio using the specified query file. You can config
|
|
|
32
39
|
* `--req-file REQ_FILE` - Python requirements file
|
|
33
40
|
* `--req REQ` - Python package requirements
|
|
34
41
|
* `--priority PRIORITY` - Priority for the job in range 0-5. Lower value is higher priority (default: 5)
|
|
35
|
-
* `--start-time START_TIME` -
|
|
42
|
+
* `--start-time START_TIME` - Time to schedule the task in YYYY-MM-DDTHH:mm format or natural language.
|
|
36
43
|
* `--cron CRON` - Cron expression for the cron task.
|
|
44
|
+
* `--no-wait` - Do not wait for the job to finish.
|
|
37
45
|
* `-h`, `--help` - Show the help message and exit.
|
|
38
46
|
* `-v`, `--verbose` - Be verbose.
|
|
39
47
|
* `-q`, `--quiet` - Be quiet.
|
|
@@ -125,6 +133,12 @@ datachain job run --cron "@monthly" query.py
|
|
|
125
133
|
datachain job run --start-time "tomorrow 3pm" --cron "0 0 * * *" query.py
|
|
126
134
|
```
|
|
127
135
|
|
|
136
|
+
12. Start the job and do not wait for the job to complete
|
|
137
|
+
```bash
|
|
138
|
+
# Do not follow or tail the logs from Studio.
|
|
139
|
+
datachain job run query.py --no-wait
|
|
140
|
+
```
|
|
141
|
+
|
|
128
142
|
## Notes
|
|
129
143
|
|
|
130
144
|
* Closing the logs command (e.g., with Ctrl+C) will only stop displaying the logs but will not cancel the job execution
|
|
@@ -132,7 +146,7 @@ datachain job run --start-time "tomorrow 3pm" --cron "0 0 * * *" query.py
|
|
|
132
146
|
* The job will continue running in Studio even after you stop viewing the logs
|
|
133
147
|
* You can get the list of compute clusters using `datachain job clusters` command.
|
|
134
148
|
* When using `--start-time` or `--cron` options, the job is scheduled as a task and will not show logs immediately. The job will be executed according to the schedule.
|
|
135
|
-
* The `--start-time` option supports natural language parsing using the dateparser library, allowing flexible time expressions like "tomorrow 3pm", "in 2 hours", "monday 9am", etc.
|
|
149
|
+
* The `--start-time` option supports natural language parsing using the [dateparser](https://dateparser.readthedocs.io/en/latest/) library, allowing flexible time expressions like "tomorrow 3pm", "in 2 hours", "monday 9am", etc.
|
|
136
150
|
* Cron expressions follow the standard format: minute hour day-of-month month day-of-week (e.g., "0 0 * * *" for daily at midnight) or Vixie cron-style “@” keyword expressions.
|
|
137
151
|
* The following options for Vixie cron-style expressions are supported:
|
|
138
152
|
* @midnight
|
|
@@ -10,55 +10,45 @@ title: Examples
|
|
|
10
10
|
|
|
11
11
|
Datachain is built by composing wrangling operations.
|
|
12
12
|
|
|
13
|
-
For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The below code takes images from the cloud, and applies
|
|
13
|
+
For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The code below takes images from the cloud, applies the BLIP Large model to caption the first five of them, and puts the results in the column "scene":
|
|
14
14
|
|
|
15
15
|
```python
|
|
16
16
|
import datachain as dc # (1)!
|
|
17
|
-
from transformers import
|
|
17
|
+
from transformers import Pipeline, pipeline
|
|
18
|
+
from datachain import File
|
|
18
19
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
processor = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
|
|
23
|
-
|
|
24
|
-
def process(file: File) -> str:
|
|
25
|
-
image=file.read().convert("RGB")
|
|
26
|
-
inputs = processor(text="caption", images=image, return_tensors="pt")
|
|
27
|
-
generate_ids = model.generate(**inputs, max_new_tokens=100)
|
|
28
|
-
return processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
|
20
|
+
def process(file: File, pipeline: Pipeline) -> str:
|
|
21
|
+
image = file.read().convert("RGB")
|
|
22
|
+
return pipeline(image)[0]["generated_text"]
|
|
29
23
|
|
|
30
24
|
chain = (
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
25
|
+
dc.read_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image", anon=True)
|
|
26
|
+
.limit(5)
|
|
27
|
+
.settings(cache=True)
|
|
28
|
+
.setup(pipeline=lambda: pipeline("image-to-text", model="Salesforce/blip-image-captioning-large"))
|
|
29
|
+
.map(scene=process)
|
|
30
|
+
.persist()
|
|
35
31
|
)
|
|
36
32
|
```
|
|
37
33
|
|
|
38
|
-
1. `pip install datachain`
|
|
39
|
-
2. `pip install transformers`
|
|
34
|
+
1. `pip install datachain[hf]`
|
|
40
35
|
|
|
41
36
|
Here is how we can view the results in a plot:
|
|
42
37
|
|
|
43
38
|
```python
|
|
44
39
|
import matplotlib.pyplot as plt
|
|
45
|
-
import re
|
|
46
40
|
from textwrap import wrap
|
|
47
41
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
return match.group(0) if match else ''
|
|
51
|
-
|
|
52
|
-
images = chain.collect("file")
|
|
53
|
-
captions = chain.collect("scene")
|
|
54
|
-
_ , axes = plt.subplots(1, len(captions), figsize=(15, 5))
|
|
42
|
+
count = chain.count()
|
|
43
|
+
_, axes = plt.subplots(1, count, figsize=(15, 5))
|
|
55
44
|
|
|
56
|
-
for ax,
|
|
57
|
-
ax.imshow(
|
|
58
|
-
ax.axis(
|
|
59
|
-
wrapped_caption = "\n".join(wrap(
|
|
60
|
-
ax.set_title(wrapped_caption, fontsize=
|
|
45
|
+
for ax, (img_file, caption) in zip(axes, chain.to_iter("file", "scene")):
|
|
46
|
+
ax.imshow(img_file.read(), cmap="gray")
|
|
47
|
+
ax.axis("off")
|
|
48
|
+
wrapped_caption = "\n".join(wrap(caption.strip(), 40))
|
|
49
|
+
ax.set_title(wrapped_caption, fontsize=10, pad=20)
|
|
61
50
|
|
|
51
|
+
plt.tight_layout()
|
|
62
52
|
plt.show()
|
|
63
53
|
```
|
|
64
54
|
|
|
@@ -177,7 +177,7 @@ plugins:
|
|
|
177
177
|
- https://numpy.org/doc/stable/objects.inv
|
|
178
178
|
- https://pandas.pydata.org/docs/objects.inv
|
|
179
179
|
- https://arrow.apache.org/docs/objects.inv
|
|
180
|
-
- https://docs.sqlalchemy.org/objects.inv
|
|
180
|
+
# - https://docs.sqlalchemy.org/objects.inv # SSL certificate issue
|
|
181
181
|
- https://docs.pydantic.dev/latest/objects.inv
|
|
182
182
|
|
|
183
183
|
watch:
|
|
@@ -20,8 +20,8 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
|
|
|
20
20
|
studio_run_description = "Run a job in Studio. \n"
|
|
21
21
|
studio_run_description += (
|
|
22
22
|
"When using --start-time or --cron,"
|
|
23
|
-
" the job is scheduled
|
|
24
|
-
"
|
|
23
|
+
" the job is scheduled to run but won't start immediately"
|
|
24
|
+
" (can be seen in the Tasks tab in UI)"
|
|
25
25
|
)
|
|
26
26
|
|
|
27
27
|
studio_run_parser = jobs_subparser.add_parser(
|
|
@@ -104,11 +104,16 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
|
|
|
104
104
|
studio_run_parser.add_argument(
|
|
105
105
|
"--start-time",
|
|
106
106
|
action="store",
|
|
107
|
-
help="
|
|
107
|
+
help="Time to schedule a task in YYYY-MM-DDTHH:mm format or natural language.",
|
|
108
108
|
)
|
|
109
109
|
studio_run_parser.add_argument(
|
|
110
110
|
"--cron", action="store", help="Cron expression for the cron task."
|
|
111
111
|
)
|
|
112
|
+
studio_run_parser.add_argument(
|
|
113
|
+
"--no-wait",
|
|
114
|
+
action="store_true",
|
|
115
|
+
help="Do not wait for the job to finish",
|
|
116
|
+
)
|
|
112
117
|
|
|
113
118
|
studio_ls_help = "List jobs in Studio"
|
|
114
119
|
studio_ls_description = "List jobs in Studio."
|
|
@@ -12,10 +12,11 @@ class JobStatus(int, Enum):
|
|
|
12
12
|
CANCELING = 7
|
|
13
13
|
CANCELED = 8
|
|
14
14
|
CANCELING_SCHEDULED = 9
|
|
15
|
+
TASK = 11
|
|
15
16
|
|
|
16
17
|
@classmethod
|
|
17
18
|
def finished(cls) -> tuple[int, ...]:
|
|
18
|
-
return cls.COMPLETE, cls.FAILED, cls.CANCELED
|
|
19
|
+
return cls.COMPLETE, cls.FAILED, cls.CANCELED, cls.TASK
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class JobQueryType(int, Enum):
|
|
@@ -245,7 +245,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
|
|
|
245
245
|
if field.nullable and not ModelStore.is_pydantic(dtype):
|
|
246
246
|
dtype = Optional[dtype] # type: ignore[assignment]
|
|
247
247
|
type_dict[field.name] = dtype
|
|
248
|
-
return dict_to_data_model(column, type_dict)
|
|
248
|
+
return dict_to_data_model(f"ArrowDataModel_{column}", type_dict)
|
|
249
249
|
if pa.types.is_map(col_type):
|
|
250
250
|
return dict
|
|
251
251
|
if isinstance(col_type, pa.lib.DictionaryType):
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
import posixpath
|
|
2
|
+
from typing import TYPE_CHECKING, Optional, Union
|
|
3
|
+
|
|
4
|
+
from datachain.lib.file import FileError
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from numpy import ndarray
|
|
8
|
+
|
|
9
|
+
from datachain.lib.file import Audio, AudioFile, File
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import torchaudio
|
|
13
|
+
except ImportError as exc:
|
|
14
|
+
raise ImportError(
|
|
15
|
+
"Missing dependencies for processing audio.\n"
|
|
16
|
+
"To install run:\n\n"
|
|
17
|
+
" pip install 'datachain[audio]'\n"
|
|
18
|
+
) from exc
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def audio_info(file: "Union[File, AudioFile]") -> "Audio":
|
|
22
|
+
"""Extract metadata like sample rate, channels, duration, and format."""
|
|
23
|
+
from datachain.lib.file import Audio
|
|
24
|
+
|
|
25
|
+
file = file.as_audio_file()
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
with file.open() as f:
|
|
29
|
+
info = torchaudio.info(f)
|
|
30
|
+
|
|
31
|
+
sample_rate = int(info.sample_rate)
|
|
32
|
+
channels = int(info.num_channels)
|
|
33
|
+
frames = int(info.num_frames)
|
|
34
|
+
duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
|
|
35
|
+
|
|
36
|
+
codec_name = getattr(info, "encoding", "")
|
|
37
|
+
file_ext = file.get_file_ext().lower()
|
|
38
|
+
format_name = _encoding_to_format(codec_name, file_ext)
|
|
39
|
+
|
|
40
|
+
bits_per_sample = getattr(info, "bits_per_sample", 0)
|
|
41
|
+
bit_rate = (
|
|
42
|
+
bits_per_sample * sample_rate * channels if bits_per_sample > 0 else -1
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
except Exception as exc:
|
|
46
|
+
raise FileError(
|
|
47
|
+
"unable to extract metadata from audio file", file.source, file.path
|
|
48
|
+
) from exc
|
|
49
|
+
|
|
50
|
+
return Audio(
|
|
51
|
+
sample_rate=sample_rate,
|
|
52
|
+
channels=channels,
|
|
53
|
+
duration=duration,
|
|
54
|
+
samples=frames,
|
|
55
|
+
format=format_name,
|
|
56
|
+
codec=codec_name,
|
|
57
|
+
bit_rate=bit_rate,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _encoding_to_format(encoding: str, file_ext: str) -> str:
|
|
62
|
+
"""
|
|
63
|
+
Map torchaudio encoding to a format name.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
encoding: The encoding string from torchaudio.info()
|
|
67
|
+
file_ext: The file extension as a fallback
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
Format name as a string
|
|
71
|
+
"""
|
|
72
|
+
# Direct mapping for formats that match exactly
|
|
73
|
+
encoding_map = {
|
|
74
|
+
"FLAC": "flac",
|
|
75
|
+
"MP3": "mp3",
|
|
76
|
+
"VORBIS": "ogg",
|
|
77
|
+
"AMR_WB": "amr",
|
|
78
|
+
"AMR_NB": "amr",
|
|
79
|
+
"OPUS": "opus",
|
|
80
|
+
"GSM": "gsm",
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
if encoding in encoding_map:
|
|
84
|
+
return encoding_map[encoding]
|
|
85
|
+
|
|
86
|
+
# For PCM variants, use file extension to determine format
|
|
87
|
+
if encoding.startswith("PCM_"):
|
|
88
|
+
# Common PCM formats by extension
|
|
89
|
+
pcm_formats = {
|
|
90
|
+
"wav": "wav",
|
|
91
|
+
"aiff": "aiff",
|
|
92
|
+
"au": "au",
|
|
93
|
+
"raw": "raw",
|
|
94
|
+
}
|
|
95
|
+
return pcm_formats.get(file_ext, "wav") # Default to wav for PCM
|
|
96
|
+
|
|
97
|
+
# Fallback to file extension if encoding is unknown
|
|
98
|
+
return file_ext if file_ext else "unknown"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def audio_to_np(
|
|
102
|
+
audio: "AudioFile", start: float = 0, duration: Optional[float] = None
|
|
103
|
+
) -> "tuple[ndarray, int]":
|
|
104
|
+
"""Load audio fragment as numpy array.
|
|
105
|
+
Multi-channel audio is transposed to (samples, channels)."""
|
|
106
|
+
if start < 0:
|
|
107
|
+
raise ValueError("start must be a non-negative float")
|
|
108
|
+
|
|
109
|
+
if duration is not None and duration <= 0:
|
|
110
|
+
raise ValueError("duration must be a positive float")
|
|
111
|
+
|
|
112
|
+
if hasattr(audio, "as_audio_file"):
|
|
113
|
+
audio = audio.as_audio_file()
|
|
114
|
+
|
|
115
|
+
try:
|
|
116
|
+
with audio.open() as f:
|
|
117
|
+
info = torchaudio.info(f)
|
|
118
|
+
sample_rate = info.sample_rate
|
|
119
|
+
|
|
120
|
+
frame_offset = int(start * sample_rate)
|
|
121
|
+
num_frames = int(duration * sample_rate) if duration is not None else -1
|
|
122
|
+
|
|
123
|
+
# Reset file pointer to the beginning
|
|
124
|
+
# This is important to ensure we read from the correct position later
|
|
125
|
+
f.seek(0)
|
|
126
|
+
|
|
127
|
+
waveform, sr = torchaudio.load(
|
|
128
|
+
f, frame_offset=frame_offset, num_frames=num_frames
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
audio_np = waveform.numpy()
|
|
132
|
+
|
|
133
|
+
if audio_np.shape[0] > 1:
|
|
134
|
+
audio_np = audio_np.T
|
|
135
|
+
else:
|
|
136
|
+
audio_np = audio_np.squeeze()
|
|
137
|
+
|
|
138
|
+
return audio_np, int(sr)
|
|
139
|
+
except Exception as exc:
|
|
140
|
+
raise FileError(
|
|
141
|
+
"unable to read audio fragment", audio.source, audio.path
|
|
142
|
+
) from exc
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def audio_to_bytes(
|
|
146
|
+
audio: "AudioFile",
|
|
147
|
+
format: str = "wav",
|
|
148
|
+
start: float = 0,
|
|
149
|
+
duration: Optional[float] = None,
|
|
150
|
+
) -> bytes:
|
|
151
|
+
"""Convert audio to bytes using soundfile.
|
|
152
|
+
|
|
153
|
+
If duration is None, converts from start to end of file.
|
|
154
|
+
If start is 0 and duration is None, converts entire file."""
|
|
155
|
+
y, sr = audio_to_np(audio, start, duration)
|
|
156
|
+
|
|
157
|
+
import io
|
|
158
|
+
|
|
159
|
+
import soundfile as sf
|
|
160
|
+
|
|
161
|
+
buffer = io.BytesIO()
|
|
162
|
+
sf.write(buffer, y, sr, format=format)
|
|
163
|
+
return buffer.getvalue()
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def save_audio(
|
|
167
|
+
audio: "AudioFile",
|
|
168
|
+
output: str,
|
|
169
|
+
format: Optional[str] = None,
|
|
170
|
+
start: float = 0,
|
|
171
|
+
end: Optional[float] = None,
|
|
172
|
+
) -> "AudioFile":
|
|
173
|
+
"""Save audio file or extract fragment to specified format.
|
|
174
|
+
|
|
175
|
+
Args:
|
|
176
|
+
audio: Source AudioFile object
|
|
177
|
+
output: Output directory path
|
|
178
|
+
format: Output format ('wav', 'mp3', etc). Defaults to source format
|
|
179
|
+
start: Start time in seconds (>= 0). Defaults to 0
|
|
180
|
+
end: End time in seconds. If None, extracts to end of file
|
|
181
|
+
|
|
182
|
+
Returns:
|
|
183
|
+
AudioFile: New audio file with format conversion/extraction applied
|
|
184
|
+
|
|
185
|
+
Examples:
|
|
186
|
+
save_audio(audio, "/path", "mp3") # Entire file to MP3
|
|
187
|
+
save_audio(audio, "s3://bucket/path", "wav", start=2.5) # From 2.5s to end
|
|
188
|
+
save_audio(audio, "/path", "flac", start=1, end=3) # Extract 1-3s fragment
|
|
189
|
+
"""
|
|
190
|
+
if format is None:
|
|
191
|
+
format = audio.get_file_ext()
|
|
192
|
+
|
|
193
|
+
# Validate start time
|
|
194
|
+
if start < 0:
|
|
195
|
+
raise ValueError(
|
|
196
|
+
f"Can't save audio for '{audio.path}', "
|
|
197
|
+
f"start time must be non-negative: {start:.3f}"
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
# Handle full file conversion when end is None and start is 0
|
|
201
|
+
if end is None and start == 0:
|
|
202
|
+
output_file = posixpath.join(output, f"{audio.get_file_stem()}.{format}")
|
|
203
|
+
try:
|
|
204
|
+
audio_bytes = audio_to_bytes(audio, format, start=0, duration=None)
|
|
205
|
+
except Exception as exc:
|
|
206
|
+
raise FileError(
|
|
207
|
+
"unable to convert audio file", audio.source, audio.path
|
|
208
|
+
) from exc
|
|
209
|
+
elif end is None:
|
|
210
|
+
# Extract from start to end of file
|
|
211
|
+
output_file = posixpath.join(
|
|
212
|
+
output, f"{audio.get_file_stem()}_{int(start * 1000):06d}_end.{format}"
|
|
213
|
+
)
|
|
214
|
+
try:
|
|
215
|
+
audio_bytes = audio_to_bytes(audio, format, start=start, duration=None)
|
|
216
|
+
except Exception as exc:
|
|
217
|
+
raise FileError(
|
|
218
|
+
"unable to save audio fragment", audio.source, audio.path
|
|
219
|
+
) from exc
|
|
220
|
+
else:
|
|
221
|
+
# Fragment extraction mode with specific end time
|
|
222
|
+
if end < 0 or start >= end:
|
|
223
|
+
raise ValueError(
|
|
224
|
+
f"Can't save audio for '{audio.path}', "
|
|
225
|
+
f"invalid time range: ({start:.3f}, {end:.3f})"
|
|
226
|
+
)
|
|
227
|
+
|
|
228
|
+
duration = end - start
|
|
229
|
+
start_ms = int(start * 1000)
|
|
230
|
+
end_ms = int(end * 1000)
|
|
231
|
+
output_file = posixpath.join(
|
|
232
|
+
output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
|
|
233
|
+
)
|
|
234
|
+
|
|
235
|
+
try:
|
|
236
|
+
audio_bytes = audio_to_bytes(audio, format, start, duration)
|
|
237
|
+
except Exception as exc:
|
|
238
|
+
raise FileError(
|
|
239
|
+
"unable to save audio fragment", audio.source, audio.path
|
|
240
|
+
) from exc
|
|
241
|
+
|
|
242
|
+
from datachain.lib.file import AudioFile
|
|
243
|
+
|
|
244
|
+
return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
import uuid
|
|
1
3
|
from collections.abc import Sequence
|
|
2
4
|
from datetime import datetime
|
|
3
5
|
from typing import ClassVar, Optional, Union, get_args, get_origin
|
|
@@ -80,7 +82,9 @@ def dict_to_data_model(
|
|
|
80
82
|
|
|
81
83
|
fields = {
|
|
82
84
|
name: (
|
|
83
|
-
anno
|
|
85
|
+
anno
|
|
86
|
+
if inspect.isclass(anno) and issubclass(anno, BaseModel)
|
|
87
|
+
else Optional[anno],
|
|
84
88
|
Field(
|
|
85
89
|
validation_alias=AliasChoices(name, original_names[idx] or name),
|
|
86
90
|
default=None,
|
|
@@ -101,6 +105,10 @@ def dict_to_data_model(
|
|
|
101
105
|
field_info[str(alias)] = (_name, field)
|
|
102
106
|
return field_info
|
|
103
107
|
|
|
108
|
+
# Generate random unique name if not provided
|
|
109
|
+
if not name:
|
|
110
|
+
name = f"DataModel_{uuid.uuid4().hex[:8]}"
|
|
111
|
+
|
|
104
112
|
return create_model(
|
|
105
113
|
name,
|
|
106
114
|
__base__=_DataModelStrict,
|
|
@@ -25,19 +25,23 @@ def read_hf(
|
|
|
25
25
|
settings: Optional[dict] = None,
|
|
26
26
|
column: str = "",
|
|
27
27
|
model_name: str = "",
|
|
28
|
+
limit: int = 0,
|
|
28
29
|
**kwargs,
|
|
29
30
|
) -> "DataChain":
|
|
30
|
-
"""Generate chain from
|
|
31
|
+
"""Generate chain from Hugging Face Hub dataset.
|
|
31
32
|
|
|
32
33
|
Parameters:
|
|
33
34
|
dataset : Path or name of the dataset to read from Hugging Face Hub,
|
|
34
35
|
or an instance of `datasets.Dataset`-like object.
|
|
35
|
-
args : Additional positional arguments to pass to datasets.load_dataset
|
|
36
|
+
args : Additional positional arguments to pass to `datasets.load_dataset`.
|
|
36
37
|
session : Session to use for the chain.
|
|
37
38
|
settings : Settings to use for the chain.
|
|
38
39
|
column : Generated object column name.
|
|
39
40
|
model_name : Generated model name.
|
|
40
|
-
|
|
41
|
+
limit : Limit the number of items to read from the HF dataset.
|
|
42
|
+
Adds `take(limit)` to the `datasets.load_dataset`.
|
|
43
|
+
Defaults to 0 (no limit).
|
|
44
|
+
kwargs : Parameters to pass to `datasets.load_dataset`.
|
|
41
45
|
|
|
42
46
|
Example:
|
|
43
47
|
Load from Hugging Face Hub:
|
|
@@ -53,6 +57,18 @@ def read_hf(
|
|
|
53
57
|
import datachain as dc
|
|
54
58
|
chain = dc.read_hf(ds)
|
|
55
59
|
```
|
|
60
|
+
|
|
61
|
+
Streaming with limit, for large datasets:
|
|
62
|
+
```py
|
|
63
|
+
import datachain as dc
|
|
64
|
+
ds = dc.read_hf("beans", split="train", streaming=True, limit=10)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
or use HF split syntax (not supported if streaming is enabled):
|
|
68
|
+
```py
|
|
69
|
+
import datachain as dc
|
|
70
|
+
ds = dc.read_hf("beans", split="train[:10%]")
|
|
71
|
+
```
|
|
56
72
|
"""
|
|
57
73
|
from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
|
|
58
74
|
|
|
@@ -72,4 +88,4 @@ def read_hf(
|
|
|
72
88
|
output = {column: model}
|
|
73
89
|
|
|
74
90
|
chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
|
|
75
|
-
return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
|
|
91
|
+
return chain.gen(HFGenerator(dataset, model, limit, *args, **kwargs), output=output)
|