datachain 0.30.2.tar.gz → 0.30.3.tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.30.2 → datachain-0.30.3}/.github/workflows/benchmarks.yml +1 -1
- {datachain-0.30.2 → datachain-0.30.3}/.github/workflows/release.yml +1 -1
- {datachain-0.30.2 → datachain-0.30.3}/.github/workflows/tests-studio.yml +2 -2
- {datachain-0.30.2 → datachain-0.30.3}/.github/workflows/tests.yml +19 -3
- {datachain-0.30.2 → datachain-0.30.3}/.github/workflows/update-template.yaml +1 -1
- {datachain-0.30.2 → datachain-0.30.3}/.pre-commit-config.yaml +1 -1
- {datachain-0.30.2 → datachain-0.30.3}/PKG-INFO +2 -2
- datachain-0.30.3/docs/references/func.md +38 -0
- datachain-0.30.3/docs/references/functions/aggregate.md +5 -0
- datachain-0.30.3/docs/references/functions/array.md +5 -0
- datachain-0.30.3/docs/references/functions/conditional.md +5 -0
- datachain-0.30.3/docs/references/functions/numeric.md +5 -0
- datachain-0.30.3/docs/references/functions/path.md +5 -0
- datachain-0.30.3/docs/references/functions/random.md +5 -0
- datachain-0.30.3/docs/references/functions/string.md +22 -0
- datachain-0.30.3/docs/references/functions/window.md +5 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/multimodal/audio-to-text.py +4 -1
- {datachain-0.30.2 → datachain-0.30.3}/mkdocs.yml +11 -2
- {datachain-0.30.2 → datachain-0.30.3}/pyproject.toml +1 -1
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/catalog/catalog.py +86 -29
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/commands/datasets.py +3 -2
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/data_storage/metastore.py +34 -9
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/delta.py +23 -12
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/func/string.py +8 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/database.py +50 -6
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/datachain.py +31 -9
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/datasets.py +9 -4
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/listing.py +5 -9
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/model/ultralytics/bbox.py +14 -12
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/model/ultralytics/pose.py +14 -12
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/model/ultralytics/segment.py +14 -12
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/query/dataset.py +20 -10
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain.egg-info/PKG-INFO +2 -2
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain.egg-info/SOURCES.txt +8 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_datasets.py +11 -18
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_pull.py +14 -4
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_to_database.py +62 -5
- datachain-0.30.2/docs/references/func.md +0 -5
- {datachain-0.30.2 → datachain-0.30.3}/.cruft.json +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/.gitattributes +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/.github/codecov.yaml +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/.github/dependabot.yml +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/.gitignore +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/LICENSE +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/README.rst +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/assets/datachain.svg +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/commands/auth/login.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/commands/auth/logout.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/commands/auth/team.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/commands/auth/token.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/commands/index.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/commands/job/cancel.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/commands/job/clusters.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/commands/job/logs.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/commands/job/ls.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/commands/job/run.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/contributing.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/examples.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/guide/db_migrations.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/guide/delta.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/guide/env.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/guide/index.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/guide/namespaces.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/guide/processing.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/guide/remotes.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/guide/retry.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/index.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/overrides/main.html +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/quick-start.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/data-types/file.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/data-types/index.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/data-types/pose.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/data-types/segment.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/datachain.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/index.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/toolkit.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/torch.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/references/udf.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/docs/tutorials.md +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/multimodal/wds.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/noxfile.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/setup.cfg +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/__main__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/asyn.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cache.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/cli/utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/client/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/client/azure.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/client/gcs.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/client/hf.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/client/local.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/client/s3.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/config.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/dataset.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/error.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/fs/reference.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/fs/utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/func/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/func/array.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/func/base.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/func/conditional.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/func/func.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/func/numeric.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/func/path.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/func/random.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/func/window.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/job.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/audio.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/clip.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/file.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/hf.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/image.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/listing.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/projects.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/settings.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/tar.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/text.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/udf.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/video.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/model/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/model/bbox.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/model/pose.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/model/segment.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/model/utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/namespace.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/node.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/progress.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/project.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/py.typed +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/query/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/query/batch.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/query/metrics.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/query/params.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/query/queue.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/query/schema.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/query/session.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/query/udf.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/query/utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/remote/studio.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/script_meta.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/semver.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/postgresql_dialect.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/postgresql_types.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/types.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/sql/utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/studio.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/telemetry.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain/utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/conftest.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/data.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/examples/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/examples/test_examples.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/examples/wds_data.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/data/lena.jpg +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/functions/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/functions/test_array.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/functions/test_path.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/functions/test_random.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/functions/test_string.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/model/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_audio.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_batching.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_catalog.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_client.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_data_storage.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_datachain.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_delta.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_file.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_hf.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_image.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_listing.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_ls.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_metastore.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_metrics.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_pytorch.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_query.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_read_database.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_retry.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_session.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_studio_datetime_parsing.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_toolkit.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_video.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/func/test_warehouse.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/scripts/feature_class.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/test_atomicity.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/test_cli_e2e.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/test_cli_studio.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/test_import_time.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/test_query_e2e.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/test_telemetry.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_audio.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_settings.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/model/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_asyn.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_cache.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_catalog.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_client.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_config.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_dataset.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_func.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_listing.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_metastore.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_query.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_query_params.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_semver.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_serializer.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_session.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_utils.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.30.2 → datachain-0.30.3}/tests/utils.py +0 -0
|
@@ -62,7 +62,7 @@ jobs:
|
|
|
62
62
|
echo "Studio branch: $STUDIO_BRANCH"
|
|
63
63
|
|
|
64
64
|
- name: Check out Studio
|
|
65
|
-
uses: actions/checkout@
|
|
65
|
+
uses: actions/checkout@v5
|
|
66
66
|
with:
|
|
67
67
|
fetch-depth: 0
|
|
68
68
|
repository: iterative/studio
|
|
@@ -70,7 +70,7 @@ jobs:
|
|
|
70
70
|
token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
|
|
71
71
|
|
|
72
72
|
- name: Check out repository
|
|
73
|
-
uses: actions/checkout@
|
|
73
|
+
uses: actions/checkout@v5
|
|
74
74
|
with:
|
|
75
75
|
path: './backend/datachain'
|
|
76
76
|
fetch-depth: 0
|
|
@@ -18,7 +18,7 @@ jobs:
|
|
|
18
18
|
runs-on: ubuntu-latest
|
|
19
19
|
steps:
|
|
20
20
|
- name: Check out the repository
|
|
21
|
-
uses: actions/checkout@
|
|
21
|
+
uses: actions/checkout@v5
|
|
22
22
|
with:
|
|
23
23
|
fetch-depth: 0
|
|
24
24
|
ref: ${{ github.event.pull_request.head.sha || github.ref }}
|
|
@@ -73,7 +73,7 @@ jobs:
|
|
|
73
73
|
|
|
74
74
|
steps:
|
|
75
75
|
- name: Check out the repository
|
|
76
|
-
uses: actions/checkout@
|
|
76
|
+
uses: actions/checkout@v5
|
|
77
77
|
with:
|
|
78
78
|
fetch-depth: 0
|
|
79
79
|
ref: ${{ github.event.pull_request.head.sha || github.ref }}
|
|
@@ -175,7 +175,7 @@ jobs:
|
|
|
175
175
|
- {os: ubuntu-latest-4-cores, pyv: "3.13", group: multimodal}
|
|
176
176
|
|
|
177
177
|
steps:
|
|
178
|
-
- uses: actions/checkout@
|
|
178
|
+
- uses: actions/checkout@v5
|
|
179
179
|
with:
|
|
180
180
|
ref: ${{ github.event.pull_request.head.sha || github.ref }}
|
|
181
181
|
|
|
@@ -194,6 +194,22 @@ jobs:
|
|
|
194
194
|
- name: Install nox
|
|
195
195
|
run: uv pip install nox --system
|
|
196
196
|
|
|
197
|
+
- name: Install FFmpeg on Windows
|
|
198
|
+
if: runner.os == 'Windows'
|
|
199
|
+
run: choco install ffmpeg
|
|
200
|
+
|
|
201
|
+
- name: Install FFmpeg on macOS
|
|
202
|
+
if: runner.os == 'macOS'
|
|
203
|
+
run: |
|
|
204
|
+
brew install ffmpeg
|
|
205
|
+
echo 'DYLD_FALLBACK_LIBRARY_PATH=/opt/homebrew/lib' >> "$GITHUB_ENV"
|
|
206
|
+
|
|
207
|
+
- name: Install FFmpeg on Ubuntu
|
|
208
|
+
if: runner.os == 'Linux'
|
|
209
|
+
run: |
|
|
210
|
+
sudo apt update
|
|
211
|
+
sudo apt install -y ffmpeg
|
|
212
|
+
|
|
197
213
|
- name: Set hf token
|
|
198
214
|
if: matrix.group == 'llm_and_nlp'
|
|
199
215
|
run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.30.2
|
|
3
|
+
Version: 0.30.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -45,7 +45,7 @@ Requires-Dist: datamodel-code-generator>=0.25
|
|
|
45
45
|
Requires-Dist: Pillow<12,>=10.0.0
|
|
46
46
|
Requires-Dist: msgpack<2,>=1.0.4
|
|
47
47
|
Requires-Dist: psutil
|
|
48
|
-
Requires-Dist: huggingface_hub
|
|
48
|
+
Requires-Dist: huggingface_hub
|
|
49
49
|
Requires-Dist: iterative-telemetry>=0.0.10
|
|
50
50
|
Requires-Dist: platformdirs
|
|
51
51
|
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Functions
|
|
2
|
+
|
|
3
|
+
Use built-in functions for data manipulation and analysis to operate on the underlying database storing the chain data. These functions are useful for operations like [`DataChain.filter`](datachain.md#datachain.lib.dc.DataChain.filter) and [`DataChain.mutate`](datachain.md#datachain.lib.dc.DataChain.mutate).
|
|
4
|
+
|
|
5
|
+
Functions are organized by category and accessed through their respective modules. For example, string functions are accessed via `func.string.length()`, array functions via `func.array.contains()`, etc.
|
|
6
|
+
|
|
7
|
+
!!! note "Global Function Access"
|
|
8
|
+
Only a subset of functions are available directly from `datachain.func` (e.g., `func.length`). Most functions should be accessed through their specific module namespace (e.g., `func.string.length`) to avoid naming conflicts.
|
|
9
|
+
|
|
10
|
+
## Function Categories
|
|
11
|
+
|
|
12
|
+
DataChain provides several categories of functions for different types of operations:
|
|
13
|
+
|
|
14
|
+
- **[Aggregate Functions](functions/aggregate.md)** - Functions for aggregating data like `sum`, `count`, `avg`, etc.
|
|
15
|
+
- **[Array Functions](functions/array.md)** - Functions for working with arrays and lists
|
|
16
|
+
- **[Conditional Functions](functions/conditional.md)** - Functions for conditional logic like `ifelse`, `case`, etc.
|
|
17
|
+
- **[Numeric Functions](functions/numeric.md)** - Functions for numeric operations and computations
|
|
18
|
+
- **[Path Functions](functions/path.md)** - Functions for working with file paths
|
|
19
|
+
- **[Random Functions](functions/random.md)** - Functions for generating random values
|
|
20
|
+
- **[String Functions](functions/string.md)** - Functions for string manipulation and processing
|
|
21
|
+
- **[Window Functions](functions/window.md)** - Functions for window operations
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from datachain.func import aggregate, array, conditional, numeric, path, random, string, window
|
|
27
|
+
|
|
28
|
+
# Access functions through their module namespaces
|
|
29
|
+
dc.mutate(
|
|
30
|
+
text_length=string.length("text_column"),
|
|
31
|
+
contains_item=array.contains("array_column", "value"),
|
|
32
|
+
file_extension=path.file_ext("file_path")
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# Some commonly used functions are also available directly
|
|
36
|
+
from datachain.func import sum, count, length, ifelse
|
|
37
|
+
dc.mutate(total=sum("amount"))
|
|
38
|
+
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# String Functions
|
|
2
|
+
|
|
3
|
+
Functions for string manipulation, text processing, and string analysis.
|
|
4
|
+
|
|
5
|
+
## Usage
|
|
6
|
+
|
|
7
|
+
String functions are available under the `func.string` namespace to avoid name collisions with other functions:
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from datachain.func import string
|
|
11
|
+
|
|
12
|
+
# Use string functions with the string namespace
|
|
13
|
+
dc.mutate(
|
|
14
|
+
str_len=string.length("text_column"),
|
|
15
|
+
parts=string.split("text_column", ","),
|
|
16
|
+
cleaned=string.replace("text_column", "old", "new"),
|
|
17
|
+
regex_cleaned=string.regexp_replace("text_column", r"\d+", "X"),
|
|
18
|
+
distance=string.byte_hamming_distance("col1", "col2")
|
|
19
|
+
)
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
::: datachain.func.string
|
|
@@ -35,7 +35,10 @@ def process(fragment: AudioFragment, pipeline: Pipeline) -> str:
|
|
|
35
35
|
audio_array = audio_array.mean(axis=1)
|
|
36
36
|
|
|
37
37
|
# Pass the numpy array with exact sampling rate from fragment
|
|
38
|
-
result = pipeline(
|
|
38
|
+
result = pipeline(
|
|
39
|
+
{"raw": audio_array, "sampling_rate": sample_rate},
|
|
40
|
+
generate_kwargs={"language": "en"},
|
|
41
|
+
)
|
|
39
42
|
return str(result["text"])
|
|
40
43
|
|
|
41
44
|
|
|
@@ -82,7 +82,16 @@ nav:
|
|
|
82
82
|
- Segment: references/data-types/segment.md
|
|
83
83
|
- UDF: references/udf.md
|
|
84
84
|
- Torch: references/torch.md
|
|
85
|
-
- Functions:
|
|
85
|
+
- Functions:
|
|
86
|
+
- Overview: references/func.md
|
|
87
|
+
- Aggregate: references/functions/aggregate.md
|
|
88
|
+
- Array: references/functions/array.md
|
|
89
|
+
- Conditional: references/functions/conditional.md
|
|
90
|
+
- Numeric: references/functions/numeric.md
|
|
91
|
+
- Path: references/functions/path.md
|
|
92
|
+
- Random: references/functions/random.md
|
|
93
|
+
- String: references/functions/string.md
|
|
94
|
+
- Window: references/functions/window.md
|
|
86
95
|
- Toolkit: references/toolkit.md
|
|
87
96
|
- 📖 CLI Reference:
|
|
88
97
|
- Overview: commands/index.md
|
|
@@ -177,7 +186,7 @@ plugins:
|
|
|
177
186
|
- https://numpy.org/doc/stable/objects.inv
|
|
178
187
|
- https://pandas.pydata.org/docs/objects.inv
|
|
179
188
|
- https://arrow.apache.org/docs/objects.inv
|
|
180
|
-
|
|
189
|
+
- https://docs.sqlalchemy.org/objects.inv # SSL certificate issue
|
|
181
190
|
- https://docs.pydantic.dev/latest/objects.inv
|
|
182
191
|
|
|
183
192
|
watch:
|
|
@@ -680,8 +680,9 @@ class Catalog:
|
|
|
680
680
|
ds_namespace, ds_project, ds_name = parse_dataset_name(ds_name)
|
|
681
681
|
assert ds_namespace
|
|
682
682
|
assert ds_project
|
|
683
|
-
|
|
684
|
-
|
|
683
|
+
dataset = self.get_dataset(
|
|
684
|
+
ds_name, namespace_name=ds_namespace, project_name=ds_project
|
|
685
|
+
)
|
|
685
686
|
if not ds_version:
|
|
686
687
|
ds_version = dataset.latest_version
|
|
687
688
|
dataset_sources = self.warehouse.get_dataset_sources(
|
|
@@ -807,7 +808,11 @@ class Catalog:
|
|
|
807
808
|
)
|
|
808
809
|
default_version = DEFAULT_DATASET_VERSION
|
|
809
810
|
try:
|
|
810
|
-
dataset = self.get_dataset(
|
|
811
|
+
dataset = self.get_dataset(
|
|
812
|
+
name,
|
|
813
|
+
namespace_name=project.namespace.name if project else None,
|
|
814
|
+
project_name=project.name if project else None,
|
|
815
|
+
)
|
|
811
816
|
default_version = dataset.next_version_patch
|
|
812
817
|
if update_version == "major":
|
|
813
818
|
default_version = dataset.next_version_major
|
|
@@ -1016,7 +1021,11 @@ class Catalog:
|
|
|
1016
1021
|
dc.save(name)
|
|
1017
1022
|
except Exception as e: # noqa: BLE001
|
|
1018
1023
|
try:
|
|
1019
|
-
ds = self.get_dataset(
|
|
1024
|
+
ds = self.get_dataset(
|
|
1025
|
+
name,
|
|
1026
|
+
namespace_name=project.namespace.name,
|
|
1027
|
+
project_name=project.name,
|
|
1028
|
+
)
|
|
1020
1029
|
self.metastore.update_dataset_status(
|
|
1021
1030
|
ds,
|
|
1022
1031
|
DatasetStatus.FAILED,
|
|
@@ -1033,7 +1042,11 @@ class Catalog:
|
|
|
1033
1042
|
except DatasetNotFoundError:
|
|
1034
1043
|
raise e from None
|
|
1035
1044
|
|
|
1036
|
-
ds = self.get_dataset(
|
|
1045
|
+
ds = self.get_dataset(
|
|
1046
|
+
name,
|
|
1047
|
+
namespace_name=project.namespace.name,
|
|
1048
|
+
project_name=project.name,
|
|
1049
|
+
)
|
|
1037
1050
|
|
|
1038
1051
|
self.update_dataset_version_with_warehouse_info(
|
|
1039
1052
|
ds,
|
|
@@ -1041,7 +1054,11 @@ class Catalog:
|
|
|
1041
1054
|
sources="\n".join(sources),
|
|
1042
1055
|
)
|
|
1043
1056
|
|
|
1044
|
-
return self.get_dataset(
|
|
1057
|
+
return self.get_dataset(
|
|
1058
|
+
name,
|
|
1059
|
+
namespace_name=project.namespace.name,
|
|
1060
|
+
project_name=project.name,
|
|
1061
|
+
)
|
|
1045
1062
|
|
|
1046
1063
|
def get_full_dataset_name(
|
|
1047
1064
|
self,
|
|
@@ -1077,22 +1094,23 @@ class Catalog:
|
|
|
1077
1094
|
return namespace_name, project_name, name
|
|
1078
1095
|
|
|
1079
1096
|
def get_dataset(
|
|
1080
|
-
self,
|
|
1097
|
+
self,
|
|
1098
|
+
name: str,
|
|
1099
|
+
namespace_name: Optional[str] = None,
|
|
1100
|
+
project_name: Optional[str] = None,
|
|
1081
1101
|
) -> DatasetRecord:
|
|
1082
1102
|
from datachain.lib.listing import is_listing_dataset
|
|
1083
1103
|
|
|
1084
|
-
|
|
1104
|
+
namespace_name = namespace_name or self.metastore.default_namespace_name
|
|
1105
|
+
project_name = project_name or self.metastore.default_project_name
|
|
1085
1106
|
|
|
1086
1107
|
if is_listing_dataset(name):
|
|
1087
|
-
|
|
1108
|
+
namespace_name = self.metastore.system_namespace_name
|
|
1109
|
+
project_name = self.metastore.listing_project_name
|
|
1088
1110
|
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
raise DatasetNotFoundError(
|
|
1093
|
-
f"Dataset {name} not found in namespace {project.namespace.name}"
|
|
1094
|
-
f" and project {project.name}"
|
|
1095
|
-
) from None
|
|
1111
|
+
return self.metastore.get_dataset(
|
|
1112
|
+
name, namespace_name=namespace_name, project_name=project_name
|
|
1113
|
+
)
|
|
1096
1114
|
|
|
1097
1115
|
def get_dataset_with_remote_fallback(
|
|
1098
1116
|
self,
|
|
@@ -1113,8 +1131,11 @@ class Catalog:
|
|
|
1113
1131
|
|
|
1114
1132
|
if self.metastore.is_local_dataset(namespace_name) or not update:
|
|
1115
1133
|
try:
|
|
1116
|
-
|
|
1117
|
-
|
|
1134
|
+
ds = self.get_dataset(
|
|
1135
|
+
name,
|
|
1136
|
+
namespace_name=namespace_name,
|
|
1137
|
+
project_name=project_name,
|
|
1138
|
+
)
|
|
1118
1139
|
if not version or ds.has_version(version):
|
|
1119
1140
|
return ds
|
|
1120
1141
|
except (NamespaceNotFoundError, ProjectNotFoundError, DatasetNotFoundError):
|
|
@@ -1139,7 +1160,9 @@ class Catalog:
|
|
|
1139
1160
|
local_ds_version=version,
|
|
1140
1161
|
)
|
|
1141
1162
|
return self.get_dataset(
|
|
1142
|
-
name,
|
|
1163
|
+
name,
|
|
1164
|
+
namespace_name=namespace_name,
|
|
1165
|
+
project_name=project_name,
|
|
1143
1166
|
)
|
|
1144
1167
|
|
|
1145
1168
|
return self.get_remote_dataset(namespace_name, project_name, name)
|
|
@@ -1148,7 +1171,11 @@ class Catalog:
|
|
|
1148
1171
|
"""Returns dataset that contains version with specific uuid"""
|
|
1149
1172
|
for dataset in self.ls_datasets():
|
|
1150
1173
|
if dataset.has_version_with_uuid(uuid):
|
|
1151
|
-
return self.get_dataset(
|
|
1174
|
+
return self.get_dataset(
|
|
1175
|
+
dataset.name,
|
|
1176
|
+
namespace_name=dataset.project.namespace.name,
|
|
1177
|
+
project_name=dataset.project.name,
|
|
1178
|
+
)
|
|
1152
1179
|
raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")
|
|
1153
1180
|
|
|
1154
1181
|
def get_remote_dataset(
|
|
@@ -1171,9 +1198,18 @@ class Catalog:
|
|
|
1171
1198
|
return DatasetRecord.from_dict(dataset_info)
|
|
1172
1199
|
|
|
1173
1200
|
def get_dataset_dependencies(
|
|
1174
|
-
self,
|
|
1201
|
+
self,
|
|
1202
|
+
name: str,
|
|
1203
|
+
version: str,
|
|
1204
|
+
namespace_name: Optional[str] = None,
|
|
1205
|
+
project_name: Optional[str] = None,
|
|
1206
|
+
indirect=False,
|
|
1175
1207
|
) -> list[Optional[DatasetDependency]]:
|
|
1176
|
-
dataset = self.get_dataset(
|
|
1208
|
+
dataset = self.get_dataset(
|
|
1209
|
+
name,
|
|
1210
|
+
namespace_name=namespace_name,
|
|
1211
|
+
project_name=project_name,
|
|
1212
|
+
)
|
|
1177
1213
|
|
|
1178
1214
|
direct_dependencies = self.metastore.get_direct_dataset_dependencies(
|
|
1179
1215
|
dataset, version
|
|
@@ -1187,10 +1223,13 @@ class Catalog:
|
|
|
1187
1223
|
# dependency has been removed
|
|
1188
1224
|
continue
|
|
1189
1225
|
if d.is_dataset:
|
|
1190
|
-
project = self.metastore.get_project(d.project, d.namespace)
|
|
1191
1226
|
# only datasets can have dependencies
|
|
1192
1227
|
d.dependencies = self.get_dataset_dependencies(
|
|
1193
|
-
d.name,
|
|
1228
|
+
d.name,
|
|
1229
|
+
d.version,
|
|
1230
|
+
namespace_name=d.namespace,
|
|
1231
|
+
project_name=d.project,
|
|
1232
|
+
indirect=indirect,
|
|
1194
1233
|
)
|
|
1195
1234
|
|
|
1196
1235
|
return direct_dependencies
|
|
@@ -1340,7 +1379,11 @@ class Catalog:
|
|
|
1340
1379
|
project: Optional[Project] = None,
|
|
1341
1380
|
client_config=None,
|
|
1342
1381
|
) -> list[str]:
|
|
1343
|
-
dataset = self.get_dataset(
|
|
1382
|
+
dataset = self.get_dataset(
|
|
1383
|
+
name,
|
|
1384
|
+
namespace_name=project.namespace.name if project else None,
|
|
1385
|
+
project_name=project.name if project else None,
|
|
1386
|
+
)
|
|
1344
1387
|
|
|
1345
1388
|
return self.warehouse.export_dataset_table(
|
|
1346
1389
|
bucket_uri, dataset, version, client_config
|
|
@@ -1349,7 +1392,11 @@ class Catalog:
|
|
|
1349
1392
|
def dataset_table_export_file_names(
|
|
1350
1393
|
self, name: str, version: str, project: Optional[Project] = None
|
|
1351
1394
|
) -> list[str]:
|
|
1352
|
-
dataset = self.get_dataset(
|
|
1395
|
+
dataset = self.get_dataset(
|
|
1396
|
+
name,
|
|
1397
|
+
namespace_name=project.namespace.name if project else None,
|
|
1398
|
+
project_name=project.name if project else None,
|
|
1399
|
+
)
|
|
1353
1400
|
return self.warehouse.dataset_table_export_file_names(dataset, version)
|
|
1354
1401
|
|
|
1355
1402
|
def remove_dataset(
|
|
@@ -1359,7 +1406,11 @@ class Catalog:
|
|
|
1359
1406
|
version: Optional[str] = None,
|
|
1360
1407
|
force: Optional[bool] = False,
|
|
1361
1408
|
):
|
|
1362
|
-
dataset = self.get_dataset(
|
|
1409
|
+
dataset = self.get_dataset(
|
|
1410
|
+
name,
|
|
1411
|
+
namespace_name=project.namespace.name if project else None,
|
|
1412
|
+
project_name=project.name if project else None,
|
|
1413
|
+
)
|
|
1363
1414
|
if not version and not force:
|
|
1364
1415
|
raise ValueError(f"Missing dataset version from input for dataset {name}")
|
|
1365
1416
|
if version and not dataset.has_version(version):
|
|
@@ -1395,7 +1446,11 @@ class Catalog:
|
|
|
1395
1446
|
if attrs is not None:
|
|
1396
1447
|
update_data["attrs"] = attrs # type: ignore[assignment]
|
|
1397
1448
|
|
|
1398
|
-
dataset = self.get_dataset(
|
|
1449
|
+
dataset = self.get_dataset(
|
|
1450
|
+
name,
|
|
1451
|
+
namespace_name=project.namespace.name if project else None,
|
|
1452
|
+
project_name=project.name if project else None,
|
|
1453
|
+
)
|
|
1399
1454
|
return self.update_dataset(dataset, **update_data)
|
|
1400
1455
|
|
|
1401
1456
|
def ls(
|
|
@@ -1549,7 +1604,9 @@ class Catalog:
|
|
|
1549
1604
|
)
|
|
1550
1605
|
|
|
1551
1606
|
try:
|
|
1552
|
-
local_dataset = self.get_dataset(
|
|
1607
|
+
local_dataset = self.get_dataset(
|
|
1608
|
+
local_ds_name, namespace_name=namespace.name, project_name=project.name
|
|
1609
|
+
)
|
|
1553
1610
|
if local_dataset and local_dataset.has_version(local_ds_version):
|
|
1554
1611
|
raise DataChainError(
|
|
1555
1612
|
f"Local dataset {local_ds_uri} already exists with different uuid,"
|
|
@@ -107,8 +107,9 @@ def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
|
|
|
107
107
|
def list_datasets_local_versions(catalog: "Catalog", name: str):
|
|
108
108
|
namespace_name, project_name, name = catalog.get_full_dataset_name(name)
|
|
109
109
|
|
|
110
|
-
|
|
111
|
-
|
|
110
|
+
ds = catalog.get_dataset(
|
|
111
|
+
name, namespace_name=namespace_name, project_name=project_name
|
|
112
|
+
)
|
|
112
113
|
for v in ds.versions:
|
|
113
114
|
yield (name, v.version)
|
|
114
115
|
|
|
@@ -301,7 +301,13 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
301
301
|
"""
|
|
302
302
|
|
|
303
303
|
@abstractmethod
|
|
304
|
-
def get_dataset(
|
|
304
|
+
def get_dataset(
|
|
305
|
+
self,
|
|
306
|
+
name: str, # normal, not full dataset name
|
|
307
|
+
namespace_name: Optional[str] = None,
|
|
308
|
+
project_name: Optional[str] = None,
|
|
309
|
+
conn=None,
|
|
310
|
+
) -> DatasetRecord:
|
|
305
311
|
"""Gets a single dataset by name."""
|
|
306
312
|
|
|
307
313
|
@abstractmethod
|
|
@@ -912,11 +918,14 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
912
918
|
**kwargs, # TODO registered = True / False
|
|
913
919
|
) -> DatasetRecord:
|
|
914
920
|
"""Creates new dataset."""
|
|
915
|
-
|
|
921
|
+
if not project_id:
|
|
922
|
+
project = self.default_project
|
|
923
|
+
else:
|
|
924
|
+
project = self.get_project_by_id(project_id)
|
|
916
925
|
|
|
917
926
|
query = self._datasets_insert().values(
|
|
918
927
|
name=name,
|
|
919
|
-
project_id=
|
|
928
|
+
project_id=project.id,
|
|
920
929
|
status=status,
|
|
921
930
|
feature_schema=json.dumps(feature_schema or {}),
|
|
922
931
|
created_at=datetime.now(timezone.utc),
|
|
@@ -935,7 +944,9 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
935
944
|
query = query.on_conflict_do_nothing(index_elements=["project_id", "name"])
|
|
936
945
|
self.db.execute(query)
|
|
937
946
|
|
|
938
|
-
return self.get_dataset(
|
|
947
|
+
return self.get_dataset(
|
|
948
|
+
name, namespace_name=project.namespace.name, project_name=project.name
|
|
949
|
+
)
|
|
939
950
|
|
|
940
951
|
def create_dataset_version( # noqa: PLR0913
|
|
941
952
|
self,
|
|
@@ -992,7 +1003,12 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
992
1003
|
)
|
|
993
1004
|
self.db.execute(query, conn=conn)
|
|
994
1005
|
|
|
995
|
-
return self.get_dataset(
|
|
1006
|
+
return self.get_dataset(
|
|
1007
|
+
dataset.name,
|
|
1008
|
+
namespace_name=dataset.project.namespace.name,
|
|
1009
|
+
project_name=dataset.project.name,
|
|
1010
|
+
conn=conn,
|
|
1011
|
+
)
|
|
996
1012
|
|
|
997
1013
|
def remove_dataset(self, dataset: DatasetRecord) -> None:
|
|
998
1014
|
"""Removes dataset."""
|
|
@@ -1216,21 +1232,30 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1216
1232
|
def get_dataset(
|
|
1217
1233
|
self,
|
|
1218
1234
|
name: str, # normal, not full dataset name
|
|
1219
|
-
|
|
1235
|
+
namespace_name: Optional[str] = None,
|
|
1236
|
+
project_name: Optional[str] = None,
|
|
1220
1237
|
conn=None,
|
|
1221
1238
|
) -> DatasetRecord:
|
|
1222
1239
|
"""
|
|
1223
1240
|
Gets a single dataset in project by dataset name.
|
|
1224
1241
|
"""
|
|
1225
|
-
|
|
1242
|
+
namespace_name = namespace_name or self.default_namespace_name
|
|
1243
|
+
project_name = project_name or self.default_project_name
|
|
1226
1244
|
|
|
1227
1245
|
d = self._datasets
|
|
1246
|
+
n = self._namespaces
|
|
1247
|
+
p = self._projects
|
|
1228
1248
|
query = self._base_dataset_query()
|
|
1229
|
-
query = query.where(
|
|
1249
|
+
query = query.where(
|
|
1250
|
+
d.c.name == name,
|
|
1251
|
+
n.c.name == namespace_name,
|
|
1252
|
+
p.c.name == project_name,
|
|
1253
|
+
) # type: ignore [attr-defined]
|
|
1230
1254
|
ds = self._parse_dataset(self.db.execute(query, conn=conn))
|
|
1231
1255
|
if not ds:
|
|
1232
1256
|
raise DatasetNotFoundError(
|
|
1233
|
-
f"Dataset {name} not found in
|
|
1257
|
+
f"Dataset {name} not found in namespace {namespace_name}"
|
|
1258
|
+
f" and project {project_name}"
|
|
1234
1259
|
)
|
|
1235
1260
|
|
|
1236
1261
|
return ds
|