datachain 0.26.0__tar.gz → 0.26.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.26.0 → datachain-0.26.2}/.pre-commit-config.yaml +1 -1
- {datachain-0.26.0 → datachain-0.26.2}/PKG-INFO +2 -2
- {datachain-0.26.0 → datachain-0.26.2}/pyproject.toml +1 -1
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/catalog/loader.py +4 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/__init__.py +2 -1
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/conditional.py +34 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/arrow.py +1 -1
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/data_model.py +11 -1
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/datachain.py +102 -44
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/hf.py +4 -2
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/hf.py +31 -10
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/pytorch.py +4 -1
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/signal_schema.py +9 -4
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/dataset.py +22 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain.egg-info/PKG-INFO +2 -2
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/functions/test_conditional.py +4 -3
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_dataset_query.py +1 -1
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_hf.py +6 -4
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_datachain.py +716 -1
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_hf.py +23 -17
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_partition_by.py +38 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_signal_schema.py +12 -6
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/sql/test_conditional.py +15 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_func.py +13 -0
- {datachain-0.26.0 → datachain-0.26.2}/.cruft.json +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.gitattributes +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.github/codecov.yaml +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.github/dependabot.yml +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.github/workflows/release.yml +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.github/workflows/tests.yml +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/.gitignore +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/LICENSE +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/README.rst +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/assets/datachain.svg +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/commands/auth/login.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/commands/auth/logout.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/commands/auth/team.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/commands/auth/token.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/commands/index.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/commands/job/cancel.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/commands/job/clusters.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/commands/job/logs.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/commands/job/ls.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/commands/job/run.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/contributing.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/examples.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/guide/db_migrations.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/guide/delta.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/guide/env.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/guide/index.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/guide/namespaces.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/guide/processing.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/guide/remotes.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/guide/retry.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/index.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/overrides/main.html +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/quick-start.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/data-types/file.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/data-types/index.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/data-types/pose.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/data-types/segment.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/datachain.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/func.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/index.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/toolkit.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/torch.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/references/udf.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/docs/tutorials.md +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/multimodal/wds.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/mkdocs.yml +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/noxfile.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/setup.cfg +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/__main__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/asyn.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cache.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/cli/utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/client/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/client/azure.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/client/gcs.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/client/hf.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/client/local.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/client/s3.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/config.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/dataset.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/delta.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/error.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/fs/reference.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/fs/utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/array.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/base.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/func.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/numeric.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/path.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/random.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/string.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/func/window.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/job.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/audio.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/clip.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/file.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/image.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/listing.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/projects.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/settings.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/tar.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/text.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/udf.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/video.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/listing.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/model/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/model/bbox.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/model/pose.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/model/segment.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/model/utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/namespace.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/node.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/progress.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/project.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/py.typed +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/batch.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/metrics.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/params.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/queue.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/schema.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/session.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/udf.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/query/utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/remote/studio.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/script_meta.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/semver.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/types.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/sql/utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/studio.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/telemetry.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain/utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/conftest.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/data.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/examples/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/examples/test_examples.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/examples/wds_data.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/data/lena.jpg +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/functions/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/functions/test_array.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/functions/test_path.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/functions/test_random.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/functions/test_string.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/model/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_audio.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_batching.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_catalog.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_client.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_data_storage.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_datachain.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_datasets.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_delta.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_file.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_image.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_listing.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_ls.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_metastore.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_metrics.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_pull.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_pytorch.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_query.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_read_database.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_retry.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_session.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_toolkit.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_video.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/func/test_warehouse.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/scripts/feature_class.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/test_atomicity.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/test_cli_e2e.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/test_cli_studio.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/test_import_time.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/test_query_e2e.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/test_telemetry.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_audio.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/model/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_asyn.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_cache.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_catalog.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_client.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_config.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_dataset.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_listing.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_metastore.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_query.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_query_params.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_semver.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_serializer.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_session.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_utils.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.26.0 → datachain-0.26.2}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.26.0
|
|
3
|
+
Version: 0.26.2
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -98,7 +98,7 @@ Requires-Dist: scipy; extra == "tests"
|
|
|
98
98
|
Requires-Dist: ultralytics; extra == "tests"
|
|
99
99
|
Provides-Extra: dev
|
|
100
100
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
101
|
-
Requires-Dist: mypy==1.
|
|
101
|
+
Requires-Dist: mypy==1.17.0; extra == "dev"
|
|
102
102
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
103
103
|
Requires-Dist: types-pytz; extra == "dev"
|
|
104
104
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -18,6 +18,7 @@ WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
|
|
|
18
18
|
WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
|
|
19
19
|
DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
|
|
20
20
|
DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
|
|
21
|
+
DISTRIBUTED_DISABLED = "DATACHAIN_DISTRIBUTED_DISABLED"
|
|
21
22
|
|
|
22
23
|
IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
|
|
23
24
|
|
|
@@ -103,6 +104,9 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
|
|
|
103
104
|
|
|
104
105
|
|
|
105
106
|
def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
|
|
107
|
+
if os.environ.get(DISTRIBUTED_DISABLED) == "True":
|
|
108
|
+
return None
|
|
109
|
+
|
|
106
110
|
if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
|
|
107
111
|
return None
|
|
108
112
|
|
|
@@ -16,7 +16,7 @@ from .aggregate import (
|
|
|
16
16
|
sum,
|
|
17
17
|
)
|
|
18
18
|
from .array import contains, cosine_distance, euclidean_distance, length, sip_hash_64
|
|
19
|
-
from .conditional import and_, case, greatest, ifelse, isnone, least, or_
|
|
19
|
+
from .conditional import and_, case, greatest, ifelse, isnone, least, not_, or_
|
|
20
20
|
from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
|
|
21
21
|
from .path import file_ext, file_stem, name, parent
|
|
22
22
|
from .random import rand
|
|
@@ -54,6 +54,7 @@ __all__ = [
|
|
|
54
54
|
"max",
|
|
55
55
|
"min",
|
|
56
56
|
"name",
|
|
57
|
+
"not_",
|
|
57
58
|
"or_",
|
|
58
59
|
"parent",
|
|
59
60
|
"path",
|
|
@@ -3,6 +3,7 @@ from typing import Optional, Union
|
|
|
3
3
|
from sqlalchemy import ColumnElement
|
|
4
4
|
from sqlalchemy import and_ as sql_and
|
|
5
5
|
from sqlalchemy import case as sql_case
|
|
6
|
+
from sqlalchemy import not_ as sql_not
|
|
6
7
|
from sqlalchemy import or_ as sql_or
|
|
7
8
|
|
|
8
9
|
from datachain.lib.utils import DataChainParamsError
|
|
@@ -288,3 +289,36 @@ def and_(*args: Union[ColumnElement, Func]) -> Func:
|
|
|
288
289
|
func_args.append(arg)
|
|
289
290
|
|
|
290
291
|
return Func("and", inner=sql_and, cols=cols, args=func_args, result_type=bool)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def not_(arg: Union[ColumnElement, Func]) -> Func:
|
|
295
|
+
"""
|
|
296
|
+
Returns the function that produces NOT of the given expressions.
|
|
297
|
+
|
|
298
|
+
Args:
|
|
299
|
+
arg (ColumnElement | Func): The expression for NOT statement.
|
|
300
|
+
If a string is provided, it is assumed to be the name of the column.
|
|
301
|
+
If a Column is provided, it is assumed to be a column in the dataset.
|
|
302
|
+
If a Func is provided, it is assumed to be a function returning a value.
|
|
303
|
+
|
|
304
|
+
Returns:
|
|
305
|
+
Func: A `Func` object that represents the NOT function.
|
|
306
|
+
|
|
307
|
+
Example:
|
|
308
|
+
```py
|
|
309
|
+
dc.mutate(
|
|
310
|
+
test=not_(C("value") == 5)
|
|
311
|
+
)
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
Notes:
|
|
315
|
+
- The result column will always be of type bool.
|
|
316
|
+
"""
|
|
317
|
+
cols, func_args = [], []
|
|
318
|
+
|
|
319
|
+
if isinstance(arg, (str, Func)):
|
|
320
|
+
cols.append(arg)
|
|
321
|
+
else:
|
|
322
|
+
func_args.append(arg)
|
|
323
|
+
|
|
324
|
+
return Func("not", inner=sql_not, cols=cols, args=func_args, result_type=bool)
|
|
@@ -262,7 +262,7 @@ def _get_hf_schema(
|
|
|
262
262
|
from datachain.lib.hf import get_output_schema, schema_from_arrow
|
|
263
263
|
|
|
264
264
|
features = schema_from_arrow(schema)
|
|
265
|
-
return features, get_output_schema(features)
|
|
265
|
+
return features, get_output_schema(features)[0]
|
|
266
266
|
return None
|
|
267
267
|
|
|
268
268
|
|
|
@@ -3,6 +3,7 @@ from datetime import datetime
|
|
|
3
3
|
from typing import ClassVar, Optional, Union, get_args, get_origin
|
|
4
4
|
|
|
5
5
|
from pydantic import AliasChoices, BaseModel, Field, create_model
|
|
6
|
+
from pydantic.fields import FieldInfo
|
|
6
7
|
|
|
7
8
|
from datachain.lib.model_store import ModelStore
|
|
8
9
|
from datachain.lib.utils import normalize_col_names
|
|
@@ -89,7 +90,16 @@ def dict_to_data_model(
|
|
|
89
90
|
}
|
|
90
91
|
|
|
91
92
|
class _DataModelStrict(BaseModel, extra="forbid"):
|
|
92
|
-
|
|
93
|
+
@classmethod
|
|
94
|
+
def _model_fields_by_aliases(cls) -> dict[str, tuple[str, FieldInfo]]:
|
|
95
|
+
"""Returns a map of aliases to original field names and info."""
|
|
96
|
+
field_info = {}
|
|
97
|
+
for _name, field in cls.model_fields.items():
|
|
98
|
+
assert isinstance(field.validation_alias, AliasChoices)
|
|
99
|
+
# Add mapping for all aliases (both normalized and original names)
|
|
100
|
+
for alias in field.validation_alias.choices:
|
|
101
|
+
field_info[str(alias)] = (_name, field)
|
|
102
|
+
return field_info
|
|
93
103
|
|
|
94
104
|
return create_model(
|
|
95
105
|
name,
|
|
@@ -33,7 +33,13 @@ from datachain.func import literal
|
|
|
33
33
|
from datachain.func.base import Function
|
|
34
34
|
from datachain.func.func import Func
|
|
35
35
|
from datachain.lib.convert.python_to_sql import python_to_sql
|
|
36
|
-
from datachain.lib.data_model import
|
|
36
|
+
from datachain.lib.data_model import (
|
|
37
|
+
DataModel,
|
|
38
|
+
DataType,
|
|
39
|
+
DataValue,
|
|
40
|
+
StandardType,
|
|
41
|
+
dict_to_data_model,
|
|
42
|
+
)
|
|
37
43
|
from datachain.lib.file import (
|
|
38
44
|
EXPORT_FILES_MAX_THREADS,
|
|
39
45
|
ArrowRow,
|
|
@@ -360,14 +366,6 @@ class DataChain:
|
|
|
360
366
|
self._settings = settings if settings else Settings()
|
|
361
367
|
return self
|
|
362
368
|
|
|
363
|
-
def reset_schema(self, signals_schema: SignalSchema) -> "Self":
|
|
364
|
-
self.signals_schema = signals_schema
|
|
365
|
-
return self
|
|
366
|
-
|
|
367
|
-
def add_schema(self, signals_schema: SignalSchema) -> "Self":
|
|
368
|
-
self.signals_schema |= signals_schema
|
|
369
|
-
return self
|
|
370
|
-
|
|
371
369
|
@classmethod
|
|
372
370
|
def from_storage(
|
|
373
371
|
cls,
|
|
@@ -958,7 +956,7 @@ class DataChain:
|
|
|
958
956
|
query_func = getattr(self._query, method_name)
|
|
959
957
|
|
|
960
958
|
new_schema = self.signals_schema.resolve(*args)
|
|
961
|
-
columns =
|
|
959
|
+
columns = new_schema.db_signals(as_columns=True)
|
|
962
960
|
return query_func(*columns, **kwargs)
|
|
963
961
|
|
|
964
962
|
@resolve_columns
|
|
@@ -1445,10 +1443,6 @@ class DataChain:
|
|
|
1445
1443
|
remove_prefetched=remove_prefetched,
|
|
1446
1444
|
)
|
|
1447
1445
|
|
|
1448
|
-
def remove_file_signals(self) -> "Self":
|
|
1449
|
-
schema = self.signals_schema.clone_without_file_signals()
|
|
1450
|
-
return self.select(*schema.values.keys())
|
|
1451
|
-
|
|
1452
1446
|
@delta_disabled
|
|
1453
1447
|
def merge(
|
|
1454
1448
|
self,
|
|
@@ -1803,12 +1797,19 @@ class DataChain:
|
|
|
1803
1797
|
)
|
|
1804
1798
|
return read_pandas(*args, **kwargs)
|
|
1805
1799
|
|
|
1806
|
-
def to_pandas(
|
|
1800
|
+
def to_pandas(
|
|
1801
|
+
self,
|
|
1802
|
+
flatten: bool = False,
|
|
1803
|
+
include_hidden: bool = True,
|
|
1804
|
+
) -> "pd.DataFrame":
|
|
1807
1805
|
"""Return a pandas DataFrame from the chain.
|
|
1808
1806
|
|
|
1809
1807
|
Parameters:
|
|
1810
|
-
flatten
|
|
1811
|
-
include_hidden
|
|
1808
|
+
flatten: Whether to use a multiindex or flatten column names.
|
|
1809
|
+
include_hidden: Whether to include hidden columns.
|
|
1810
|
+
|
|
1811
|
+
Returns:
|
|
1812
|
+
pd.DataFrame: A pandas DataFrame representation of the chain.
|
|
1812
1813
|
"""
|
|
1813
1814
|
import pandas as pd
|
|
1814
1815
|
|
|
@@ -1826,19 +1827,19 @@ class DataChain:
|
|
|
1826
1827
|
def show(
|
|
1827
1828
|
self,
|
|
1828
1829
|
limit: int = 20,
|
|
1829
|
-
flatten=False,
|
|
1830
|
-
transpose=False,
|
|
1831
|
-
truncate=True,
|
|
1832
|
-
include_hidden=False,
|
|
1830
|
+
flatten: bool = False,
|
|
1831
|
+
transpose: bool = False,
|
|
1832
|
+
truncate: bool = True,
|
|
1833
|
+
include_hidden: bool = False,
|
|
1833
1834
|
) -> None:
|
|
1834
1835
|
"""Show a preview of the chain results.
|
|
1835
1836
|
|
|
1836
1837
|
Parameters:
|
|
1837
|
-
limit
|
|
1838
|
-
flatten
|
|
1839
|
-
transpose
|
|
1840
|
-
truncate
|
|
1841
|
-
include_hidden
|
|
1838
|
+
limit: How many rows to show.
|
|
1839
|
+
flatten: Whether to use a multiindex or flatten column names.
|
|
1840
|
+
transpose: Whether to transpose rows and columns.
|
|
1841
|
+
truncate: Whether or not to truncate the contents of columns.
|
|
1842
|
+
include_hidden: Whether to include hidden columns.
|
|
1842
1843
|
"""
|
|
1843
1844
|
import pandas as pd
|
|
1844
1845
|
|
|
@@ -2268,21 +2269,73 @@ class DataChain:
|
|
|
2268
2269
|
)
|
|
2269
2270
|
return read_records(*args, **kwargs)
|
|
2270
2271
|
|
|
2271
|
-
def sum(self,
|
|
2272
|
-
"""Compute the sum of a column.
|
|
2273
|
-
|
|
2272
|
+
def sum(self, col: str) -> StandardType: # type: ignore[override]
|
|
2273
|
+
"""Compute the sum of a column.
|
|
2274
|
+
|
|
2275
|
+
Parameters:
|
|
2276
|
+
col: The column to compute the sum for.
|
|
2277
|
+
|
|
2278
|
+
Returns:
|
|
2279
|
+
The sum of the column values.
|
|
2280
|
+
|
|
2281
|
+
Example:
|
|
2282
|
+
```py
|
|
2283
|
+
total_size = chain.sum("file.size")
|
|
2284
|
+
print(f"Total size: {total_size}")
|
|
2285
|
+
```
|
|
2286
|
+
"""
|
|
2287
|
+
return self._extend_to_data_model("sum", col)
|
|
2288
|
+
|
|
2289
|
+
def avg(self, col: str) -> StandardType: # type: ignore[override]
|
|
2290
|
+
"""Compute the average of a column.
|
|
2291
|
+
|
|
2292
|
+
Parameters:
|
|
2293
|
+
col: The column to compute the average for.
|
|
2294
|
+
|
|
2295
|
+
Returns:
|
|
2296
|
+
The average of the column values.
|
|
2297
|
+
|
|
2298
|
+
Example:
|
|
2299
|
+
```py
|
|
2300
|
+
average_size = chain.avg("file.size")
|
|
2301
|
+
print(f"Average size: {average_size}")
|
|
2302
|
+
```
|
|
2303
|
+
"""
|
|
2304
|
+
return self._extend_to_data_model("avg", col)
|
|
2305
|
+
|
|
2306
|
+
def min(self, col: str) -> StandardType: # type: ignore[override]
|
|
2307
|
+
"""Compute the minimum of a column.
|
|
2308
|
+
|
|
2309
|
+
Parameters:
|
|
2310
|
+
col: The column to compute the minimum for.
|
|
2311
|
+
|
|
2312
|
+
Returns:
|
|
2313
|
+
The minimum value in the column.
|
|
2314
|
+
|
|
2315
|
+
Example:
|
|
2316
|
+
```py
|
|
2317
|
+
min_size = chain.min("file.size")
|
|
2318
|
+
print(f"Minimum size: {min_size}")
|
|
2319
|
+
```
|
|
2320
|
+
"""
|
|
2321
|
+
return self._extend_to_data_model("min", col)
|
|
2322
|
+
|
|
2323
|
+
def max(self, col: str) -> StandardType: # type: ignore[override]
|
|
2324
|
+
"""Compute the maximum of a column.
|
|
2274
2325
|
|
|
2275
|
-
|
|
2276
|
-
|
|
2277
|
-
return self._extend_to_data_model("avg", fr)
|
|
2326
|
+
Parameters:
|
|
2327
|
+
col: The column to compute the maximum for.
|
|
2278
2328
|
|
|
2279
|
-
|
|
2280
|
-
|
|
2281
|
-
return self._extend_to_data_model("min", fr)
|
|
2329
|
+
Returns:
|
|
2330
|
+
The maximum value in the column.
|
|
2282
2331
|
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
|
|
2332
|
+
Example:
|
|
2333
|
+
```py
|
|
2334
|
+
max_size = chain.max("file.size")
|
|
2335
|
+
print(f"Maximum size: {max_size}")
|
|
2336
|
+
```
|
|
2337
|
+
"""
|
|
2338
|
+
return self._extend_to_data_model("max", col)
|
|
2286
2339
|
|
|
2287
2340
|
def setup(self, **kwargs) -> "Self":
|
|
2288
2341
|
"""Setup variables to pass to UDF functions.
|
|
@@ -2393,14 +2446,15 @@ class DataChain:
|
|
|
2393
2446
|
"""Shuffle the rows of the chain deterministically."""
|
|
2394
2447
|
return self.order_by("sys.rand")
|
|
2395
2448
|
|
|
2396
|
-
def sample(self, n) -> "Self":
|
|
2449
|
+
def sample(self, n: int) -> "Self":
|
|
2397
2450
|
"""Return a random sample from the chain.
|
|
2398
2451
|
|
|
2399
2452
|
Parameters:
|
|
2400
|
-
n
|
|
2453
|
+
n: Number of samples to draw.
|
|
2401
2454
|
|
|
2402
|
-
|
|
2403
|
-
|
|
2455
|
+
Note:
|
|
2456
|
+
Samples are not deterministic, and streamed/paginated queries or
|
|
2457
|
+
multiple workers will draw samples with replacement.
|
|
2404
2458
|
"""
|
|
2405
2459
|
return self._evolve(query=self._query.sample(n))
|
|
2406
2460
|
|
|
@@ -2507,6 +2561,10 @@ class DataChain:
|
|
|
2507
2561
|
def chunk(self, index: int, total: int) -> "Self":
|
|
2508
2562
|
"""Split a chain into smaller chunks for e.g. parallelization.
|
|
2509
2563
|
|
|
2564
|
+
Parameters:
|
|
2565
|
+
index: The index of the chunk (0-indexed).
|
|
2566
|
+
total: The total number of chunks.
|
|
2567
|
+
|
|
2510
2568
|
Example:
|
|
2511
2569
|
```py
|
|
2512
2570
|
import datachain as dc
|
|
@@ -2526,7 +2584,7 @@ class DataChain:
|
|
|
2526
2584
|
"""Returns a list of rows of values, optionally limited to the specified
|
|
2527
2585
|
columns.
|
|
2528
2586
|
|
|
2529
|
-
|
|
2587
|
+
Parameters:
|
|
2530
2588
|
*cols: Limit to the specified columns. By default, all columns are selected.
|
|
2531
2589
|
|
|
2532
2590
|
Returns:
|
|
@@ -2556,7 +2614,7 @@ class DataChain:
|
|
|
2556
2614
|
def to_values(self, col: str) -> list[DataValue]:
|
|
2557
2615
|
"""Returns a flat list of values from a single column.
|
|
2558
2616
|
|
|
2559
|
-
|
|
2617
|
+
Parameters:
|
|
2560
2618
|
col: The name of the column to extract values from.
|
|
2561
2619
|
|
|
2562
2620
|
Returns:
|
|
@@ -32,6 +32,7 @@ def read_hf(
|
|
|
32
32
|
Parameters:
|
|
33
33
|
dataset : Path or name of the dataset to read from Hugging Face Hub,
|
|
34
34
|
or an instance of `datasets.Dataset`-like object.
|
|
35
|
+
args : Additional positional arguments to pass to datasets.load_dataset.
|
|
35
36
|
session : Session to use for the chain.
|
|
36
37
|
settings : Settings to use for the chain.
|
|
37
38
|
column : Generated object column name.
|
|
@@ -64,8 +65,9 @@ def read_hf(
|
|
|
64
65
|
|
|
65
66
|
model_name = model_name or column or ""
|
|
66
67
|
hf_features = next(iter(ds_dict.values())).features
|
|
67
|
-
|
|
68
|
-
|
|
68
|
+
hf_output, normalized_names = get_output_schema(hf_features, list(output.keys()))
|
|
69
|
+
output = output | hf_output
|
|
70
|
+
model = dict_to_data_model(model_name, output, list(normalized_names.values()))
|
|
69
71
|
if column:
|
|
70
72
|
output = {column: model}
|
|
71
73
|
|
|
@@ -26,7 +26,7 @@ except ImportError as exc:
|
|
|
26
26
|
) from exc
|
|
27
27
|
|
|
28
28
|
from io import BytesIO
|
|
29
|
-
from typing import TYPE_CHECKING, Any, Union
|
|
29
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
30
30
|
|
|
31
31
|
import PIL
|
|
32
32
|
from tqdm.auto import tqdm
|
|
@@ -34,6 +34,7 @@ from tqdm.auto import tqdm
|
|
|
34
34
|
from datachain.lib.arrow import arrow_type_mapper
|
|
35
35
|
from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
|
|
36
36
|
from datachain.lib.udf import Generator
|
|
37
|
+
from datachain.lib.utils import normalize_col_names
|
|
37
38
|
|
|
38
39
|
if TYPE_CHECKING:
|
|
39
40
|
import pyarrow as pa
|
|
@@ -94,14 +95,18 @@ class HFGenerator(Generator):
|
|
|
94
95
|
ds = self.ds_dict[split]
|
|
95
96
|
if split:
|
|
96
97
|
desc += f" split '{split}'"
|
|
98
|
+
model_fields = self.output_schema._model_fields_by_aliases() # type: ignore[attr-defined]
|
|
97
99
|
with tqdm(desc=desc, unit=" rows", leave=False) as pbar:
|
|
98
100
|
for row in ds:
|
|
99
101
|
output_dict = {}
|
|
100
102
|
if split and "split" in self.output_schema.model_fields:
|
|
101
103
|
output_dict["split"] = split
|
|
102
104
|
for name, feat in ds.features.items():
|
|
103
|
-
|
|
104
|
-
|
|
105
|
+
normalized_name, info = model_fields[name]
|
|
106
|
+
anno = info.annotation
|
|
107
|
+
output_dict[normalized_name] = convert_feature(
|
|
108
|
+
row[name], feat, anno
|
|
109
|
+
)
|
|
105
110
|
yield self.output_schema(**output_dict)
|
|
106
111
|
pbar.update(1)
|
|
107
112
|
|
|
@@ -122,10 +127,12 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
|
|
|
122
127
|
return HFClassLabel(string=feat.names[val], integer=val)
|
|
123
128
|
if isinstance(feat, dict):
|
|
124
129
|
sdict = {}
|
|
130
|
+
model_fields = anno._model_fields_by_aliases() # type: ignore[attr-defined]
|
|
125
131
|
for sname in val:
|
|
126
132
|
sfeat = feat[sname]
|
|
127
|
-
|
|
128
|
-
|
|
133
|
+
norm_name, info = model_fields[sname]
|
|
134
|
+
sanno = info.annotation
|
|
135
|
+
sdict[norm_name] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
|
|
129
136
|
return anno(**sdict)
|
|
130
137
|
if isinstance(feat, Image):
|
|
131
138
|
if isinstance(val, dict):
|
|
@@ -135,12 +142,26 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
|
|
|
135
142
|
return HFAudio(array=val["array"], sampling_rate=val["sampling_rate"])
|
|
136
143
|
|
|
137
144
|
|
|
138
|
-
def get_output_schema(
|
|
139
|
-
|
|
145
|
+
def get_output_schema(
|
|
146
|
+
features: Features, existing_column_names: Optional[list[str]] = None
|
|
147
|
+
) -> tuple[dict[str, DataType], dict[str, str]]:
|
|
148
|
+
"""
|
|
149
|
+
Generate UDF output schema from Hugging Face datasets features. It normalizes the
|
|
150
|
+
column names and returns a mapping of normalized names to original names along with
|
|
151
|
+
the data types. `existing_column_names` is the list of column names that already
|
|
152
|
+
exist in the dataset (to avoid name collisions due to normalization).
|
|
153
|
+
"""
|
|
154
|
+
existing_column_names = existing_column_names or []
|
|
140
155
|
fields_dict = {}
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
156
|
+
normalized_names = normalize_col_names(
|
|
157
|
+
existing_column_names + list(features.keys())
|
|
158
|
+
)
|
|
159
|
+
# List of tuple(str, str) for HF dataset feature names, (normalized, original)
|
|
160
|
+
new_feature_names = list(normalized_names.items())[len(existing_column_names) :]
|
|
161
|
+
for idx, feat in enumerate(features.items()):
|
|
162
|
+
name, val = feat
|
|
163
|
+
fields_dict[new_feature_names[idx][0]] = _feature_to_chain_type(name, val)
|
|
164
|
+
return fields_dict, normalized_names
|
|
144
165
|
|
|
145
166
|
|
|
146
167
|
def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
|
|
@@ -125,7 +125,10 @@ class PytorchDataset(IterableDataset):
|
|
|
125
125
|
ds = read_dataset(
|
|
126
126
|
name=self.name, version=self.version, session=session
|
|
127
127
|
).settings(cache=self.cache, prefetch=self.prefetch)
|
|
128
|
-
|
|
128
|
+
|
|
129
|
+
# remove file signals from dataset
|
|
130
|
+
schema = ds.signals_schema.clone_without_file_signals()
|
|
131
|
+
ds = ds.select(*schema.values.keys())
|
|
129
132
|
|
|
130
133
|
if self.num_samples > 0:
|
|
131
134
|
ds = ds.sample(self.num_samples)
|
|
@@ -610,20 +610,25 @@ class SignalSchema:
|
|
|
610
610
|
return SignalSchema(schema)
|
|
611
611
|
|
|
612
612
|
def _find_in_tree(self, path: list[str]) -> DataType:
|
|
613
|
+
if val := self.tree.get(".".join(path)):
|
|
614
|
+
# If the path is a single string, we can directly access it
|
|
615
|
+
# without traversing the tree.
|
|
616
|
+
return val[0]
|
|
617
|
+
|
|
613
618
|
curr_tree = self.tree
|
|
614
619
|
curr_type = None
|
|
615
620
|
i = 0
|
|
616
621
|
while curr_tree is not None and i < len(path):
|
|
617
622
|
if val := curr_tree.get(path[i]):
|
|
618
623
|
curr_type, curr_tree = val
|
|
619
|
-
elif i == 0 and len(path) > 1 and (val := curr_tree.get(".".join(path))):
|
|
620
|
-
curr_type, curr_tree = val
|
|
621
|
-
break
|
|
622
624
|
else:
|
|
623
625
|
curr_type = None
|
|
626
|
+
break
|
|
624
627
|
i += 1
|
|
625
628
|
|
|
626
|
-
if curr_type is None:
|
|
629
|
+
if curr_type is None or i < len(path):
|
|
630
|
+
# If we reached the end of the path and didn't find a type,
|
|
631
|
+
# or if we didn't traverse the entire path, raise an error.
|
|
627
632
|
raise SignalResolvingError(path, "is not found")
|
|
628
633
|
|
|
629
634
|
return curr_type
|
|
@@ -559,7 +559,13 @@ class UDFStep(Step, ABC):
|
|
|
559
559
|
"""
|
|
560
560
|
Create temporary table with group by partitions.
|
|
561
561
|
"""
|
|
562
|
+
# Check if partition_by is set, we need it to create partitions.
|
|
562
563
|
assert self.partition_by is not None
|
|
564
|
+
# Check if sys__id is in the query, we need it to be able to join
|
|
565
|
+
# the partition table with the udf table later.
|
|
566
|
+
assert any(c.name == "sys__id" for c in query.selected_columns), (
|
|
567
|
+
"Query must have sys__id column to use partitioning."
|
|
568
|
+
)
|
|
563
569
|
|
|
564
570
|
if isinstance(self.partition_by, (list, tuple, GeneratorType)):
|
|
565
571
|
list_partition_by = list(self.partition_by)
|
|
@@ -606,6 +612,22 @@ class UDFStep(Step, ABC):
|
|
|
606
612
|
|
|
607
613
|
# Apply partitioning if needed.
|
|
608
614
|
if self.partition_by is not None:
|
|
615
|
+
if not any(c.name == "sys__id" for c in query.selected_columns):
|
|
616
|
+
# If sys__id is not in the query, we need to create a temp table
|
|
617
|
+
# to hold the query results, so we can join it with the
|
|
618
|
+
# partition table later.
|
|
619
|
+
columns = [
|
|
620
|
+
c if isinstance(c, Column) else Column(c.name, c.type)
|
|
621
|
+
for c in query.subquery().columns
|
|
622
|
+
]
|
|
623
|
+
temp_table = self.catalog.warehouse.create_dataset_rows_table(
|
|
624
|
+
self.catalog.warehouse.temp_table_name(),
|
|
625
|
+
columns=columns,
|
|
626
|
+
)
|
|
627
|
+
temp_tables.append(temp_table.name)
|
|
628
|
+
self.catalog.warehouse.copy_table(temp_table, query)
|
|
629
|
+
_query = query = temp_table.select()
|
|
630
|
+
|
|
609
631
|
partition_tbl = self.create_partitions_table(query)
|
|
610
632
|
temp_tables.append(partition_tbl.name)
|
|
611
633
|
query = query.outerjoin(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.26.0
|
|
3
|
+
Version: 0.26.2
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -98,7 +98,7 @@ Requires-Dist: scipy; extra == "tests"
|
|
|
98
98
|
Requires-Dist: ultralytics; extra == "tests"
|
|
99
99
|
Provides-Extra: dev
|
|
100
100
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
101
|
-
Requires-Dist: mypy==1.
|
|
101
|
+
Requires-Dist: mypy==1.17.0; extra == "dev"
|
|
102
102
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
103
103
|
Requires-Dist: types-pytz; extra == "dev"
|
|
104
104
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -5,7 +5,7 @@ from datachain import func
|
|
|
5
5
|
from tests.utils import skip_if_not_sqlite
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
def test_conditional_and_or(test_session):
|
|
8
|
+
def test_conditional_and_or_not(test_session):
|
|
9
9
|
class Data(dc.DataModel):
|
|
10
10
|
i: int
|
|
11
11
|
f: float
|
|
@@ -25,11 +25,12 @@ def test_conditional_and_or(test_session):
|
|
|
25
25
|
t2=func.and_(dc.C("data.i") > 15, dc.C("data.f") > 2.5),
|
|
26
26
|
t3=func.or_(dc.C("data.i") > 15, dc.C("data.f") > 1.5),
|
|
27
27
|
t4=func.or_(dc.C("data.i") > 15, dc.C("data.f") > 2.5),
|
|
28
|
+
t5=func.not_(dc.C("data.i") > 15),
|
|
28
29
|
)
|
|
29
30
|
.order_by("id")
|
|
30
|
-
).to_list("t1", "t2", "t3", "t4")
|
|
31
|
+
).to_list("t1", "t2", "t3", "t4", "t5")
|
|
31
32
|
|
|
32
|
-
assert ds == [(0, 0, 0, 0), (1, 0, 1, 1), (1, 1, 1, 1)]
|
|
33
|
+
assert ds == [(0, 0, 0, 0, 1), (1, 0, 1, 1, 0), (1, 1, 1, 1, 0)]
|
|
33
34
|
|
|
34
35
|
|
|
35
36
|
def test_conditional_case(test_session):
|
|
@@ -227,7 +227,7 @@ def test_select_missing_column(cloud_test_catalog, animal_dataset):
|
|
|
227
227
|
ds1 = ds.select(C.missing_column_name)
|
|
228
228
|
ds2 = ds.select("missing_column_name")
|
|
229
229
|
# The exception type varies by database backend
|
|
230
|
-
exc1 = pytest.raises(Exception, ds1.db_results)
|
|
230
|
+
exc1 = pytest.raises(Exception, ds1.db_results) # noqa: B017
|
|
231
231
|
assert "missing_column_name" in str(exc1.value)
|
|
232
232
|
exc2 = pytest.raises(KeyError, ds2.db_results)
|
|
233
233
|
assert "missing_column_name" in str(exc2.value)
|
|
@@ -34,10 +34,11 @@ def test_hf_image(tmp_path):
|
|
|
34
34
|
img.save(train_dir / "img1.png")
|
|
35
35
|
|
|
36
36
|
ds = load_dataset("imagefolder", data_dir=tmp_path)
|
|
37
|
-
|
|
37
|
+
hf_schema, norm_names = get_output_schema(ds["train"].features, ["split"])
|
|
38
|
+
schema = {"split": str} | hf_schema
|
|
38
39
|
assert schema["image"] is HFImage
|
|
39
40
|
|
|
40
|
-
gen = HFGenerator(ds, dict_to_data_model("", schema))
|
|
41
|
+
gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
|
|
41
42
|
gen.setup()
|
|
42
43
|
row = next(iter(gen.process("train")))
|
|
43
44
|
assert row.image.img == image_to_bytes(img)
|
|
@@ -56,9 +57,10 @@ def test_hf_audio(tmp_path):
|
|
|
56
57
|
write(train_dir / "example.wav", samplerate, data.astype(np.int16))
|
|
57
58
|
|
|
58
59
|
ds = load_dataset("audiofolder", data_dir=tmp_path)
|
|
59
|
-
|
|
60
|
+
hf_schema, norm_names = get_output_schema(ds["train"].features, ["split"])
|
|
61
|
+
schema = {"split": str} | hf_schema
|
|
60
62
|
|
|
61
|
-
gen = HFGenerator(ds, dict_to_data_model("", schema))
|
|
63
|
+
gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
|
|
62
64
|
gen.setup()
|
|
63
65
|
row = next(iter(gen.process("train")))
|
|
64
66
|
assert np.allclose(row.audio.array, data / amplitude, atol=1e-4)
|