datachain 0.26.1__tar.gz → 0.26.2__tar.gz
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.26.1 → datachain-0.26.2}/PKG-INFO +2 -2
- {datachain-0.26.1 → datachain-0.26.2}/pyproject.toml +1 -1
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/arrow.py +1 -1
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/data_model.py +11 -1
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/hf.py +4 -2
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/hf.py +31 -10
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/PKG-INFO +2 -2
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_hf.py +6 -4
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_hf.py +23 -17
- {datachain-0.26.1 → datachain-0.26.2}/.cruft.json +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.gitattributes +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.github/codecov.yaml +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.github/dependabot.yml +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.github/workflows/release.yml +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.github/workflows/tests.yml +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.gitignore +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/.pre-commit-config.yaml +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/LICENSE +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/README.rst +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/assets/datachain.svg +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/commands/auth/login.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/commands/auth/logout.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/commands/auth/team.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/commands/auth/token.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/commands/index.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/commands/job/cancel.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/commands/job/clusters.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/commands/job/logs.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/commands/job/ls.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/commands/job/run.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/contributing.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/examples.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/guide/db_migrations.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/guide/delta.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/guide/env.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/guide/index.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/guide/namespaces.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/guide/processing.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/guide/remotes.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/guide/retry.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/index.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/overrides/main.html +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/quick-start.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/file.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/index.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/pose.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/segment.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/datachain.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/func.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/index.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/toolkit.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/torch.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/references/udf.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/docs/tutorials.md +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/wds.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/mkdocs.yml +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/noxfile.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/setup.cfg +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/__main__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/asyn.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cache.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/azure.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/gcs.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/hf.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/local.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/s3.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/config.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/dataset.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/delta.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/error.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/fs/reference.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/fs/utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/array.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/base.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/conditional.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/func.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/numeric.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/path.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/random.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/string.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/window.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/job.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/audio.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/clip.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/datachain.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/file.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/image.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/listing.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/projects.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/settings.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/tar.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/text.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/udf.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/video.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/listing.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/bbox.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/pose.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/segment.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/namespace.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/node.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/progress.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/project.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/py.typed +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/batch.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/dataset.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/metrics.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/params.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/queue.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/schema.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/session.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/udf.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/remote/studio.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/script_meta.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/semver.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/types.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/studio.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/telemetry.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain/utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/conftest.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/data.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/examples/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/examples/test_examples.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/examples/wds_data.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/data/lena.jpg +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_array.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_path.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_random.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_string.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_audio.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_batching.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_catalog.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_client.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_data_storage.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_datachain.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_datasets.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_delta.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_file.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_image.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_listing.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_ls.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_metastore.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_metrics.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_pull.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_pytorch.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_query.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_read_database.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_retry.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_session.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_toolkit.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_video.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_warehouse.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/scripts/feature_class.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/test_atomicity.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/test_cli_e2e.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/test_cli_studio.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/test_import_time.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/test_query_e2e.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/test_telemetry.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_audio.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/model/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_asyn.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_cache.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_catalog.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_client.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_config.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_dataset.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_func.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_listing.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_metastore.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_query.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_query_params.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_semver.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_serializer.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_session.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_utils.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.26.1 → datachain-0.26.2}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.26.1
|
|
3
|
+
Version: 0.26.2
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -98,7 +98,7 @@ Requires-Dist: scipy; extra == "tests"
|
|
|
98
98
|
Requires-Dist: ultralytics; extra == "tests"
|
|
99
99
|
Provides-Extra: dev
|
|
100
100
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
101
|
-
Requires-Dist: mypy==1.
|
|
101
|
+
Requires-Dist: mypy==1.17.0; extra == "dev"
|
|
102
102
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
103
103
|
Requires-Dist: types-pytz; extra == "dev"
|
|
104
104
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -262,7 +262,7 @@ def _get_hf_schema(
|
|
|
262
262
|
from datachain.lib.hf import get_output_schema, schema_from_arrow
|
|
263
263
|
|
|
264
264
|
features = schema_from_arrow(schema)
|
|
265
|
-
return features, get_output_schema(features)
|
|
265
|
+
return features, get_output_schema(features)[0]
|
|
266
266
|
return None
|
|
267
267
|
|
|
268
268
|
|
|
@@ -3,6 +3,7 @@ from datetime import datetime
|
|
|
3
3
|
from typing import ClassVar, Optional, Union, get_args, get_origin
|
|
4
4
|
|
|
5
5
|
from pydantic import AliasChoices, BaseModel, Field, create_model
|
|
6
|
+
from pydantic.fields import FieldInfo
|
|
6
7
|
|
|
7
8
|
from datachain.lib.model_store import ModelStore
|
|
8
9
|
from datachain.lib.utils import normalize_col_names
|
|
@@ -89,7 +90,16 @@ def dict_to_data_model(
|
|
|
89
90
|
}
|
|
90
91
|
|
|
91
92
|
class _DataModelStrict(BaseModel, extra="forbid"):
|
|
92
|
-
|
|
93
|
+
@classmethod
|
|
94
|
+
def _model_fields_by_aliases(cls) -> dict[str, tuple[str, FieldInfo]]:
|
|
95
|
+
"""Returns a map of aliases to original field names and info."""
|
|
96
|
+
field_info = {}
|
|
97
|
+
for _name, field in cls.model_fields.items():
|
|
98
|
+
assert isinstance(field.validation_alias, AliasChoices)
|
|
99
|
+
# Add mapping for all aliases (both normalized and original names)
|
|
100
|
+
for alias in field.validation_alias.choices:
|
|
101
|
+
field_info[str(alias)] = (_name, field)
|
|
102
|
+
return field_info
|
|
93
103
|
|
|
94
104
|
return create_model(
|
|
95
105
|
name,
|
|
@@ -32,6 +32,7 @@ def read_hf(
|
|
|
32
32
|
Parameters:
|
|
33
33
|
dataset : Path or name of the dataset to read from Hugging Face Hub,
|
|
34
34
|
or an instance of `datasets.Dataset`-like object.
|
|
35
|
+
args : Additional positional arguments to pass to datasets.load_dataset.
|
|
35
36
|
session : Session to use for the chain.
|
|
36
37
|
settings : Settings to use for the chain.
|
|
37
38
|
column : Generated object column name.
|
|
@@ -64,8 +65,9 @@ def read_hf(
|
|
|
64
65
|
|
|
65
66
|
model_name = model_name or column or ""
|
|
66
67
|
hf_features = next(iter(ds_dict.values())).features
|
|
67
|
-
|
|
68
|
-
|
|
68
|
+
hf_output, normalized_names = get_output_schema(hf_features, list(output.keys()))
|
|
69
|
+
output = output | hf_output
|
|
70
|
+
model = dict_to_data_model(model_name, output, list(normalized_names.values()))
|
|
69
71
|
if column:
|
|
70
72
|
output = {column: model}
|
|
71
73
|
|
|
@@ -26,7 +26,7 @@ except ImportError as exc:
|
|
|
26
26
|
) from exc
|
|
27
27
|
|
|
28
28
|
from io import BytesIO
|
|
29
|
-
from typing import TYPE_CHECKING, Any, Union
|
|
29
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
30
30
|
|
|
31
31
|
import PIL
|
|
32
32
|
from tqdm.auto import tqdm
|
|
@@ -34,6 +34,7 @@ from tqdm.auto import tqdm
|
|
|
34
34
|
from datachain.lib.arrow import arrow_type_mapper
|
|
35
35
|
from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
|
|
36
36
|
from datachain.lib.udf import Generator
|
|
37
|
+
from datachain.lib.utils import normalize_col_names
|
|
37
38
|
|
|
38
39
|
if TYPE_CHECKING:
|
|
39
40
|
import pyarrow as pa
|
|
@@ -94,14 +95,18 @@ class HFGenerator(Generator):
|
|
|
94
95
|
ds = self.ds_dict[split]
|
|
95
96
|
if split:
|
|
96
97
|
desc += f" split '{split}'"
|
|
98
|
+
model_fields = self.output_schema._model_fields_by_aliases() # type: ignore[attr-defined]
|
|
97
99
|
with tqdm(desc=desc, unit=" rows", leave=False) as pbar:
|
|
98
100
|
for row in ds:
|
|
99
101
|
output_dict = {}
|
|
100
102
|
if split and "split" in self.output_schema.model_fields:
|
|
101
103
|
output_dict["split"] = split
|
|
102
104
|
for name, feat in ds.features.items():
|
|
103
|
-
|
|
104
|
-
|
|
105
|
+
normalized_name, info = model_fields[name]
|
|
106
|
+
anno = info.annotation
|
|
107
|
+
output_dict[normalized_name] = convert_feature(
|
|
108
|
+
row[name], feat, anno
|
|
109
|
+
)
|
|
105
110
|
yield self.output_schema(**output_dict)
|
|
106
111
|
pbar.update(1)
|
|
107
112
|
|
|
@@ -122,10 +127,12 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
|
|
|
122
127
|
return HFClassLabel(string=feat.names[val], integer=val)
|
|
123
128
|
if isinstance(feat, dict):
|
|
124
129
|
sdict = {}
|
|
130
|
+
model_fields = anno._model_fields_by_aliases() # type: ignore[attr-defined]
|
|
125
131
|
for sname in val:
|
|
126
132
|
sfeat = feat[sname]
|
|
127
|
-
|
|
128
|
-
|
|
133
|
+
norm_name, info = model_fields[sname]
|
|
134
|
+
sanno = info.annotation
|
|
135
|
+
sdict[norm_name] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
|
|
129
136
|
return anno(**sdict)
|
|
130
137
|
if isinstance(feat, Image):
|
|
131
138
|
if isinstance(val, dict):
|
|
@@ -135,12 +142,26 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
|
|
|
135
142
|
return HFAudio(array=val["array"], sampling_rate=val["sampling_rate"])
|
|
136
143
|
|
|
137
144
|
|
|
138
|
-
def get_output_schema(
|
|
139
|
-
|
|
145
|
+
def get_output_schema(
|
|
146
|
+
features: Features, existing_column_names: Optional[list[str]] = None
|
|
147
|
+
) -> tuple[dict[str, DataType], dict[str, str]]:
|
|
148
|
+
"""
|
|
149
|
+
Generate UDF output schema from Hugging Face datasets features. It normalizes the
|
|
150
|
+
column names and returns a mapping of normalized names to original names along with
|
|
151
|
+
the data types. `existing_column_names` is the list of column names that already
|
|
152
|
+
exist in the dataset (to avoid name collisions due to normalization).
|
|
153
|
+
"""
|
|
154
|
+
existing_column_names = existing_column_names or []
|
|
140
155
|
fields_dict = {}
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
156
|
+
normalized_names = normalize_col_names(
|
|
157
|
+
existing_column_names + list(features.keys())
|
|
158
|
+
)
|
|
159
|
+
# List of tuple(str, str) for HF dataset feature names, (normalized, original)
|
|
160
|
+
new_feature_names = list(normalized_names.items())[len(existing_column_names) :]
|
|
161
|
+
for idx, feat in enumerate(features.items()):
|
|
162
|
+
name, val = feat
|
|
163
|
+
fields_dict[new_feature_names[idx][0]] = _feature_to_chain_type(name, val)
|
|
164
|
+
return fields_dict, normalized_names
|
|
144
165
|
|
|
145
166
|
|
|
146
167
|
def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.26.1
|
|
3
|
+
Version: 0.26.2
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -98,7 +98,7 @@ Requires-Dist: scipy; extra == "tests"
|
|
|
98
98
|
Requires-Dist: ultralytics; extra == "tests"
|
|
99
99
|
Provides-Extra: dev
|
|
100
100
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
101
|
-
Requires-Dist: mypy==1.
|
|
101
|
+
Requires-Dist: mypy==1.17.0; extra == "dev"
|
|
102
102
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
103
103
|
Requires-Dist: types-pytz; extra == "dev"
|
|
104
104
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -34,10 +34,11 @@ def test_hf_image(tmp_path):
|
|
|
34
34
|
img.save(train_dir / "img1.png")
|
|
35
35
|
|
|
36
36
|
ds = load_dataset("imagefolder", data_dir=tmp_path)
|
|
37
|
-
|
|
37
|
+
hf_schema, norm_names = get_output_schema(ds["train"].features, ["split"])
|
|
38
|
+
schema = {"split": str} | hf_schema
|
|
38
39
|
assert schema["image"] is HFImage
|
|
39
40
|
|
|
40
|
-
gen = HFGenerator(ds, dict_to_data_model("", schema))
|
|
41
|
+
gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
|
|
41
42
|
gen.setup()
|
|
42
43
|
row = next(iter(gen.process("train")))
|
|
43
44
|
assert row.image.img == image_to_bytes(img)
|
|
@@ -56,9 +57,10 @@ def test_hf_audio(tmp_path):
|
|
|
56
57
|
write(train_dir / "example.wav", samplerate, data.astype(np.int16))
|
|
57
58
|
|
|
58
59
|
ds = load_dataset("audiofolder", data_dir=tmp_path)
|
|
59
|
-
|
|
60
|
+
hf_schema, norm_names = get_output_schema(ds["train"].features, ["split"])
|
|
61
|
+
schema = {"split": str} | hf_schema
|
|
60
62
|
|
|
61
|
-
gen = HFGenerator(ds, dict_to_data_model("", schema))
|
|
63
|
+
gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
|
|
62
64
|
gen.setup()
|
|
63
65
|
row = next(iter(gen.process("train")))
|
|
64
66
|
assert np.allclose(row.audio.array, data / amplitude, atol=1e-4)
|
|
@@ -11,37 +11,41 @@ from datachain.lib.hf import (
|
|
|
11
11
|
|
|
12
12
|
def test_hf():
|
|
13
13
|
ds = Dataset.from_dict({"pokemon": ["bulbasaur", "squirtle"]})
|
|
14
|
-
schema = get_output_schema(ds.features)
|
|
14
|
+
schema, norm_names = get_output_schema(ds.features)
|
|
15
15
|
assert schema["pokemon"] is str
|
|
16
16
|
|
|
17
|
-
gen = HFGenerator(ds, dict_to_data_model("", schema))
|
|
17
|
+
gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
|
|
18
18
|
gen.setup()
|
|
19
19
|
row = next(iter(gen.process()))
|
|
20
20
|
assert row.pokemon == "bulbasaur"
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
def test_hf_split():
|
|
24
|
-
|
|
25
|
-
|
|
24
|
+
# Space in the column name should be normalized
|
|
25
|
+
ds_train = Dataset.from_dict({"pok emon": ["bulbasaur", "squirtle"]})
|
|
26
|
+
ds_test = Dataset.from_dict({"pok emon": ["charizard", "pikachu"]})
|
|
26
27
|
ds_dict = DatasetDict({"train": ds_train, "test": ds_test})
|
|
27
28
|
ds_dict = stream_splits(ds_dict)
|
|
28
|
-
|
|
29
|
+
hf_schema, norm_names = get_output_schema(ds_dict["train"].features, ["split"])
|
|
30
|
+
schema = {"split": str} | hf_schema
|
|
29
31
|
|
|
30
|
-
gen = HFGenerator(
|
|
32
|
+
gen = HFGenerator(
|
|
33
|
+
ds_dict, dict_to_data_model("", schema, list(norm_names.values()))
|
|
34
|
+
)
|
|
31
35
|
gen.setup()
|
|
32
36
|
row = next(iter(gen.process("train")))
|
|
33
37
|
|
|
34
38
|
assert row.split == "train"
|
|
35
|
-
assert row.
|
|
39
|
+
assert row.pok_emon == "bulbasaur"
|
|
36
40
|
|
|
37
41
|
|
|
38
42
|
def test_hf_class_label():
|
|
39
43
|
ds = Dataset.from_dict({"pokemon": ["bulbasaur", "squirtle"]})
|
|
40
44
|
ds = ds.class_encode_column("pokemon")
|
|
41
|
-
schema = get_output_schema(ds.features)
|
|
45
|
+
schema, norm_names = get_output_schema(ds.features)
|
|
42
46
|
assert schema["pokemon"] is HFClassLabel
|
|
43
47
|
|
|
44
|
-
gen = HFGenerator(ds, dict_to_data_model("", schema))
|
|
48
|
+
gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
|
|
45
49
|
gen.setup()
|
|
46
50
|
row = next(iter(gen.process()))
|
|
47
51
|
assert row.pokemon.string == "bulbasaur"
|
|
@@ -50,26 +54,28 @@ def test_hf_class_label():
|
|
|
50
54
|
|
|
51
55
|
def test_hf_sequence_list():
|
|
52
56
|
ds = Dataset.from_dict({"seq": [[0, 1], [2, 3]]})
|
|
53
|
-
schema = get_output_schema(ds.features)
|
|
57
|
+
schema, norm_names = get_output_schema(ds.features)
|
|
54
58
|
assert schema["seq"] == list[int]
|
|
55
59
|
|
|
56
|
-
gen = HFGenerator(ds, dict_to_data_model("", schema))
|
|
60
|
+
gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
|
|
57
61
|
gen.setup()
|
|
58
62
|
row = next(iter(gen.process()))
|
|
59
63
|
assert row.seq == [0, 1]
|
|
60
64
|
|
|
61
65
|
|
|
62
66
|
def test_hf_sequence_dict():
|
|
67
|
+
# ? in the column name should be normalized
|
|
68
|
+
# Check if even nested names are not normalized we handle it correctly
|
|
63
69
|
ds = Dataset.from_dict(
|
|
64
|
-
{"pokemon": [{"name": ["bulbasaur"]}, {"name": ["squirtle"]}]}
|
|
70
|
+
{"pokemon": [{"name?": ["bulbasaur"]}, {"name?": ["squirtle"]}]}
|
|
65
71
|
)
|
|
66
72
|
new_features = ds.features.copy()
|
|
67
|
-
new_features["pokemon"] = Sequence(feature={"name": Value(dtype="string")})
|
|
73
|
+
new_features["pokemon"] = Sequence(feature={"name?": Value(dtype="string")})
|
|
68
74
|
ds = ds.cast(new_features)
|
|
69
|
-
schema = get_output_schema(ds.features)
|
|
75
|
+
schema, norm_names = get_output_schema(ds.features)
|
|
70
76
|
assert schema["pokemon"].model_fields["name"].annotation == list[str]
|
|
71
77
|
|
|
72
|
-
gen = HFGenerator(ds, dict_to_data_model("", schema))
|
|
78
|
+
gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
|
|
73
79
|
gen.setup()
|
|
74
80
|
row = next(iter(gen.process()))
|
|
75
81
|
assert row.pokemon.name == ["bulbasaur"]
|
|
@@ -80,10 +86,10 @@ def test_hf_array():
|
|
|
80
86
|
new_features = ds.features.copy()
|
|
81
87
|
new_features["arr"] = Array2D(shape=(2, 2), dtype="int32")
|
|
82
88
|
ds = ds.cast(new_features)
|
|
83
|
-
schema = get_output_schema(ds.features)
|
|
89
|
+
schema, norm_names = get_output_schema(ds.features)
|
|
84
90
|
assert schema["arr"] == list[list[int]]
|
|
85
91
|
|
|
86
|
-
gen = HFGenerator(ds, dict_to_data_model("", schema))
|
|
92
|
+
gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
|
|
87
93
|
gen.setup()
|
|
88
94
|
row = next(iter(gen.process()))
|
|
89
95
|
assert row.arr == [[0, 1], [2, 3]]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|