datachain 0.28.0.tar.gz → 0.28.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.28.0 → datachain-0.28.1}/.pre-commit-config.yaml +1 -1
- {datachain-0.28.0 → datachain-0.28.1}/PKG-INFO +1 -1
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/datachain.py +9 -4
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/file.py +53 -1
- datachain-0.28.1/src/datachain/lib/utils.py +155 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/PKG-INFO +1 -1
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_datachain.py +17 -6
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_file.py +47 -1
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_utils.py +70 -1
- datachain-0.28.0/src/datachain/lib/utils.py +0 -59
- {datachain-0.28.0 → datachain-0.28.1}/.cruft.json +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.gitattributes +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.github/codecov.yaml +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.github/dependabot.yml +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.github/workflows/release.yml +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.github/workflows/tests.yml +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/.gitignore +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/LICENSE +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/README.rst +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/commands/auth/login.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/commands/auth/logout.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/commands/auth/team.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/commands/auth/token.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/commands/index.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/commands/job/cancel.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/commands/job/clusters.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/commands/job/logs.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/commands/job/ls.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/commands/job/run.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/contributing.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/examples.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/guide/db_migrations.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/guide/delta.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/guide/env.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/guide/index.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/guide/namespaces.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/guide/processing.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/guide/remotes.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/guide/retry.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/index.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/overrides/main.html +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/quick-start.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/file.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/index.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/pose.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/segment.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/datachain.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/func.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/index.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/toolkit.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/torch.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/references/udf.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/docs/tutorials.md +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/mkdocs.yml +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/noxfile.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/pyproject.toml +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/setup.cfg +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/__main__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/asyn.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cache.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/local.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/config.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/dataset.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/delta.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/error.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/fs/reference.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/fs/utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/array.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/base.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/conditional.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/func.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/numeric.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/path.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/random.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/string.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/window.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/job.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/audio.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/listing.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/projects.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/udf.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/video.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/listing.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/bbox.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/pose.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/segment.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/namespace.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/node.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/progress.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/project.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/py.typed +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/batch.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/dataset.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/params.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/schema.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/session.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/udf.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/script_meta.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/semver.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/studio.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain/utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/conftest.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/data.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/examples/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/data/lena.jpg +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_array.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_path.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_random.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_string.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_audio.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_batching.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_client.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_data_storage.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_datasets.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_delta.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_file.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_hf.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_image.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_listing.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_ls.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_metastore.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_pull.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_query.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_read_database.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_retry.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_session.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_studio_datetime_parsing.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_toolkit.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_video.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_warehouse.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/test_atomicity.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/test_cli_studio.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/test_import_time.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/test_telemetry.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_audio.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/model/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_client.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_config.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_func.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_query.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_semver.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_session.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.28.0 → datachain-0.28.1}/tests/utils.py +0 -0
|
@@ -2419,9 +2419,11 @@ class DataChain:
|
|
|
2419
2419
|
ds.to_storage("gs://mybucket", placement="filename")
|
|
2420
2420
|
```
|
|
2421
2421
|
"""
|
|
2422
|
+
chain = self.persist()
|
|
2423
|
+
count = chain.count()
|
|
2424
|
+
|
|
2422
2425
|
if placement == "filename" and (
|
|
2423
|
-
|
|
2424
|
-
!= self._query.count()
|
|
2426
|
+
chain._query.distinct(pathfunc.name(C(f"{signal}__path"))).count() != count
|
|
2425
2427
|
):
|
|
2426
2428
|
raise ValueError("Files with the same name found")
|
|
2427
2429
|
|
|
@@ -2433,7 +2435,7 @@ class DataChain:
|
|
|
2433
2435
|
unit=" files",
|
|
2434
2436
|
unit_scale=True,
|
|
2435
2437
|
unit_divisor=10,
|
|
2436
|
-
total=
|
|
2438
|
+
total=count,
|
|
2437
2439
|
leave=False,
|
|
2438
2440
|
)
|
|
2439
2441
|
file_exporter = FileExporter(
|
|
@@ -2444,7 +2446,10 @@ class DataChain:
|
|
|
2444
2446
|
max_threads=num_threads or 1,
|
|
2445
2447
|
client_config=client_config,
|
|
2446
2448
|
)
|
|
2447
|
-
file_exporter.run(
|
|
2449
|
+
file_exporter.run(
|
|
2450
|
+
(rows[0] for rows in chain.to_iter(signal)),
|
|
2451
|
+
progress_bar,
|
|
2452
|
+
)
|
|
2448
2453
|
|
|
2449
2454
|
def shuffle(self) -> "Self":
|
|
2450
2455
|
"""Shuffle the rows of the chain deterministically."""
|
|
@@ -23,7 +23,7 @@ from pydantic import Field, field_validator
|
|
|
23
23
|
|
|
24
24
|
from datachain.client.fileslice import FileSlice
|
|
25
25
|
from datachain.lib.data_model import DataModel
|
|
26
|
-
from datachain.lib.utils import DataChainError
|
|
26
|
+
from datachain.lib.utils import DataChainError, rebase_path
|
|
27
27
|
from datachain.nodes_thread_pool import NodesThreadPool
|
|
28
28
|
from datachain.sql.types import JSON, Boolean, DateTime, Int, String
|
|
29
29
|
from datachain.utils import TIME_ZERO
|
|
@@ -634,6 +634,40 @@ class File(DataModel):
|
|
|
634
634
|
location=self.location,
|
|
635
635
|
)
|
|
636
636
|
|
|
637
|
+
def rebase(
    self,
    old_base: str,
    new_base: str,
    suffix: str = "",
    extension: str = "",
) -> str:
    """
    Rebase the file's URI from one base directory to another.

    Thin wrapper that passes `self.get_uri()` and the arguments straight to
    `rebase_path`.

    Args:
        old_base: Base directory to remove from the file's URI
        new_base: New base directory to prepend
        suffix: Optional suffix to add before file extension
        extension: Optional new file extension (without dot)

    Returns:
        str: Rebased URI with new base directory

    Raises:
        ValueError: If old_base is not found in the file's URI

    Examples:
        >>> file = File(source="s3://bucket", path="data/2025-05-27/file.wav")
        >>> file.rebase("s3://bucket/data", "s3://output-bucket/processed",
        ...             extension="mp3")
        's3://output-bucket/processed/2025-05-27/file.mp3'

        >>> file.rebase("data/2025-05-27", "/local/output", suffix="_ch1",
        ...             extension="npy")
        '/local/output/file_ch1.npy'
    """
    return rebase_path(self.get_uri(), old_base, new_base, suffix, extension)
|
|
670
|
+
|
|
637
671
|
|
|
638
672
|
def resolve(file: File) -> File:
|
|
639
673
|
"""
|
|
@@ -1219,6 +1253,24 @@ class Audio(DataModel):
|
|
|
1219
1253
|
codec: str = Field(default="")
|
|
1220
1254
|
bit_rate: int = Field(default=-1)
|
|
1221
1255
|
|
|
1256
|
+
@staticmethod
|
|
1257
|
+
def get_channel_name(num_channels: int, channel_idx: int) -> str:
|
|
1258
|
+
"""Map channel index to meaningful name based on common audio formats"""
|
|
1259
|
+
channel_mappings = {
|
|
1260
|
+
1: ["Mono"],
|
|
1261
|
+
2: ["Left", "Right"],
|
|
1262
|
+
4: ["W", "X", "Y", "Z"], # First-order Ambisonics
|
|
1263
|
+
6: ["FL", "FR", "FC", "LFE", "BL", "BR"], # 5.1 surround
|
|
1264
|
+
8: ["FL", "FR", "FC", "LFE", "BL", "BR", "SL", "SR"], # 7.1 surround
|
|
1265
|
+
}
|
|
1266
|
+
|
|
1267
|
+
if num_channels in channel_mappings:
|
|
1268
|
+
channels = channel_mappings[num_channels]
|
|
1269
|
+
if 0 <= channel_idx < len(channels):
|
|
1270
|
+
return channels[channel_idx]
|
|
1271
|
+
|
|
1272
|
+
return f"Ch{channel_idx + 1}"
|
|
1273
|
+
|
|
1222
1274
|
|
|
1223
1275
|
class ArrowRow(DataModel):
|
|
1224
1276
|
"""`DataModel` for reading row from Arrow-supported file."""
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from pathlib import PurePosixPath
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AbstractUDF(ABC):
    """Abstract interface declaring the three hooks a UDF must implement.

    None of the hooks has a default implementation; a subclass can only be
    instantiated once it overrides `setup`, `process` and `teardown`.
    """

    @abstractmethod
    def setup(self):
        """Hook without a default body; must be overridden."""

    @abstractmethod
    def process(self, *args, **kwargs):
        """Hook accepting arbitrary arguments; must be overridden."""

    @abstractmethod
    def teardown(self):
        """Hook without a default body; must be overridden."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DataChainError(Exception):
    """Root of the exception hierarchy defined in this module."""


class DataChainParamsError(DataChainError):
    """`DataChainError` subtype; parent of `DataChainColumnError`."""


class DataChainColumnError(DataChainParamsError):
    """Parameter error whose message is prefixed with the offending column."""

    def __init__(self, col_name: str, msg: str):
        """Build the message ``Error for column <col_name>: <msg>``."""
        message = f"Error for column {col_name}: {msg}"
        super().__init__(message)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
    """Map each column name to a unique, identifier-safe normalized name.

    Every name is lower-cased, runs of characters outside ``[0-9a-z]`` are
    collapsed to a single underscore, and leading/trailing underscores are
    stripped. If the result is not a valid identifier, is already taken, or
    would shadow a *different* original column, a ``c<N>`` prefix is used
    instead (``c<N>_<name>``, or just ``c<N>`` when nothing is left of the
    name). ``N`` comes from one counter shared across all columns.

    Args:
        col_names: Original column names, processed in order.

    Returns:
        Dict of normalized_name -> original_name.
    """
    originals = set(col_names)
    normalized: dict[str, str] = {}
    next_index = 0

    def is_unusable(candidate: str, source: str) -> bool:
        if not candidate.isidentifier():
            return True
        if candidate in normalized:
            return True
        # A name that equals some other original column would be ambiguous.
        return candidate != source and candidate in originals

    for source in col_names:
        base = re.sub("[^0-9a-z]+", "_", source.lower()).strip("_")

        candidate = base
        while is_unusable(candidate, source):
            prefix = f"c{next_index}"
            candidate = f"{prefix}_{base}" if base else prefix
            next_index += 1

        normalized[candidate] = source

    return normalized
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _split_scheme(location: str) -> tuple[str, str]:
    """Split *location* into ``(scheme, normalized path without scheme)``.

    For ``scheme://netloc/path`` the netloc (e.g. the bucket) is kept as the
    leading path component. Locations without a scheme are used as given.
    """
    parsed = urlparse(location)
    bare = parsed.netloc + parsed.path if parsed.scheme else location
    return parsed.scheme, PurePosixPath(bare).as_posix()


def _find_base(path: str, base: str) -> int:
    """Return the index of the first occurrence of *base* in *path* that is
    aligned to path-component boundaries, or -1 if there is none."""
    start = path.find(base)
    while start != -1:
        end = start + len(base)
        starts_clean = start == 0 or base.startswith("/") or path[start - 1] == "/"
        ends_clean = end == len(path) or base.endswith("/") or path[end] == "/"
        if starts_clean and ends_clean:
            return start
        # Mid-component hit (e.g. "data" inside "mydata"): keep searching.
        start = path.find(base, start + 1)
    return -1


def rebase_path(
    src_path: str,
    old_base: str,
    new_base: str,
    suffix: str = "",
    extension: str = "",
) -> str:
    """
    Rebase a file path from one base directory to another.

    `old_base` may appear anywhere in `src_path`, but only as whole path
    components: ``data/audio`` matches ``/home/data/audio/f.wav`` and does
    not match ``/mydata/audio2/f.wav``. Everything up to and including the
    first such occurrence is dropped and the remainder is appended to
    `new_base`.

    Args:
        src_path: Source file path (can include URI scheme like s3://)
        old_base: Base directory to remove from src_path
        new_base: New base directory to prepend
        suffix: Optional suffix to add before file extension
        extension: Optional new file extension (a leading dot is tolerated)

    Returns:
        str: Rebased path with new base directory. The scheme of `new_base`,
        if any, is preserved; the scheme of `src_path` is discarded.

    Raises:
        ValueError: If old_base is not found in src_path
    """
    _, src_norm = _split_scheme(src_path)
    _, old_norm = _split_scheme(old_base)

    idx = _find_base(src_norm, old_norm)
    if idx == -1:
        raise ValueError(f"old_base '{old_base}' not found in src_path")

    # Everything after old_base, without the separating slash.
    relative = PurePosixPath(src_norm[idx + len(old_norm) :].lstrip("/"))

    # Apply suffix and extension changes to the file name only.
    ext = extension.lstrip(".")
    new_ext = f".{ext}" if ext else relative.suffix
    new_name = f"{relative.stem}{suffix}{new_ext}"
    new_relative = relative.parent / new_name

    new_scheme, new_base_norm = _split_scheme(new_base)
    rebased = str(PurePosixPath(new_base_norm) / new_relative)
    return f"{new_scheme}://{rebased}" if new_scheme else rebased
|
|
@@ -9,7 +9,7 @@ import uuid
|
|
|
9
9
|
from collections.abc import Iterator
|
|
10
10
|
from datetime import datetime, timedelta, timezone
|
|
11
11
|
from pathlib import Path, PurePosixPath
|
|
12
|
-
from unittest.mock import patch
|
|
12
|
+
from unittest.mock import Mock, patch
|
|
13
13
|
|
|
14
14
|
import numpy as np
|
|
15
15
|
import pandas as pd
|
|
@@ -358,15 +358,24 @@ def test_to_storage(
|
|
|
358
358
|
file_type,
|
|
359
359
|
num_threads,
|
|
360
360
|
):
|
|
361
|
+
mapper = Mock(side_effect=lambda file_path: len(file_path))
|
|
362
|
+
|
|
361
363
|
ctc = cloud_test_catalog
|
|
362
364
|
df = dc.read_storage(ctc.src_uri, type=file_type, session=test_session)
|
|
363
365
|
if use_map:
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
366
|
+
(
|
|
367
|
+
df.settings(cache=use_cache)
|
|
368
|
+
.map(mapper, params=["file.path"], output={"path_len": int})
|
|
369
|
+
.map(res=lambda file: file.export(tmp_dir / "output", placement=placement))
|
|
370
|
+
.exec()
|
|
371
|
+
)
|
|
367
372
|
else:
|
|
368
|
-
|
|
369
|
-
|
|
373
|
+
(
|
|
374
|
+
df.settings(cache=use_cache)
|
|
375
|
+
.map(mapper, params=["file.path"], output={"path_len": int})
|
|
376
|
+
.to_storage(
|
|
377
|
+
tmp_dir / "output", placement=placement, num_threads=num_threads
|
|
378
|
+
)
|
|
370
379
|
)
|
|
371
380
|
|
|
372
381
|
expected = {
|
|
@@ -387,6 +396,8 @@ def test_to_storage(
|
|
|
387
396
|
with open(tmp_dir / "output" / file_path) as f:
|
|
388
397
|
assert f.read() == expected[file.name]
|
|
389
398
|
|
|
399
|
+
assert mapper.call_count == len(expected)
|
|
400
|
+
|
|
390
401
|
|
|
391
402
|
@pytest.mark.parametrize("use_cache", [True, False])
|
|
392
403
|
def test_export_images_files(test_session, tmp_dir, tmp_path, use_cache):
|
|
@@ -7,7 +7,7 @@ from fsspec.implementations.local import LocalFileSystem
|
|
|
7
7
|
from PIL import Image
|
|
8
8
|
|
|
9
9
|
from datachain.catalog import Catalog
|
|
10
|
-
from datachain.lib.file import File, FileError, ImageFile, TextFile, resolve
|
|
10
|
+
from datachain.lib.file import Audio, File, FileError, ImageFile, TextFile, resolve
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def create_file(source: str):
|
|
@@ -409,3 +409,49 @@ def test_path_normalized(path, expected, raises):
|
|
|
409
409
|
file.get_path_normalized()
|
|
410
410
|
else:
|
|
411
411
|
assert file.get_path_normalized() == expected
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def test_file_rebase_method():
|
|
415
|
+
"""Test File.rebase() method"""
|
|
416
|
+
file = File(source="s3://bucket", path="data/audio/file.wav")
|
|
417
|
+
|
|
418
|
+
# Basic rebase
|
|
419
|
+
result = file.rebase("s3://bucket/data/audio", "s3://output-bucket/waveforms")
|
|
420
|
+
assert result == "s3://output-bucket/waveforms/file.wav"
|
|
421
|
+
|
|
422
|
+
# With suffix and extension
|
|
423
|
+
result = file.rebase(
|
|
424
|
+
"s3://bucket/data/audio",
|
|
425
|
+
"s3://output-bucket/processed",
|
|
426
|
+
suffix="_ch1",
|
|
427
|
+
extension="npy",
|
|
428
|
+
)
|
|
429
|
+
assert result == "s3://output-bucket/processed/file_ch1.npy"
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def test_file_rebase_local_path():
|
|
433
|
+
"""Test File.rebase() with local file paths"""
|
|
434
|
+
file = File(source="file://", path="/data/audio/folder/file.mp3")
|
|
435
|
+
|
|
436
|
+
result = file.rebase("file:///data/audio", "/output/processed")
|
|
437
|
+
assert result == "/output/processed/folder/file.mp3"
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def test_audio_get_channel_name():
|
|
441
|
+
# Test known channel configurations
|
|
442
|
+
assert Audio.get_channel_name(1, 0) == "Mono"
|
|
443
|
+
assert Audio.get_channel_name(2, 0) == "Left"
|
|
444
|
+
assert Audio.get_channel_name(2, 1) == "Right"
|
|
445
|
+
assert Audio.get_channel_name(4, 2) == "Y" # Ambisonics
|
|
446
|
+
assert Audio.get_channel_name(6, 3) == "LFE" # 5.1 surround
|
|
447
|
+
assert Audio.get_channel_name(8, 7) == "SR" # 7.1 surround
|
|
448
|
+
|
|
449
|
+
# Test fallback for unknown configurations
|
|
450
|
+
assert Audio.get_channel_name(-1, 0) == "Ch1"
|
|
451
|
+
assert Audio.get_channel_name(3, 0) == "Ch1"
|
|
452
|
+
assert Audio.get_channel_name(5, 4) == "Ch5"
|
|
453
|
+
assert Audio.get_channel_name(10, 9) == "Ch10"
|
|
454
|
+
|
|
455
|
+
# Test out of range indices
|
|
456
|
+
assert Audio.get_channel_name(2, 5) == "Ch6"
|
|
457
|
+
assert Audio.get_channel_name(1, 1) == "Ch2"
|
|
@@ -5,7 +5,7 @@ import pytest
|
|
|
5
5
|
from pydantic import BaseModel
|
|
6
6
|
|
|
7
7
|
from datachain.lib.convert.python_to_sql import python_to_sql
|
|
8
|
-
from datachain.lib.utils import normalize_col_names
|
|
8
|
+
from datachain.lib.utils import normalize_col_names, rebase_path
|
|
9
9
|
from datachain.sql.types import Array, String
|
|
10
10
|
|
|
11
11
|
|
|
@@ -110,3 +110,72 @@ def test_normalize_column_names_repeat_generated_after_normalize():
|
|
|
110
110
|
res = normalize_col_names(["c0_CoLuMn", "_column", "column"])
|
|
111
111
|
|
|
112
112
|
assert res == {"c0_column": "c0_CoLuMn", "c1_column": "_column", "column": "column"}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def test_rebase_path_basic():
|
|
116
|
+
result = rebase_path(
|
|
117
|
+
"/data/audio/folder1/file.wav", "/data/audio", "/output/waveforms"
|
|
118
|
+
)
|
|
119
|
+
assert result == "/output/waveforms/folder1/file.wav"
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def test_rebase_path_with_s3_uri():
|
|
123
|
+
result = rebase_path(
|
|
124
|
+
"s3://bucket/data/audio/folder/file.wav",
|
|
125
|
+
"data/audio",
|
|
126
|
+
"s3://output-bucket/waveforms",
|
|
127
|
+
)
|
|
128
|
+
assert result == "s3://output-bucket/waveforms/folder/file.wav"
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def test_rebase_path_mixed_uri_schemes():
|
|
132
|
+
result = rebase_path(
|
|
133
|
+
"/local/data/audio/file.mp3", "/local/data/audio", "s3://bucket/output"
|
|
134
|
+
)
|
|
135
|
+
assert result == "s3://bucket/output/file.mp3"
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def test_rebase_path_with_suffix():
|
|
139
|
+
result = rebase_path(
|
|
140
|
+
"/data/audio/file.wav", "/data/audio", "/output", suffix="_processed"
|
|
141
|
+
)
|
|
142
|
+
assert result == "/output/file_processed.wav"
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def test_rebase_path_with_extension_change():
|
|
146
|
+
result = rebase_path("/data/audio/file.wav", "audio", "/output", extension="npy")
|
|
147
|
+
assert result == "/output/file.npy"
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def test_rebase_path_base_dir_not_in_path():
|
|
151
|
+
with pytest.raises(
|
|
152
|
+
ValueError, match="old_base '/data/audio' not found in src_path"
|
|
153
|
+
):
|
|
154
|
+
rebase_path("/different/path/file.wav", "/data/audio", "/output")
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_rebase_path_partial_match_base_dir():
|
|
158
|
+
result = rebase_path("/home/user/data/audio/file.wav", "data/audio", "/output")
|
|
159
|
+
assert result == "/output/file.wav"
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def test_rebase_path_complex_s3_paths():
|
|
163
|
+
result = rebase_path(
|
|
164
|
+
"s3://bucket/balanced_train_segments/audio/folder/file.flac",
|
|
165
|
+
"s3://bucket/balanced_train_segments",
|
|
166
|
+
"s3://output-bucket/waveforms",
|
|
167
|
+
suffix="_ch1",
|
|
168
|
+
extension="npy",
|
|
169
|
+
)
|
|
170
|
+
assert result == "s3://output-bucket/waveforms/audio/folder/file_ch1.npy"
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def test_rebase_path_file_without_extension():
|
|
174
|
+
result = rebase_path("/data/audio/file_no_ext", "/data/audio", "/output")
|
|
175
|
+
assert result == "/output/file_no_ext"
|
|
176
|
+
|
|
177
|
+
# With new extension
|
|
178
|
+
result = rebase_path(
|
|
179
|
+
"/data/audio/file_no_ext", "/data/audio", "/output", extension="txt"
|
|
180
|
+
)
|
|
181
|
+
assert result == "/output/file_no_ext.txt"
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from abc import ABC, abstractmethod
|
|
3
|
-
from collections.abc import Sequence
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class AbstractUDF(ABC):
|
|
7
|
-
@abstractmethod
|
|
8
|
-
def process(self, *args, **kwargs):
|
|
9
|
-
pass
|
|
10
|
-
|
|
11
|
-
@abstractmethod
|
|
12
|
-
def setup(self):
|
|
13
|
-
pass
|
|
14
|
-
|
|
15
|
-
@abstractmethod
|
|
16
|
-
def teardown(self):
|
|
17
|
-
pass
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class DataChainError(Exception):
|
|
21
|
-
pass
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class DataChainParamsError(DataChainError):
|
|
25
|
-
pass
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class DataChainColumnError(DataChainParamsError):
|
|
29
|
-
def __init__(self, col_name: str, msg: str):
|
|
30
|
-
super().__init__(f"Error for column {col_name}: {msg}")
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
|
|
34
|
-
"""Returns normalized_name -> original_name dict."""
|
|
35
|
-
gen_col_counter = 0
|
|
36
|
-
new_col_names = {}
|
|
37
|
-
org_col_names = set(col_names)
|
|
38
|
-
|
|
39
|
-
for org_column in col_names:
|
|
40
|
-
new_column = org_column.lower()
|
|
41
|
-
new_column = re.sub("[^0-9a-z]+", "_", new_column)
|
|
42
|
-
new_column = new_column.strip("_")
|
|
43
|
-
|
|
44
|
-
generated_column = new_column
|
|
45
|
-
|
|
46
|
-
while (
|
|
47
|
-
not generated_column.isidentifier()
|
|
48
|
-
or generated_column in new_col_names
|
|
49
|
-
or (generated_column != org_column and generated_column in org_col_names)
|
|
50
|
-
):
|
|
51
|
-
if new_column:
|
|
52
|
-
generated_column = f"c{gen_col_counter}_{new_column}"
|
|
53
|
-
else:
|
|
54
|
-
generated_column = f"c{gen_col_counter}"
|
|
55
|
-
gen_col_counter += 1
|
|
56
|
-
|
|
57
|
-
new_col_names[generated_column] = org_column
|
|
58
|
-
|
|
59
|
-
return new_col_names
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|