datachain 0.8.11__tar.gz → 0.8.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.8.11 → datachain-0.8.12}/PKG-INFO +1 -1
- datachain-0.8.12/docs/references/func.md +5 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/references/index.md +1 -1
- {datachain-0.8.11 → datachain-0.8.12}/mkdocs.yml +1 -1
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/catalog/catalog.py +1 -20
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/__init__.py +0 -8
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/commands/__init__.py +0 -2
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/commands/datasets.py +0 -19
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/parser/__init__.py +0 -25
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/dataset.py +0 -6
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/func/conditional.py +16 -9
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/func/func.py +4 -5
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/remote/studio.py +1 -13
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain.egg-info/PKG-INFO +1 -1
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain.egg-info/SOURCES.txt +1 -1
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_catalog.py +23 -22
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_datachain.py +4 -3
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_datasets.py +3 -3
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_pull.py +0 -32
- {datachain-0.8.11 → datachain-0.8.12}/tests/test_cli_studio.py +1 -1
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_datachain.py +23 -42
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_diff.py +20 -20
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_func.py +30 -0
- datachain-0.8.11/docs/references/sql.md +0 -18
- {datachain-0.8.11 → datachain-0.8.12}/.cruft.json +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.gitattributes +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.github/codecov.yaml +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.github/dependabot.yml +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.github/workflows/release.yml +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.github/workflows/tests.yml +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.gitignore +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/.pre-commit-config.yaml +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/LICENSE +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/README.rst +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/assets/datachain.svg +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/contributing.md +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/examples.md +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/index.md +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/overrides/main.html +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/quick-start.md +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/references/datachain.md +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/references/datatype.md +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/references/file.md +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/references/torch.md +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/references/udf.md +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/docs/tutorials.md +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/multimodal/wds.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/noxfile.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/pyproject.toml +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/setup.cfg +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/__main__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/asyn.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cache.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/cli/utils.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/client/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/client/azure.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/client/gcs.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/client/hf.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/client/local.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/client/s3.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/config.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/error.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/func/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/func/array.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/func/base.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/func/numeric.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/func/path.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/func/random.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/func/string.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/func/window.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/job.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/clip.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/dc.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/file.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/hf.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/image.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/listing.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/settings.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/tar.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/text.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/udf.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/utils.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/listing.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/model/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/model/bbox.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/model/pose.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/model/segment.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/node.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/progress.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/py.typed +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/query/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/query/batch.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/query/dataset.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/query/metrics.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/query/params.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/query/queue.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/query/schema.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/query/session.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/query/udf.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/query/utils.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/types.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/sql/utils.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/studio.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/telemetry.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain/utils.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/conftest.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/data.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/examples/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/examples/test_examples.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/examples/wds_data.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_client.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_data_storage.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_file.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_hf.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_listing.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_ls.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_metrics.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_pytorch.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_query.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_session.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_toolkit.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/func/test_warehouse.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/scripts/feature_class.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/test_atomicity.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/test_cli_e2e.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/test_query_e2e.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/test_telemetry.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_asyn.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_cache.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_catalog.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_client.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_config.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_dataset.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_listing.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_metastore.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_query.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_query_params.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_serializer.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_session.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_utils.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.8.11 → datachain-0.8.12}/tests/utils.py +0 -0
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# Functions
|
|
2
|
+
|
|
3
|
+
Use built-in functions for data manipulation and analysis to operate on the underlying database storing the chain data. These functions are useful for operations like [`DataChain.filter`](datachain.md#datachain.lib.dc.DataChain.filter) and [`DataChain.mutate`](datachain.md#datachain.lib.dc.DataChain.mutate). Import these functions from `datachain.func`.
|
|
4
|
+
|
|
5
|
+
::: datachain.func
|
|
@@ -10,5 +10,5 @@ DataChain's API is organized into several modules:
|
|
|
10
10
|
- [DataType](./datatype.md) - Type system and schema definitions
|
|
11
11
|
- [File](./file.md) - File handling and storage operations
|
|
12
12
|
- [UDF](./udf.md) - User-defined functions and transformations
|
|
13
|
-
- [
|
|
13
|
+
- [Functions](./func.md) - Built-in functions for data manipulation and analysis
|
|
14
14
|
- [Torch](./torch.md) - PyTorch data loading utilities
|
|
@@ -73,7 +73,7 @@ nav:
|
|
|
73
73
|
- File: references/file.md
|
|
74
74
|
- UDF: references/udf.md
|
|
75
75
|
- Torch: references/torch.md
|
|
76
|
-
-
|
|
76
|
+
- Functions: references/func.md
|
|
77
77
|
- 🤝 Contributing: contributing.md
|
|
78
78
|
|
|
79
79
|
- DataChain Website ↗: https://datachain.ai" target="_blank"
|
|
@@ -38,7 +38,6 @@ from datachain.dataset import (
|
|
|
38
38
|
DatasetDependency,
|
|
39
39
|
DatasetListRecord,
|
|
40
40
|
DatasetRecord,
|
|
41
|
-
DatasetStats,
|
|
42
41
|
DatasetStatus,
|
|
43
42
|
StorageURI,
|
|
44
43
|
create_dataset_uri,
|
|
@@ -1235,17 +1234,6 @@ class Catalog:
|
|
|
1235
1234
|
dataset = self.get_dataset(name)
|
|
1236
1235
|
return self.warehouse.dataset_table_export_file_names(dataset, version)
|
|
1237
1236
|
|
|
1238
|
-
def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
|
|
1239
|
-
"""
|
|
1240
|
-
Returns tuple with dataset stats: total number of rows and total dataset size.
|
|
1241
|
-
"""
|
|
1242
|
-
dataset = self.get_dataset(name)
|
|
1243
|
-
dataset_version = dataset.get_version(version or dataset.latest_version)
|
|
1244
|
-
return DatasetStats(
|
|
1245
|
-
num_objects=dataset_version.num_objects,
|
|
1246
|
-
size=dataset_version.size,
|
|
1247
|
-
)
|
|
1248
|
-
|
|
1249
1237
|
def remove_dataset(
|
|
1250
1238
|
self,
|
|
1251
1239
|
name: str,
|
|
@@ -1391,19 +1379,12 @@ class Catalog:
|
|
|
1391
1379
|
except DatasetNotFoundError:
|
|
1392
1380
|
pass
|
|
1393
1381
|
|
|
1394
|
-
stats_response = studio_client.dataset_stats(
|
|
1395
|
-
remote_ds_name, remote_ds_version.version
|
|
1396
|
-
)
|
|
1397
|
-
if not stats_response.ok:
|
|
1398
|
-
raise_remote_error(stats_response.message)
|
|
1399
|
-
ds_stats = stats_response.data
|
|
1400
|
-
|
|
1401
1382
|
dataset_save_progress_bar = tqdm(
|
|
1402
1383
|
desc=f"Saving dataset {remote_ds_uri} locally: ",
|
|
1403
1384
|
unit=" rows",
|
|
1404
1385
|
unit_scale=True,
|
|
1405
1386
|
unit_divisor=1000,
|
|
1406
|
-
total=
|
|
1387
|
+
total=remote_ds_version.num_objects, # type: ignore [union-attr]
|
|
1407
1388
|
leave=False,
|
|
1408
1389
|
)
|
|
1409
1390
|
|
|
@@ -11,7 +11,6 @@ from datachain.telemetry import telemetry
|
|
|
11
11
|
from .commands import (
|
|
12
12
|
clear_cache,
|
|
13
13
|
completion,
|
|
14
|
-
dataset_stats,
|
|
15
14
|
du,
|
|
16
15
|
edit_dataset,
|
|
17
16
|
garbage_collect,
|
|
@@ -182,13 +181,6 @@ def handle_dataset_command(args, catalog):
|
|
|
182
181
|
all=args.all,
|
|
183
182
|
team=args.team,
|
|
184
183
|
),
|
|
185
|
-
"stats": lambda: dataset_stats(
|
|
186
|
-
catalog,
|
|
187
|
-
args.name,
|
|
188
|
-
args.version,
|
|
189
|
-
show_bytes=args.bytes,
|
|
190
|
-
si=args.si,
|
|
191
|
-
),
|
|
192
184
|
}
|
|
193
185
|
|
|
194
186
|
handler = dataset_commands.get(args.datasets_cmd)
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from .datasets import (
|
|
2
|
-
dataset_stats,
|
|
3
2
|
edit_dataset,
|
|
4
3
|
list_datasets,
|
|
5
4
|
list_datasets_local,
|
|
@@ -15,7 +14,6 @@ from .show import show
|
|
|
15
14
|
__all__ = [
|
|
16
15
|
"clear_cache",
|
|
17
16
|
"completion",
|
|
18
|
-
"dataset_stats",
|
|
19
17
|
"du",
|
|
20
18
|
"edit_dataset",
|
|
21
19
|
"garbage_collect",
|
|
@@ -3,8 +3,6 @@ from typing import TYPE_CHECKING, Optional
|
|
|
3
3
|
|
|
4
4
|
from tabulate import tabulate
|
|
5
5
|
|
|
6
|
-
from datachain import utils
|
|
7
|
-
|
|
8
6
|
if TYPE_CHECKING:
|
|
9
7
|
from datachain.catalog import Catalog
|
|
10
8
|
|
|
@@ -109,20 +107,3 @@ def edit_dataset(
|
|
|
109
107
|
|
|
110
108
|
if (all or studio) and token:
|
|
111
109
|
edit_studio_dataset(team, name, new_name, description, labels)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
def dataset_stats(
|
|
115
|
-
catalog: "Catalog",
|
|
116
|
-
name: str,
|
|
117
|
-
version: int,
|
|
118
|
-
show_bytes=False,
|
|
119
|
-
si=False,
|
|
120
|
-
):
|
|
121
|
-
stats = catalog.dataset_stats(name, version)
|
|
122
|
-
|
|
123
|
-
if stats:
|
|
124
|
-
print(f"Number of objects: {stats.num_objects}")
|
|
125
|
-
if show_bytes:
|
|
126
|
-
print(f"Total objects size: {stats.size}")
|
|
127
|
-
else:
|
|
128
|
-
print(f"Total objects size: {utils.sizeof_fmt(stats.size, si=si): >7}")
|
|
@@ -307,31 +307,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
|
307
307
|
help="The team to delete a dataset. By default, it will use team from config",
|
|
308
308
|
)
|
|
309
309
|
|
|
310
|
-
dataset_stats_parser = datasets_subparser.add_parser(
|
|
311
|
-
"stats", parents=[parent_parser], description="Show basic dataset statistics."
|
|
312
|
-
)
|
|
313
|
-
dataset_stats_parser.add_argument("name", type=str, help="Dataset name")
|
|
314
|
-
dataset_stats_parser.add_argument(
|
|
315
|
-
"--version",
|
|
316
|
-
action="store",
|
|
317
|
-
default=None,
|
|
318
|
-
type=int,
|
|
319
|
-
help="Dataset version",
|
|
320
|
-
)
|
|
321
|
-
dataset_stats_parser.add_argument(
|
|
322
|
-
"-b",
|
|
323
|
-
"--bytes",
|
|
324
|
-
default=False,
|
|
325
|
-
action="store_true",
|
|
326
|
-
help="Display size in bytes instead of human-readable size",
|
|
327
|
-
)
|
|
328
|
-
dataset_stats_parser.add_argument(
|
|
329
|
-
"--si",
|
|
330
|
-
default=False,
|
|
331
|
-
action="store_true",
|
|
332
|
-
help="Display size using powers of 1000 not 1024",
|
|
333
|
-
)
|
|
334
|
-
|
|
335
310
|
parse_ls = subp.add_parser(
|
|
336
311
|
"ls", parents=[parent_parser], description="List storage contents."
|
|
337
312
|
)
|
|
@@ -150,12 +150,6 @@ class DatasetDependency:
|
|
|
150
150
|
return hash(f"{self.type}_{self.name}_{self.version}")
|
|
151
151
|
|
|
152
152
|
|
|
153
|
-
@dataclass
|
|
154
|
-
class DatasetStats:
|
|
155
|
-
num_objects: Optional[int] # None if table is missing
|
|
156
|
-
size: Optional[int] # in bytes None if table is missing or empty
|
|
157
|
-
|
|
158
|
-
|
|
159
153
|
class DatasetStatus:
|
|
160
154
|
CREATED = 1
|
|
161
155
|
PENDING = 2
|
|
@@ -9,7 +9,7 @@ from datachain.sql.functions import conditional
|
|
|
9
9
|
|
|
10
10
|
from .func import ColT, Func
|
|
11
11
|
|
|
12
|
-
CaseT = Union[int, float, complex, bool, str, Func]
|
|
12
|
+
CaseT = Union[int, float, complex, bool, str, Func, ColumnElement]
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
def greatest(*args: Union[ColT, float]) -> Func:
|
|
@@ -94,11 +94,12 @@ def case(
|
|
|
94
94
|
"""
|
|
95
95
|
Returns the case function that produces case expression which has a list of
|
|
96
96
|
conditions and corresponding results. Results can be python primitives like string,
|
|
97
|
-
numbers or booleans but can also be other nested
|
|
97
|
+
numbers or booleans but can also be other nested functions (including case function)
|
|
98
|
+
or columns.
|
|
98
99
|
Result type is inferred from condition results.
|
|
99
100
|
|
|
100
101
|
Args:
|
|
101
|
-
args
|
|
102
|
+
args (tuple[ColumnElement | Func, str | int | float | complex | bool | Func | ColumnElement]):
|
|
102
103
|
Tuple of condition and values pair.
|
|
103
104
|
else_ (str | int | float | complex | bool | Func): optional else value in case
|
|
104
105
|
expression. If omitted, and no case conditions are satisfied, the result
|
|
@@ -113,13 +114,16 @@ def case(
|
|
|
113
114
|
res=func.case((C("num") > 0, "P"), (C("num") < 0, "N"), else_="Z"),
|
|
114
115
|
)
|
|
115
116
|
```
|
|
116
|
-
"""
|
|
117
|
+
""" # noqa: E501
|
|
117
118
|
supported_types = [int, float, complex, str, bool]
|
|
118
119
|
|
|
119
120
|
def _get_type(val):
|
|
120
121
|
if isinstance(val, Func):
|
|
121
122
|
# nested functions
|
|
122
123
|
return val.result_type
|
|
124
|
+
if isinstance(val, Column):
|
|
125
|
+
# at this point we cannot know what is the type of a column
|
|
126
|
+
return None
|
|
123
127
|
return type(val)
|
|
124
128
|
|
|
125
129
|
if not args:
|
|
@@ -129,13 +133,16 @@ def case(
|
|
|
129
133
|
|
|
130
134
|
for arg in args:
|
|
131
135
|
arg_type = _get_type(arg[1])
|
|
136
|
+
if arg_type is None:
|
|
137
|
+
# we couldn't figure out the type of case value
|
|
138
|
+
continue
|
|
132
139
|
if type_ and arg_type != type_:
|
|
133
140
|
raise DataChainParamsError(
|
|
134
141
|
f"Statement values must be of the same type, got {type_} and {arg_type}"
|
|
135
142
|
)
|
|
136
143
|
type_ = arg_type
|
|
137
144
|
|
|
138
|
-
if type_ not in supported_types:
|
|
145
|
+
if type_ is not None and type_ not in supported_types:
|
|
139
146
|
raise DataChainParamsError(
|
|
140
147
|
f"Only python literals ({supported_types}) are supported for values"
|
|
141
148
|
)
|
|
@@ -151,15 +158,15 @@ def ifelse(
|
|
|
151
158
|
"""
|
|
152
159
|
Returns the ifelse function that produces if expression which has a condition
|
|
153
160
|
and values for true and false outcome. Results can be one of python primitives
|
|
154
|
-
like string, numbers or booleans, but can also be nested functions.
|
|
161
|
+
like string, numbers or booleans, but can also be nested functions or columns.
|
|
155
162
|
Result type is inferred from the values.
|
|
156
163
|
|
|
157
164
|
Args:
|
|
158
165
|
condition (ColumnElement, Func): Condition which is evaluated.
|
|
159
|
-
if_val (str | int | float | complex | bool, Func): Value for true
|
|
166
|
+
if_val (str | int | float | complex | bool, Func, ColumnElement): Value for true
|
|
160
167
|
condition outcome.
|
|
161
|
-
else_val (str | int | float | complex | bool, Func): Value for
|
|
162
|
-
outcome.
|
|
168
|
+
else_val (str | int | float | complex | bool, Func, ColumnElement): Value for
|
|
169
|
+
false condition outcome.
|
|
163
170
|
|
|
164
171
|
Returns:
|
|
165
172
|
Func: A Func object that represents the ifelse function.
|
|
@@ -424,10 +424,9 @@ class Func(Function):
|
|
|
424
424
|
|
|
425
425
|
def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
|
|
426
426
|
if isinstance(col, tuple):
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
427
|
+
# we can only get tuple from case statement where the first tuple item
|
|
428
|
+
# is condition, and second one is value which type is important
|
|
429
|
+
col = col[1]
|
|
431
430
|
if isinstance(col, Func):
|
|
432
431
|
return col.get_result_type(signals_schema)
|
|
433
432
|
|
|
@@ -435,7 +434,7 @@ def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
|
|
|
435
434
|
return sql_to_python(col)
|
|
436
435
|
|
|
437
436
|
return signals_schema.get_column_type(
|
|
438
|
-
col.name if isinstance(col, ColumnElement) else col
|
|
437
|
+
col.name if isinstance(col, ColumnElement) else col # type: ignore[arg-type]
|
|
439
438
|
)
|
|
440
439
|
|
|
441
440
|
|
|
@@ -16,14 +16,12 @@ from urllib.parse import urlparse, urlunparse
|
|
|
16
16
|
import websockets
|
|
17
17
|
|
|
18
18
|
from datachain.config import Config
|
|
19
|
-
from datachain.dataset import DatasetStats
|
|
20
19
|
from datachain.error import DataChainError
|
|
21
20
|
from datachain.utils import STUDIO_URL, retry_with_backoff
|
|
22
21
|
|
|
23
22
|
T = TypeVar("T")
|
|
24
23
|
LsData = Optional[list[dict[str, Any]]]
|
|
25
24
|
DatasetInfoData = Optional[dict[str, Any]]
|
|
26
|
-
DatasetStatsData = Optional[DatasetStats]
|
|
27
25
|
DatasetRowsData = Optional[Iterable[dict[str, Any]]]
|
|
28
26
|
DatasetJobVersionsData = Optional[dict[str, Any]]
|
|
29
27
|
DatasetExportStatus = Optional[dict[str, Any]]
|
|
@@ -309,7 +307,7 @@ class StudioClient:
|
|
|
309
307
|
"datachain/datasets",
|
|
310
308
|
{
|
|
311
309
|
"dataset_name": name,
|
|
312
|
-
"
|
|
310
|
+
"dataset_version": version,
|
|
313
311
|
"force": force,
|
|
314
312
|
},
|
|
315
313
|
method="DELETE",
|
|
@@ -347,16 +345,6 @@ class StudioClient:
|
|
|
347
345
|
method="GET",
|
|
348
346
|
)
|
|
349
347
|
|
|
350
|
-
def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
|
|
351
|
-
response = self._send_request(
|
|
352
|
-
"datachain/datasets/stats",
|
|
353
|
-
{"dataset_name": name, "dataset_version": version},
|
|
354
|
-
method="GET",
|
|
355
|
-
)
|
|
356
|
-
if response.ok:
|
|
357
|
-
response.data = DatasetStats(**response.data)
|
|
358
|
-
return response
|
|
359
|
-
|
|
360
348
|
def export_dataset_table(
|
|
361
349
|
self, name: str, version: int
|
|
362
350
|
) -> Response[DatasetExportSignedUrls]:
|
|
@@ -31,8 +31,8 @@ docs/overrides/main.html
|
|
|
31
31
|
docs/references/datachain.md
|
|
32
32
|
docs/references/datatype.md
|
|
33
33
|
docs/references/file.md
|
|
34
|
+
docs/references/func.md
|
|
34
35
|
docs/references/index.md
|
|
35
|
-
docs/references/sql.md
|
|
36
36
|
docs/references/torch.md
|
|
37
37
|
docs/references/udf.md
|
|
38
38
|
examples/computer_vision/iptc_exif_xmp_lib.py
|
|
@@ -17,7 +17,8 @@ from tests.utils import DEFAULT_TREE, skip_if_not_sqlite, tree_from_path
|
|
|
17
17
|
def listing_stats(uri, catalog):
|
|
18
18
|
list_dataset_name, _, _ = parse_listing_uri(uri, catalog.client_config)
|
|
19
19
|
dataset = catalog.get_dataset(list_dataset_name)
|
|
20
|
-
|
|
20
|
+
dataset_version = dataset.get_version(dataset.latest_version)
|
|
21
|
+
return dataset_version.num_objects, dataset_version.size
|
|
21
22
|
|
|
22
23
|
|
|
23
24
|
@pytest.fixture
|
|
@@ -582,23 +583,23 @@ def test_listing_stats(cloud_test_catalog):
|
|
|
582
583
|
listing_stats(src_uri, catalog)
|
|
583
584
|
|
|
584
585
|
catalog.enlist_source(src_uri)
|
|
585
|
-
|
|
586
|
-
assert
|
|
587
|
-
assert
|
|
586
|
+
num_objects, size = listing_stats(src_uri, catalog)
|
|
587
|
+
assert num_objects == 7
|
|
588
|
+
assert size == 36
|
|
588
589
|
|
|
589
590
|
catalog.enlist_source(f"{src_uri}/dogs/", update=True)
|
|
590
|
-
|
|
591
|
-
assert
|
|
592
|
-
assert
|
|
591
|
+
num_objects, size = listing_stats(src_uri, catalog)
|
|
592
|
+
assert num_objects == 7
|
|
593
|
+
assert size == 36
|
|
593
594
|
|
|
594
|
-
|
|
595
|
-
assert
|
|
596
|
-
assert
|
|
595
|
+
num_objects, size = listing_stats(f"{src_uri}/dogs/", catalog)
|
|
596
|
+
assert num_objects == 4
|
|
597
|
+
assert size == 15
|
|
597
598
|
|
|
598
599
|
catalog.enlist_source(f"{src_uri}/dogs/")
|
|
599
|
-
|
|
600
|
-
assert
|
|
601
|
-
assert
|
|
600
|
+
num_objects, size = listing_stats(src_uri, catalog)
|
|
601
|
+
assert num_objects == 7
|
|
602
|
+
assert size == 36
|
|
602
603
|
|
|
603
604
|
|
|
604
605
|
@pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)
|
|
@@ -608,15 +609,15 @@ def test_enlist_source_handles_slash(cloud_test_catalog):
|
|
|
608
609
|
src_path = f"{src_uri}/dogs"
|
|
609
610
|
|
|
610
611
|
catalog.enlist_source(src_path)
|
|
611
|
-
|
|
612
|
-
assert
|
|
613
|
-
assert
|
|
612
|
+
num_objects, size = listing_stats(src_path, catalog)
|
|
613
|
+
assert num_objects == len(DEFAULT_TREE["dogs"])
|
|
614
|
+
assert size == 15
|
|
614
615
|
|
|
615
616
|
src_path = f"{src_uri}/dogs"
|
|
616
617
|
catalog.enlist_source(src_path, update=True)
|
|
617
|
-
|
|
618
|
-
assert
|
|
619
|
-
assert
|
|
618
|
+
num_objects, size = listing_stats(src_path, catalog)
|
|
619
|
+
assert num_objects == len(DEFAULT_TREE["dogs"])
|
|
620
|
+
assert size == 15
|
|
620
621
|
|
|
621
622
|
|
|
622
623
|
@pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)
|
|
@@ -626,10 +627,10 @@ def test_enlist_source_handles_glob(cloud_test_catalog):
|
|
|
626
627
|
src_path = f"{src_uri}/dogs/*.jpg"
|
|
627
628
|
|
|
628
629
|
catalog.enlist_source(src_path)
|
|
629
|
-
|
|
630
|
+
num_objects, size = listing_stats(src_path, catalog)
|
|
630
631
|
|
|
631
|
-
assert
|
|
632
|
-
assert
|
|
632
|
+
assert num_objects == len(DEFAULT_TREE["dogs"])
|
|
633
|
+
assert size == 15
|
|
633
634
|
|
|
634
635
|
|
|
635
636
|
@pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)
|
|
@@ -20,7 +20,7 @@ from sqlalchemy import Column
|
|
|
20
20
|
from datachain import DataModel, func
|
|
21
21
|
from datachain.catalog.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
|
|
22
22
|
from datachain.data_storage.sqlite import SQLiteWarehouse
|
|
23
|
-
from datachain.dataset import DatasetDependencyType
|
|
23
|
+
from datachain.dataset import DatasetDependencyType
|
|
24
24
|
from datachain.func import path as pathfunc
|
|
25
25
|
from datachain.lib.dc import C, DataChain
|
|
26
26
|
from datachain.lib.file import File, ImageFile
|
|
@@ -515,8 +515,9 @@ def test_from_storage_dataset_stats(tmp_dir, test_session):
|
|
|
515
515
|
dc = DataChain.from_storage(tmp_dir.as_uri(), session=test_session).save(
|
|
516
516
|
"test-data"
|
|
517
517
|
)
|
|
518
|
-
|
|
519
|
-
assert
|
|
518
|
+
version = test_session.catalog.get_dataset(dc.name).get_version(dc.version)
|
|
519
|
+
assert version.num_objects == 4
|
|
520
|
+
assert version.size == 20
|
|
520
521
|
|
|
521
522
|
|
|
522
523
|
def test_from_storage_check_rows(tmp_dir, test_session):
|
|
@@ -845,9 +845,9 @@ def test_row_random(cloud_test_catalog):
|
|
|
845
845
|
|
|
846
846
|
def test_dataset_stats_registered_ds(cloud_test_catalog, dogs_dataset):
|
|
847
847
|
catalog = cloud_test_catalog.catalog
|
|
848
|
-
|
|
849
|
-
assert
|
|
850
|
-
assert
|
|
848
|
+
dataset = catalog.get_dataset(dogs_dataset.name).get_version(1)
|
|
849
|
+
assert dataset.num_objects == 4
|
|
850
|
+
assert dataset.size == 15
|
|
851
851
|
rows_count = catalog.warehouse.dataset_rows_count(dogs_dataset, 1)
|
|
852
852
|
assert rows_count == 4
|
|
853
853
|
|
|
@@ -154,14 +154,6 @@ def remote_dataset_info(requests_mock, remote_dataset):
|
|
|
154
154
|
requests_mock.get(f"{STUDIO_URL}/api/datachain/datasets/info", json=remote_dataset)
|
|
155
155
|
|
|
156
156
|
|
|
157
|
-
@pytest.fixture
|
|
158
|
-
def remote_dataset_stats(requests_mock):
|
|
159
|
-
requests_mock.get(
|
|
160
|
-
f"{STUDIO_URL}/api/datachain/datasets/stats",
|
|
161
|
-
json={"num_objects": 5, "size": 1000},
|
|
162
|
-
)
|
|
163
|
-
|
|
164
|
-
|
|
165
157
|
@pytest.fixture
|
|
166
158
|
def dataset_export(requests_mock, remote_dataset_chunk_url):
|
|
167
159
|
requests_mock.get(
|
|
@@ -194,7 +186,6 @@ def test_pull_dataset_success(
|
|
|
194
186
|
mocker,
|
|
195
187
|
cloud_test_catalog,
|
|
196
188
|
remote_dataset_info,
|
|
197
|
-
remote_dataset_stats,
|
|
198
189
|
dataset_export,
|
|
199
190
|
dataset_export_status,
|
|
200
191
|
dataset_export_data_chunk,
|
|
@@ -322,25 +313,6 @@ def test_pull_dataset_not_found_in_remote(
|
|
|
322
313
|
assert str(exc_info.value) == "Error from server: Dataset not found"
|
|
323
314
|
|
|
324
315
|
|
|
325
|
-
@pytest.mark.parametrize("cloud_type, version_aware", [("s3", False)], indirect=True)
|
|
326
|
-
@skip_if_not_sqlite
|
|
327
|
-
def test_pull_dataset_error_on_fetching_stats(
|
|
328
|
-
requests_mock,
|
|
329
|
-
cloud_test_catalog,
|
|
330
|
-
remote_dataset_info,
|
|
331
|
-
):
|
|
332
|
-
requests_mock.get(
|
|
333
|
-
f"{STUDIO_URL}/api/datachain/datasets/stats",
|
|
334
|
-
status_code=400,
|
|
335
|
-
json={"message": "Internal error"},
|
|
336
|
-
)
|
|
337
|
-
catalog = cloud_test_catalog.catalog
|
|
338
|
-
|
|
339
|
-
with pytest.raises(DataChainError) as exc_info:
|
|
340
|
-
catalog.pull_dataset("ds://dogs@v1")
|
|
341
|
-
assert str(exc_info.value) == "Error from server: Internal error"
|
|
342
|
-
|
|
343
|
-
|
|
344
316
|
@pytest.mark.parametrize("cloud_type, version_aware", [("s3", False)], indirect=True)
|
|
345
317
|
@pytest.mark.parametrize("export_status", ["failed", "removed"])
|
|
346
318
|
@skip_if_not_sqlite
|
|
@@ -348,7 +320,6 @@ def test_pull_dataset_exporting_dataset_failed_in_remote(
|
|
|
348
320
|
requests_mock,
|
|
349
321
|
cloud_test_catalog,
|
|
350
322
|
remote_dataset_info,
|
|
351
|
-
remote_dataset_stats,
|
|
352
323
|
dataset_export,
|
|
353
324
|
export_status,
|
|
354
325
|
):
|
|
@@ -372,7 +343,6 @@ def test_pull_dataset_empty_parquet(
|
|
|
372
343
|
requests_mock,
|
|
373
344
|
cloud_test_catalog,
|
|
374
345
|
remote_dataset_info,
|
|
375
|
-
remote_dataset_stats,
|
|
376
346
|
dataset_export,
|
|
377
347
|
dataset_export_status,
|
|
378
348
|
remote_dataset_chunk_url,
|
|
@@ -389,7 +359,6 @@ def test_pull_dataset_empty_parquet(
|
|
|
389
359
|
def test_pull_dataset_already_exists_locally(
|
|
390
360
|
cloud_test_catalog,
|
|
391
361
|
remote_dataset_info,
|
|
392
|
-
remote_dataset_stats,
|
|
393
362
|
dataset_export,
|
|
394
363
|
dataset_export_status,
|
|
395
364
|
dataset_export_data_chunk,
|
|
@@ -416,7 +385,6 @@ def test_pull_dataset_already_exists_locally(
|
|
|
416
385
|
def test_pull_dataset_local_name_already_exists(
|
|
417
386
|
cloud_test_catalog,
|
|
418
387
|
remote_dataset_info,
|
|
419
|
-
remote_dataset_stats,
|
|
420
388
|
dataset_export,
|
|
421
389
|
dataset_export_status,
|
|
422
390
|
dataset_export_data_chunk,
|