datachain 0.6.9__tar.gz → 0.6.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.6.9/src/datachain.egg-info → datachain-0.6.11}/PKG-INFO +2 -2
- {datachain-0.6.9 → datachain-0.6.11}/mkdocs.yml +1 -1
- {datachain-0.6.9 → datachain-0.6.11}/pyproject.toml +1 -1
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/catalog/catalog.py +15 -3
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/sqlite.py +6 -2
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/dc.py +53 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/listing.py +24 -7
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/sqlite/types.py +8 -1
- datachain-0.6.11/src/datachain/toolkit/__init__.py +3 -0
- datachain-0.6.11/src/datachain/toolkit/split.py +67 -0
- {datachain-0.6.9 → datachain-0.6.11/src/datachain.egg-info}/PKG-INFO +2 -2
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/SOURCES.txt +5 -1
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.6.9 → datachain-0.6.11}/tests/conftest.py +41 -1
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_dataset_query.py +66 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_pull.py +33 -6
- datachain-0.6.11/tests/func/test_toolkit.py +42 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_datachain.py +42 -0
- datachain-0.6.11/tests/unit/sql/sqlite/test_types.py +19 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_listing.py +2 -1
- {datachain-0.6.9 → datachain-0.6.11}/.cruft.json +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.gitattributes +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.github/codecov.yaml +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.github/dependabot.yml +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.github/workflows/release.yml +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.github/workflows/tests.yml +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.gitignore +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/.pre-commit-config.yaml +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/CONTRIBUTING.rst +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/LICENSE +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/README.rst +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/docs/assets/datachain.svg +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/docs/index.md +0 -0
- {datachain-0.6.9 → datachain-0.6.11/docs}/overrides/main.html +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/docs/references/datachain.md +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/docs/references/datatype.md +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/docs/references/file.md +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/docs/references/index.md +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/docs/references/sql.md +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/docs/references/torch.md +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/docs/references/udf.md +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/multimodal/wds.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/noxfile.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/setup.cfg +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/__main__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/asyn.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/cache.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/cli.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/cli_utils.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/azure.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/gcs.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/hf.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/local.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/s3.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/config.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/dataset.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/error.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/job.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/clip.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/file.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/func/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/func/aggregate.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/func/func.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/hf.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/image.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/listing.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/models/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/models/bbox.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/models/pose.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/models/yolo.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/settings.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/tar.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/text.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/udf.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/utils.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/node.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/progress.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/py.typed +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/batch.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/dataset.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/metrics.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/params.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/queue.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/schema.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/session.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/remote/studio.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/types.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/utils.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/studio.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/telemetry.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain/utils.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/data.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/examples/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/examples/test_examples.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/examples/wds_data.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_catalog.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_client.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_datachain.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_datasets.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_listing.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_ls.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_metrics.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_pytorch.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_query.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/scripts/feature_class.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/test_atomicity.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/test_cli_e2e.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/test_cli_studio.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/test_query_e2e.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/test_telemetry.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_asyn.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_cache.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_catalog.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_client.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_config.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_dataset.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_metastore.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_query.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_query_params.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_serializer.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_session.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_utils.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.6.9 → datachain-0.6.11}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.6.9
|
|
3
|
+
Version: 0.6.11
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -71,7 +71,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
|
71
71
|
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
72
72
|
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
73
73
|
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
74
|
-
Requires-Dist: pytest-servers[all]>=0.5.
|
|
74
|
+
Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
|
|
75
75
|
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
76
76
|
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
77
77
|
Requires-Dist: virtualenv; extra == "tests"
|
|
@@ -603,9 +603,10 @@ class Catalog:
|
|
|
603
603
|
)
|
|
604
604
|
|
|
605
605
|
lst = Listing(
|
|
606
|
+
self.metastore.clone(),
|
|
606
607
|
self.warehouse.clone(),
|
|
607
608
|
Client.get_client(list_uri, self.cache, **self.client_config),
|
|
608
|
-
|
|
609
|
+
dataset_name=list_ds_name,
|
|
609
610
|
object_name=object_name,
|
|
610
611
|
)
|
|
611
612
|
|
|
@@ -698,9 +699,13 @@ class Catalog:
|
|
|
698
699
|
|
|
699
700
|
client = self.get_client(source, **client_config)
|
|
700
701
|
uri = client.uri
|
|
701
|
-
st = self.warehouse.clone()
|
|
702
702
|
dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
|
|
703
|
-
listing = Listing(
|
|
703
|
+
listing = Listing(
|
|
704
|
+
self.metastore.clone(),
|
|
705
|
+
self.warehouse.clone(),
|
|
706
|
+
client,
|
|
707
|
+
dataset_name=dataset_name,
|
|
708
|
+
)
|
|
704
709
|
rows = DatasetQuery(
|
|
705
710
|
name=dataset.name, version=ds_version, catalog=self
|
|
706
711
|
).to_db_records()
|
|
@@ -1354,6 +1359,13 @@ class Catalog:
|
|
|
1354
1359
|
# we will create new one if it doesn't exist
|
|
1355
1360
|
pass
|
|
1356
1361
|
|
|
1362
|
+
if dataset and version and dataset.has_version(version):
|
|
1363
|
+
"""No need to communicate with Studio at all"""
|
|
1364
|
+
dataset_uri = create_dataset_uri(remote_dataset_name, version)
|
|
1365
|
+
print(f"Local copy of dataset {dataset_uri} already present")
|
|
1366
|
+
_instantiate_dataset()
|
|
1367
|
+
return
|
|
1368
|
+
|
|
1357
1369
|
remote_dataset = self.get_remote_dataset(remote_dataset_name)
|
|
1358
1370
|
# if version is not specified in uri, take the latest one
|
|
1359
1371
|
if not version:
|
|
@@ -747,8 +747,12 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
747
747
|
|
|
748
748
|
ids = self.db.execute(select_ids).fetchall()
|
|
749
749
|
|
|
750
|
-
select_q =
|
|
751
|
-
|
|
750
|
+
select_q = (
|
|
751
|
+
query.with_only_columns(
|
|
752
|
+
*[c for c in query.selected_columns if c.name != "sys__id"]
|
|
753
|
+
)
|
|
754
|
+
.offset(None)
|
|
755
|
+
.limit(None)
|
|
752
756
|
)
|
|
753
757
|
|
|
754
758
|
for batch in batched_it(ids, 10_000):
|
|
@@ -642,6 +642,59 @@ class DataChain:
|
|
|
642
642
|
}
|
|
643
643
|
return chain.gen(**signal_dict) # type: ignore[misc, arg-type]
|
|
644
644
|
|
|
645
|
+
def explode(
|
|
646
|
+
self,
|
|
647
|
+
col: str,
|
|
648
|
+
model_name: Optional[str] = None,
|
|
649
|
+
object_name: Optional[str] = None,
|
|
650
|
+
) -> "DataChain":
|
|
651
|
+
"""Explodes a column containing JSON objects (dict or str DataChain type) into
|
|
652
|
+
individual columns based on the schema of the JSON. Schema is inferred from
|
|
653
|
+
the first row of the column.
|
|
654
|
+
|
|
655
|
+
Args:
|
|
656
|
+
col: the name of the column containing JSON to be exploded.
|
|
657
|
+
model_name: optional generated model name. By default generates the name
|
|
658
|
+
automatically.
|
|
659
|
+
object_name: optional generated object column name. By default generates the
|
|
660
|
+
name automatically.
|
|
661
|
+
|
|
662
|
+
Returns:
|
|
663
|
+
DataChain: A new DataChain instance with the new set of columns.
|
|
664
|
+
"""
|
|
665
|
+
import json
|
|
666
|
+
|
|
667
|
+
import pyarrow as pa
|
|
668
|
+
|
|
669
|
+
from datachain.lib.arrow import schema_to_output
|
|
670
|
+
|
|
671
|
+
json_value = next(self.limit(1).collect(col))
|
|
672
|
+
json_dict = (
|
|
673
|
+
json.loads(json_value) if isinstance(json_value, str) else json_value
|
|
674
|
+
)
|
|
675
|
+
|
|
676
|
+
if not isinstance(json_dict, dict):
|
|
677
|
+
raise TypeError(f"Column {col} should be a string or dict type with JSON")
|
|
678
|
+
|
|
679
|
+
schema = pa.Table.from_pylist([json_dict]).schema
|
|
680
|
+
output = schema_to_output(schema, None)
|
|
681
|
+
|
|
682
|
+
if not model_name:
|
|
683
|
+
model_name = f"{col.title()}ExplodedModel"
|
|
684
|
+
|
|
685
|
+
model = dict_to_data_model(model_name, output)
|
|
686
|
+
|
|
687
|
+
def json_to_model(json_value: Union[str, dict]):
|
|
688
|
+
json_dict = (
|
|
689
|
+
json.loads(json_value) if isinstance(json_value, str) else json_value
|
|
690
|
+
)
|
|
691
|
+
return model.model_validate(json_dict)
|
|
692
|
+
|
|
693
|
+
if not object_name:
|
|
694
|
+
object_name = f"{col}_expl"
|
|
695
|
+
|
|
696
|
+
return self.map(json_to_model, params=col, output={object_name: model})
|
|
697
|
+
|
|
645
698
|
@classmethod
|
|
646
699
|
def datasets(
|
|
647
700
|
cls,
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import glob
|
|
2
2
|
import os
|
|
3
3
|
from collections.abc import Iterable, Iterator
|
|
4
|
+
from functools import cached_property
|
|
4
5
|
from itertools import zip_longest
|
|
5
6
|
from typing import TYPE_CHECKING, Optional
|
|
6
7
|
|
|
@@ -15,28 +16,34 @@ from datachain.utils import suffix_to_number
|
|
|
15
16
|
if TYPE_CHECKING:
|
|
16
17
|
from datachain.catalog.datasource import DataSource
|
|
17
18
|
from datachain.client import Client
|
|
18
|
-
from datachain.data_storage import AbstractWarehouse
|
|
19
|
+
from datachain.data_storage import AbstractMetastore, AbstractWarehouse
|
|
19
20
|
from datachain.dataset import DatasetRecord
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
class Listing:
|
|
23
24
|
def __init__(
|
|
24
25
|
self,
|
|
26
|
+
metastore: "AbstractMetastore",
|
|
25
27
|
warehouse: "AbstractWarehouse",
|
|
26
28
|
client: "Client",
|
|
27
|
-
|
|
29
|
+
dataset_name: Optional["str"] = None,
|
|
30
|
+
dataset_version: Optional[int] = None,
|
|
28
31
|
object_name: str = "file",
|
|
29
32
|
):
|
|
33
|
+
self.metastore = metastore
|
|
30
34
|
self.warehouse = warehouse
|
|
31
35
|
self.client = client
|
|
32
|
-
self.
|
|
36
|
+
self.dataset_name = dataset_name # dataset representing bucket listing
|
|
37
|
+
self.dataset_version = dataset_version # dataset representing bucket listing
|
|
33
38
|
self.object_name = object_name
|
|
34
39
|
|
|
35
40
|
def clone(self) -> "Listing":
|
|
36
41
|
return self.__class__(
|
|
42
|
+
self.metastore.clone(),
|
|
37
43
|
self.warehouse.clone(),
|
|
38
44
|
self.client,
|
|
39
|
-
self.
|
|
45
|
+
self.dataset_name,
|
|
46
|
+
self.dataset_version,
|
|
40
47
|
self.object_name,
|
|
41
48
|
)
|
|
42
49
|
|
|
@@ -53,12 +60,22 @@ class Listing:
|
|
|
53
60
|
def uri(self):
|
|
54
61
|
from datachain.lib.listing import listing_uri_from_name
|
|
55
62
|
|
|
56
|
-
|
|
63
|
+
assert self.dataset_name
|
|
57
64
|
|
|
58
|
-
|
|
65
|
+
return listing_uri_from_name(self.dataset_name)
|
|
66
|
+
|
|
67
|
+
@cached_property
|
|
68
|
+
def dataset(self) -> "DatasetRecord":
|
|
69
|
+
assert self.dataset_name
|
|
70
|
+
return self.metastore.get_dataset(self.dataset_name)
|
|
71
|
+
|
|
72
|
+
@cached_property
|
|
59
73
|
def dataset_rows(self):
|
|
74
|
+
dataset = self.dataset
|
|
60
75
|
return self.warehouse.dataset_rows(
|
|
61
|
-
|
|
76
|
+
dataset,
|
|
77
|
+
self.dataset_version or dataset.latest_version,
|
|
78
|
+
object_name=self.object_name,
|
|
62
79
|
)
|
|
63
80
|
|
|
64
81
|
def expand_path(self, path, use_glob=True) -> list[Node]:
|
|
@@ -36,7 +36,14 @@ def convert_array(arr):
|
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
def adapt_np_array(arr):
|
|
39
|
-
|
|
39
|
+
def _json_serialize(obj):
|
|
40
|
+
if isinstance(obj, np.ndarray):
|
|
41
|
+
return obj.tolist()
|
|
42
|
+
return obj
|
|
43
|
+
|
|
44
|
+
return orjson.dumps(
|
|
45
|
+
arr, option=orjson.OPT_SERIALIZE_NUMPY, default=_json_serialize
|
|
46
|
+
).decode("utf-8")
|
|
40
47
|
|
|
41
48
|
|
|
42
49
|
def adapt_np_generic(val):
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from datachain import C, DataChain
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
|
|
5
|
+
"""
|
|
6
|
+
Splits a DataChain into multiple subsets based on the provided weights.
|
|
7
|
+
|
|
8
|
+
This function partitions the rows or items of a DataChain into disjoint subsets,
|
|
9
|
+
ensuring that the relative sizes of the subsets correspond to the given weights.
|
|
10
|
+
It is particularly useful for creating training, validation, and test datasets.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
dc (DataChain):
|
|
14
|
+
The DataChain instance to split.
|
|
15
|
+
weights (list[float]):
|
|
16
|
+
A list of weights indicating the relative proportions of the splits.
|
|
17
|
+
The weights do not need to sum to 1; they will be normalized internally.
|
|
18
|
+
For example:
|
|
19
|
+
- `[0.7, 0.3]` corresponds to a 70/30 split;
|
|
20
|
+
- `[2, 1, 1]` corresponds to a 50/25/25 split.
|
|
21
|
+
|
|
22
|
+
Returns:
|
|
23
|
+
list[DataChain]:
|
|
24
|
+
A list of DataChain instances, one for each weight in the weights list.
|
|
25
|
+
|
|
26
|
+
Examples:
|
|
27
|
+
Train-test split:
|
|
28
|
+
```python
|
|
29
|
+
from datachain import DataChain
|
|
30
|
+
from datachain.toolkit import train_test_split
|
|
31
|
+
|
|
32
|
+
# Load a DataChain from a storage source (e.g., S3 bucket)
|
|
33
|
+
dc = DataChain.from_storage("s3://bucket/dir/")
|
|
34
|
+
|
|
35
|
+
# Perform a 70/30 train-test split
|
|
36
|
+
train, test = train_test_split(dc, [0.7, 0.3])
|
|
37
|
+
|
|
38
|
+
# Save the resulting splits
|
|
39
|
+
train.save("dataset_train")
|
|
40
|
+
test.save("dataset_test")
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Train-test-validation split:
|
|
44
|
+
```python
|
|
45
|
+
train, test, val = train_test_split(dc, [0.7, 0.2, 0.1])
|
|
46
|
+
train.save("dataset_train")
|
|
47
|
+
test.save("dataset_test")
|
|
48
|
+
val.save("dataset_val")
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Note:
|
|
52
|
+
The splits are random but deterministic, based on Dataset `sys__rand` field.
|
|
53
|
+
"""
|
|
54
|
+
if len(weights) < 2:
|
|
55
|
+
raise ValueError("Weights should have at least two elements")
|
|
56
|
+
if any(weight < 0 for weight in weights):
|
|
57
|
+
raise ValueError("Weights should be non-negative")
|
|
58
|
+
|
|
59
|
+
weights_normalized = [weight / sum(weights) for weight in weights]
|
|
60
|
+
|
|
61
|
+
return [
|
|
62
|
+
dc.filter(
|
|
63
|
+
C("sys__rand") % 1000 >= round(sum(weights_normalized[:index]) * 1000),
|
|
64
|
+
C("sys__rand") % 1000 < round(sum(weights_normalized[: index + 1]) * 1000),
|
|
65
|
+
)
|
|
66
|
+
for index, _ in enumerate(weights_normalized)
|
|
67
|
+
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.6.9
|
|
3
|
+
Version: 0.6.11
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -71,7 +71,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
|
71
71
|
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
72
72
|
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
73
73
|
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
74
|
-
Requires-Dist: pytest-servers[all]>=0.5.
|
|
74
|
+
Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
|
|
75
75
|
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
76
76
|
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
77
77
|
Requires-Dist: virtualenv; extra == "tests"
|
|
@@ -23,6 +23,7 @@ docs/index.md
|
|
|
23
23
|
docs/assets/captioned_cartoons.png
|
|
24
24
|
docs/assets/datachain-white.svg
|
|
25
25
|
docs/assets/datachain.svg
|
|
26
|
+
docs/overrides/main.html
|
|
26
27
|
docs/references/datachain.md
|
|
27
28
|
docs/references/datatype.md
|
|
28
29
|
docs/references/file.md
|
|
@@ -48,7 +49,6 @@ examples/multimodal/hf_pipeline.py
|
|
|
48
49
|
examples/multimodal/openai_image_desc_lib.py
|
|
49
50
|
examples/multimodal/wds.py
|
|
50
51
|
examples/multimodal/wds_filtered.py
|
|
51
|
-
overrides/main.html
|
|
52
52
|
src/datachain/__init__.py
|
|
53
53
|
src/datachain/__main__.py
|
|
54
54
|
src/datachain/asyn.py
|
|
@@ -160,6 +160,8 @@ src/datachain/sql/sqlite/__init__.py
|
|
|
160
160
|
src/datachain/sql/sqlite/base.py
|
|
161
161
|
src/datachain/sql/sqlite/types.py
|
|
162
162
|
src/datachain/sql/sqlite/vector.py
|
|
163
|
+
src/datachain/toolkit/__init__.py
|
|
164
|
+
src/datachain/toolkit/split.py
|
|
163
165
|
src/datachain/torch/__init__.py
|
|
164
166
|
tests/__init__.py
|
|
165
167
|
tests/conftest.py
|
|
@@ -197,6 +199,7 @@ tests/func/test_metrics.py
|
|
|
197
199
|
tests/func/test_pull.py
|
|
198
200
|
tests/func/test_pytorch.py
|
|
199
201
|
tests/func/test_query.py
|
|
202
|
+
tests/func/test_toolkit.py
|
|
200
203
|
tests/scripts/feature_class.py
|
|
201
204
|
tests/scripts/feature_class_exception.py
|
|
202
205
|
tests/scripts/feature_class_parallel.py
|
|
@@ -256,4 +259,5 @@ tests/unit/sql/test_random.py
|
|
|
256
259
|
tests/unit/sql/test_selectable.py
|
|
257
260
|
tests/unit/sql/test_string.py
|
|
258
261
|
tests/unit/sql/sqlite/__init__.py
|
|
262
|
+
tests/unit/sql/sqlite/test_types.py
|
|
259
263
|
tests/unit/sql/sqlite/test_utils.py
|
|
@@ -22,7 +22,7 @@ from datachain.data_storage.sqlite import (
|
|
|
22
22
|
SQLiteWarehouse,
|
|
23
23
|
)
|
|
24
24
|
from datachain.dataset import DatasetRecord
|
|
25
|
-
from datachain.lib.dc import DataChain
|
|
25
|
+
from datachain.lib.dc import DataChain, Sys
|
|
26
26
|
from datachain.query.session import Session
|
|
27
27
|
from datachain.utils import (
|
|
28
28
|
ENV_DATACHAIN_GLOBAL_CONFIG_DIR,
|
|
@@ -701,3 +701,43 @@ def studio_datasets(requests_mock):
|
|
|
701
701
|
]
|
|
702
702
|
|
|
703
703
|
requests_mock.post(f"{STUDIO_URL}/api/datachain/ls-datasets", json=datasets)
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
@pytest.fixture
|
|
707
|
+
def not_random_ds(test_session):
|
|
708
|
+
return DataChain.from_records(
|
|
709
|
+
[
|
|
710
|
+
{"sys__id": 1, "sys__rand": 50, "fib": 0},
|
|
711
|
+
{"sys__id": 2, "sys__rand": 150, "fib": 1},
|
|
712
|
+
{"sys__id": 3, "sys__rand": 250, "fib": 1},
|
|
713
|
+
{"sys__id": 4, "sys__rand": 350, "fib": 2},
|
|
714
|
+
{"sys__id": 5, "sys__rand": 450, "fib": 3},
|
|
715
|
+
{"sys__id": 6, "sys__rand": 550, "fib": 5},
|
|
716
|
+
{"sys__id": 7, "sys__rand": 650, "fib": 8},
|
|
717
|
+
{"sys__id": 8, "sys__rand": 750, "fib": 13},
|
|
718
|
+
{"sys__id": 9, "sys__rand": 850, "fib": 21},
|
|
719
|
+
{"sys__id": 10, "sys__rand": 950, "fib": 34},
|
|
720
|
+
],
|
|
721
|
+
session=test_session,
|
|
722
|
+
schema={"sys": Sys, "fib": int},
|
|
723
|
+
)
|
|
724
|
+
|
|
725
|
+
|
|
726
|
+
@pytest.fixture
|
|
727
|
+
def pseudo_random_ds(test_session):
|
|
728
|
+
return DataChain.from_records(
|
|
729
|
+
[
|
|
730
|
+
{"sys__id": 1, "sys__rand": 1344339883, "fib": 0},
|
|
731
|
+
{"sys__id": 2, "sys__rand": 3901153096, "fib": 1},
|
|
732
|
+
{"sys__id": 3, "sys__rand": 4255991360, "fib": 1},
|
|
733
|
+
{"sys__id": 4, "sys__rand": 2526403609, "fib": 2},
|
|
734
|
+
{"sys__id": 5, "sys__rand": 1871733386, "fib": 3},
|
|
735
|
+
{"sys__id": 6, "sys__rand": 9380910850, "fib": 5},
|
|
736
|
+
{"sys__id": 7, "sys__rand": 2770679740, "fib": 8},
|
|
737
|
+
{"sys__id": 8, "sys__rand": 2538886575, "fib": 13},
|
|
738
|
+
{"sys__id": 9, "sys__rand": 3969542617, "fib": 21},
|
|
739
|
+
{"sys__id": 10, "sys__rand": 7541790992, "fib": 34},
|
|
740
|
+
],
|
|
741
|
+
session=test_session,
|
|
742
|
+
schema={"sys": Sys, "fib": int},
|
|
743
|
+
)
|
|
@@ -459,6 +459,72 @@ def test_order_by_limit(cloud_test_catalog, save, animal_dataset):
|
|
|
459
459
|
]
|
|
460
460
|
|
|
461
461
|
|
|
462
|
+
@pytest.mark.parametrize("save", [True, False])
|
|
463
|
+
def test_limit(cloud_test_catalog, save, animal_dataset):
|
|
464
|
+
catalog = cloud_test_catalog.catalog
|
|
465
|
+
q = (
|
|
466
|
+
DatasetQuery(animal_dataset.name, catalog=catalog)
|
|
467
|
+
.order_by(C("file.path"))
|
|
468
|
+
.limit(2)
|
|
469
|
+
)
|
|
470
|
+
if save:
|
|
471
|
+
ds_name = "animals_cats"
|
|
472
|
+
q.save(ds_name)
|
|
473
|
+
result = DatasetQuery(name=ds_name, catalog=catalog).db_results()
|
|
474
|
+
dataset_record = catalog.get_dataset(ds_name)
|
|
475
|
+
assert dataset_record.status == DatasetStatus.COMPLETE
|
|
476
|
+
else:
|
|
477
|
+
result = q.db_results()
|
|
478
|
+
|
|
479
|
+
assert len(result) == 2
|
|
480
|
+
assert [posixpath.basename(r[3]) for r in result] == ["cat1", "cat2"]
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
@pytest.mark.parametrize("save", [True, False])
|
|
484
|
+
def test_offset_limit(cloud_test_catalog, save, animal_dataset):
|
|
485
|
+
catalog = cloud_test_catalog.catalog
|
|
486
|
+
q = (
|
|
487
|
+
DatasetQuery(animal_dataset.name, catalog=catalog)
|
|
488
|
+
.order_by(C("file.path"))
|
|
489
|
+
.offset(3)
|
|
490
|
+
.limit(2)
|
|
491
|
+
)
|
|
492
|
+
if save:
|
|
493
|
+
ds_name = "animals_cats"
|
|
494
|
+
q.save(ds_name)
|
|
495
|
+
result = DatasetQuery(name=ds_name, catalog=catalog).db_results()
|
|
496
|
+
dataset_record = catalog.get_dataset(ds_name)
|
|
497
|
+
assert dataset_record.status == DatasetStatus.COMPLETE
|
|
498
|
+
else:
|
|
499
|
+
result = q.db_results()
|
|
500
|
+
|
|
501
|
+
assert len(result) == 2
|
|
502
|
+
assert [posixpath.basename(r[3]) for r in result] == ["dog1", "dog2"]
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
@pytest.mark.parametrize("save", [True, False])
|
|
506
|
+
def test_mutate_offset_limit(cloud_test_catalog, save, animal_dataset):
|
|
507
|
+
catalog = cloud_test_catalog.catalog
|
|
508
|
+
q = (
|
|
509
|
+
DatasetQuery(animal_dataset.name, catalog=catalog)
|
|
510
|
+
.order_by(C("file.path"))
|
|
511
|
+
.mutate(size10x=C("file.size") * 10)
|
|
512
|
+
.offset(3)
|
|
513
|
+
.limit(2)
|
|
514
|
+
)
|
|
515
|
+
if save:
|
|
516
|
+
ds_name = "animals_cats"
|
|
517
|
+
q.save(ds_name)
|
|
518
|
+
result = DatasetQuery(name=ds_name, catalog=catalog).db_results()
|
|
519
|
+
dataset_record = catalog.get_dataset(ds_name)
|
|
520
|
+
assert dataset_record.status == DatasetStatus.COMPLETE
|
|
521
|
+
else:
|
|
522
|
+
result = q.db_results()
|
|
523
|
+
|
|
524
|
+
assert len(result) == 2
|
|
525
|
+
assert [posixpath.basename(r[3]) for r in result] == ["dog1", "dog2"]
|
|
526
|
+
|
|
527
|
+
|
|
462
528
|
@pytest.mark.parametrize(
|
|
463
529
|
"cloud_type,version_aware",
|
|
464
530
|
[("s3", True)],
|
|
@@ -6,12 +6,13 @@ import lz4.frame
|
|
|
6
6
|
import pandas as pd
|
|
7
7
|
import pytest
|
|
8
8
|
|
|
9
|
+
from datachain.client.fsspec import Client
|
|
9
10
|
from datachain.config import Config, ConfigLevel
|
|
10
11
|
from datachain.dataset import DatasetStatus
|
|
11
12
|
from datachain.error import DataChainError
|
|
12
13
|
from datachain.utils import STUDIO_URL, JSONSerialize
|
|
13
14
|
from tests.data import ENTRIES
|
|
14
|
-
from tests.utils import assert_row_names, skip_if_not_sqlite
|
|
15
|
+
from tests.utils import assert_row_names, skip_if_not_sqlite, tree_from_path
|
|
15
16
|
|
|
16
17
|
DATASET_UUID = "20f5a2f1-fc9a-4e36-8b91-5a530f289451"
|
|
17
18
|
|
|
@@ -40,10 +41,11 @@ def dog_entries():
|
|
|
40
41
|
|
|
41
42
|
|
|
42
43
|
@pytest.fixture
|
|
43
|
-
def dog_entries_parquet_lz4(dog_entries) -> bytes:
|
|
44
|
+
def dog_entries_parquet_lz4(dog_entries, cloud_test_catalog) -> bytes:
|
|
44
45
|
"""
|
|
45
46
|
Returns dogs entries in lz4 compressed parquet format
|
|
46
47
|
"""
|
|
48
|
+
src_uri = cloud_test_catalog.src_uri
|
|
47
49
|
|
|
48
50
|
def _adapt_row(row):
|
|
49
51
|
"""
|
|
@@ -61,7 +63,7 @@ def dog_entries_parquet_lz4(dog_entries) -> bytes:
|
|
|
61
63
|
adapted["sys__id"] = 1
|
|
62
64
|
adapted["sys__rand"] = 1
|
|
63
65
|
adapted["file__location"] = ""
|
|
64
|
-
adapted["file__source"] =
|
|
66
|
+
adapted["file__source"] = src_uri
|
|
65
67
|
return adapted
|
|
66
68
|
|
|
67
69
|
dog_entries = [_adapt_row(e) for e in dog_entries]
|
|
@@ -141,6 +143,7 @@ def remote_dataset(remote_dataset_version, schema):
|
|
|
141
143
|
|
|
142
144
|
@pytest.mark.parametrize("cloud_type, version_aware", [("s3", False)], indirect=True)
|
|
143
145
|
@pytest.mark.parametrize("dataset_uri", ["ds://dogs@v1", "ds://dogs"])
|
|
146
|
+
@pytest.mark.parametrize("instantiate", [True, False])
|
|
144
147
|
@skip_if_not_sqlite
|
|
145
148
|
def test_pull_dataset_success(
|
|
146
149
|
requests_mock,
|
|
@@ -148,7 +151,10 @@ def test_pull_dataset_success(
|
|
|
148
151
|
remote_dataset,
|
|
149
152
|
dog_entries_parquet_lz4,
|
|
150
153
|
dataset_uri,
|
|
154
|
+
instantiate,
|
|
151
155
|
):
|
|
156
|
+
src_uri = cloud_test_catalog.src_uri
|
|
157
|
+
working_dir = cloud_test_catalog.working_dir
|
|
152
158
|
data_url = (
|
|
153
159
|
"https://studio-blobvault.s3.amazonaws.com/datachain_ds_export_1_0.parquet.lz4"
|
|
154
160
|
)
|
|
@@ -165,9 +171,16 @@ def test_pull_dataset_success(
|
|
|
165
171
|
requests_mock.get(data_url, content=dog_entries_parquet_lz4)
|
|
166
172
|
catalog = cloud_test_catalog.catalog
|
|
167
173
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
174
|
+
dest = None
|
|
175
|
+
|
|
176
|
+
if instantiate:
|
|
177
|
+
dest = working_dir / "data"
|
|
178
|
+
dest.mkdir()
|
|
179
|
+
catalog.pull_dataset(dataset_uri, output=str(dest), no_cp=False)
|
|
180
|
+
else:
|
|
181
|
+
# trying to pull multiple times since that should work as well
|
|
182
|
+
catalog.pull_dataset(dataset_uri, no_cp=True)
|
|
183
|
+
catalog.pull_dataset(dataset_uri, no_cp=True)
|
|
171
184
|
|
|
172
185
|
dataset = catalog.get_dataset("dogs")
|
|
173
186
|
assert dataset.versions_values == [1]
|
|
@@ -196,6 +209,20 @@ def test_pull_dataset_success(
|
|
|
196
209
|
},
|
|
197
210
|
)
|
|
198
211
|
|
|
212
|
+
client = Client.get_client(src_uri, None)
|
|
213
|
+
|
|
214
|
+
if instantiate:
|
|
215
|
+
assert tree_from_path(dest) == {
|
|
216
|
+
f"{client.name}": {
|
|
217
|
+
"dogs": {
|
|
218
|
+
"dog1": "woof",
|
|
219
|
+
"dog2": "arf",
|
|
220
|
+
"dog3": "bark",
|
|
221
|
+
"others": {"dog4": "ruff"},
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
|
|
199
226
|
|
|
200
227
|
@pytest.mark.parametrize("cloud_type, version_aware", [("s3", False)], indirect=True)
|
|
201
228
|
@skip_if_not_sqlite
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from datachain.toolkit import train_test_split
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@pytest.mark.parametrize(
|
|
7
|
+
"weights,expected",
|
|
8
|
+
[
|
|
9
|
+
[[1, 1], [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]],
|
|
10
|
+
[[4, 1], [[1, 2, 3, 4, 5, 6, 7, 8], [9, 10]]],
|
|
11
|
+
[[0.7, 0.2, 0.1], [[1, 2, 3, 4, 5, 6, 7], [8, 9], [10]]],
|
|
12
|
+
],
|
|
13
|
+
)
|
|
14
|
+
def test_train_test_split_not_random(not_random_ds, weights, expected):
|
|
15
|
+
res = train_test_split(not_random_ds, weights)
|
|
16
|
+
assert len(res) == len(expected)
|
|
17
|
+
|
|
18
|
+
for i, dc in enumerate(res):
|
|
19
|
+
assert list(dc.collect("sys.id")) == expected[i]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@pytest.mark.parametrize(
|
|
23
|
+
"weights,expected",
|
|
24
|
+
[
|
|
25
|
+
[[1, 1], [[2, 3, 5], [1, 4, 6, 7, 8, 9, 10]]],
|
|
26
|
+
[[4, 1], [[2, 3, 4, 5, 7, 8, 9], [1, 6, 10]]],
|
|
27
|
+
[[0.7, 0.2, 0.1], [[2, 3, 4, 5, 8, 9], [1, 6, 7], [10]]],
|
|
28
|
+
],
|
|
29
|
+
)
|
|
30
|
+
def test_train_test_split_random(pseudo_random_ds, weights, expected):
|
|
31
|
+
res = train_test_split(pseudo_random_ds, weights)
|
|
32
|
+
assert len(res) == len(expected)
|
|
33
|
+
|
|
34
|
+
for i, dc in enumerate(res):
|
|
35
|
+
assert list(dc.collect("sys.id")) == expected[i]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_train_test_split_errors(not_random_ds):
|
|
39
|
+
with pytest.raises(ValueError, match="Weights should have at least two elements"):
|
|
40
|
+
train_test_split(not_random_ds, [0.5])
|
|
41
|
+
with pytest.raises(ValueError, match="Weights should be non-negative"):
|
|
42
|
+
train_test_split(not_random_ds, [-1, 1])
|