datachain 0.1.10__tar.gz → 0.1.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.1.10/src/datachain.egg-info → datachain-0.1.12}/PKG-INFO +3 -1
- {datachain-0.1.10 → datachain-0.1.12}/examples/clip.py +8 -12
- datachain-0.1.12/examples/json-csv-reader.py +87 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/torch-loader.py +4 -9
- {datachain-0.1.10 → datachain-0.1.12}/pyproject.toml +3 -1
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/_version.py +2 -2
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/catalog/catalog.py +47 -3
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/metastore.py +2 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/dataset.py +5 -7
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/dc.py +150 -7
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/feature.py +0 -10
- datachain-0.1.12/src/datachain/lib/meta_formats.py +164 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/pytorch.py +33 -4
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/signal_schema.py +63 -6
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/dataset.py +1 -1
- {datachain-0.1.10 → datachain-0.1.12/src/datachain.egg-info}/PKG-INFO +3 -1
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain.egg-info/SOURCES.txt +2 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain.egg-info/requires.txt +2 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/conftest.py +1 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_catalog.py +67 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_dataset_query.py +4 -24
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_datachain.py +66 -20
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_datachain_merge.py +23 -9
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_feature_utils.py +2 -2
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_signal_schema.py +41 -6
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_id_generator.py +2 -2
- {datachain-0.1.10 → datachain-0.1.12}/tests/utils.py +15 -0
- {datachain-0.1.10 → datachain-0.1.12}/.cruft.json +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.gitattributes +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.github/codecov.yaml +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.github/dependabot.yml +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.github/workflows/release.yml +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.github/workflows/tests.yml +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.gitignore +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.pre-commit-config.yaml +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/.reuse/dep5 +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/CONTRIBUTING.rst +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/LICENSE +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/LICENSES/Apache-2.0.txt +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/LICENSES/BSD-3-Clause.txt +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/LICENSES/Python-2.0.txt +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/README.rst +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/docs/cv_intro.md +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/docs/udfs.md +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/blip2_image_desc_lib.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/common_sql_functions.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/dir_expansion.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/hf_pipeline.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/llava2_image_desc_lib.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/llm-claude-aggregate-query.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/llm-claude-simple-query.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/llm-claude.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/loader.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/README +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/distance_to_query.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/llm_chat.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/requirements.txt +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/single_query.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/text_loaders.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/notebooks/clip_fine_tuning.ipynb +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/openai_image_desc_lib.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/openimage-detect.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/pose_detection.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/batching.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/image_transformation.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/parallel.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/simple.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/stateful.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/stateful_similarity.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/unstructured-text.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/wds.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/wds_filtered.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/zalando/zalando_clip.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/zalando/zalando_dir_as_class.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/zalando/zalando_splits_and_classes_ds.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/examples/zalando/zalando_splits_and_classes_output.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/noxfile.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/setup.cfg +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/__main__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/asyn.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/cache.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/cli.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/cli_utils.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/azure.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/gcs.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/local.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/s3.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/config.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/error.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/cached_stream.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/claude.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/feature_registry.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/feature_utils.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/file.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/gpt4_vision.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/hf_image_to_text.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/hf_pipeline.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/image.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/image_transform.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/iptc_exif_xmp.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/parquet.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/reader.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/settings.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/text.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/udf.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/unstructured.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/utils.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/listing.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/node.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/progress.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/py.typed +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/batch.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/builtins.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/params.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/schema.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/session.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/udf.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/remote/studio.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/types.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/utils.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/storage.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain/utils.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/data.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/func/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_client.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_datasets.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_ls.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_pull.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_pytorch.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_query.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/scripts/feature_class.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/scripts/name_len_normal.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/test_cli_e2e.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/test_query_e2e.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_cached_stream.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_parquet.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_reader.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_asyn.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_cache.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_catalog.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_client.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_dataset.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_listing.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_metastore.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_query_params.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_serializer.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_session.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_storage.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_udf.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_utils.py +0 -0
- {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_warehouse.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.12
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -36,6 +36,8 @@ Requires-Dist: multiprocess==0.70.16
|
|
|
36
36
|
Requires-Dist: dill==0.3.8
|
|
37
37
|
Requires-Dist: ujson>=5.9.0
|
|
38
38
|
Requires-Dist: pydantic<3,>=2
|
|
39
|
+
Requires-Dist: jmespath>=1.0
|
|
40
|
+
Requires-Dist: datamodel-code-generator>=0.25
|
|
39
41
|
Provides-Extra: cv
|
|
40
42
|
Requires-Dist: Pillow<11,>=10.0.0; extra == "cv"
|
|
41
43
|
Requires-Dist: torch>=2.1.0; extra == "cv"
|
|
@@ -4,26 +4,22 @@ from torch.nn.functional import cosine_similarity
|
|
|
4
4
|
from torch.utils.data import DataLoader
|
|
5
5
|
|
|
6
6
|
from datachain.lib.dc import C, DataChain
|
|
7
|
-
from datachain.lib.image import ImageReader
|
|
8
|
-
from datachain.lib.text import TextReader
|
|
9
|
-
from datachain.sql.functions import path
|
|
10
7
|
|
|
11
8
|
source = "gs://dvcx-50k-laion-files/000000/00000000*"
|
|
12
9
|
|
|
13
10
|
|
|
14
11
|
def create_dataset():
|
|
15
12
|
imgs = (
|
|
16
|
-
DataChain(source)
|
|
13
|
+
DataChain.from_storage(source, type="image")
|
|
17
14
|
.filter(C.name.glob("*.jpg"))
|
|
18
|
-
.
|
|
15
|
+
.map(stem=lambda name: name.split(".")[0], output=str)
|
|
19
16
|
)
|
|
20
17
|
captions = (
|
|
21
|
-
DataChain.from_storage(source,
|
|
18
|
+
DataChain.from_storage(source, type="text")
|
|
22
19
|
.filter(C.name.glob("*.txt"))
|
|
23
|
-
.
|
|
24
|
-
.map(lambda file: file.get_value(), output={"caption": str})
|
|
20
|
+
.map(stem=lambda name: name.split(".")[0], output=str)
|
|
25
21
|
)
|
|
26
|
-
return imgs.
|
|
22
|
+
return imgs.merge(captions, on="stem")
|
|
27
23
|
|
|
28
24
|
|
|
29
25
|
if __name__ == "__main__":
|
|
@@ -34,9 +30,9 @@ if __name__ == "__main__":
|
|
|
34
30
|
)
|
|
35
31
|
tokenizer = open_clip.get_tokenizer("ViT-B-32")
|
|
36
32
|
|
|
37
|
-
ds = q.to_pytorch(
|
|
38
|
-
|
|
39
|
-
|
|
33
|
+
ds = q.select("file", "right_file").to_pytorch(
|
|
34
|
+
transform=preprocess,
|
|
35
|
+
tokenizer=tokenizer,
|
|
40
36
|
)
|
|
41
37
|
loader = DataLoader(ds, batch_size=16)
|
|
42
38
|
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
#
|
|
2
|
+
# TODO:
|
|
3
|
+
# refactor lib/meta_formats/read_schema into a DataChain method
|
|
4
|
+
#
|
|
5
|
+
# ER: add support for Optional fields in read_schema()
|
|
6
|
+
# ER: add support for headless CSV within static schema only
|
|
7
|
+
# ER: fix the bug in datamodel-codegen failing to recognize csv float and int columns
|
|
8
|
+
#
|
|
9
|
+
# Open issues:
|
|
10
|
+
# 1. A single filename cannot be passed as schema source (#1563)
|
|
11
|
+
# 2. Need syntax like "file.open(encoding='utf-8')" to avoid "type=text" (#1614)
|
|
12
|
+
# 3. Need syntax like "datachain.collate(func -> Any)" (#1615)
|
|
13
|
+
# 4. "Feature" does not tolerate creating a class twice (#1617)
|
|
14
|
+
# 5. Unsure how to deal with 'folder' pseudo-files in cloud systems(#1618)
|
|
15
|
+
# 6. There should be exec() method to force-run the existing chain (#1616)
|
|
16
|
+
# 7. data-model-codegenerator: datamodel-codegen reports all CSV fields as 'str'.
|
|
17
|
+
# 8. from_json and from_csv methods do not filter empty files from AWS
|
|
18
|
+
# dependencies:
|
|
19
|
+
# pip install datamodel-code-generator
|
|
20
|
+
# pip install jmespath
|
|
21
|
+
|
|
22
|
+
from typing import Optional
|
|
23
|
+
|
|
24
|
+
from pydantic import BaseModel
|
|
25
|
+
|
|
26
|
+
from datachain.lib.dc import C, DataChain
|
|
27
|
+
from datachain.lib.feature_utils import pydantic_to_feature
|
|
28
|
+
from datachain.lib.meta_formats import read_schema
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Sample model for static JSON model
|
|
32
|
+
class LicenseModel(BaseModel):
|
|
33
|
+
url: str
|
|
34
|
+
id: int
|
|
35
|
+
name: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
LicenseFeature = pydantic_to_feature(LicenseModel)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Sample model for static CSV model
|
|
42
|
+
class ChatDialog(BaseModel):
|
|
43
|
+
id: Optional[int] = None
|
|
44
|
+
count: Optional[int] = None
|
|
45
|
+
sender: Optional[str] = None
|
|
46
|
+
text: Optional[str] = None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
ChatFeature = pydantic_to_feature(ChatDialog)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def main():
|
|
53
|
+
uri = "gs://datachain-demo/coco2017/annotations_captions/"
|
|
54
|
+
|
|
55
|
+
print("Reading schema from the root COCO annotation")
|
|
56
|
+
chain = (
|
|
57
|
+
DataChain.from_storage(uri)
|
|
58
|
+
.filter(C.name.glob("*.json"))
|
|
59
|
+
.limit(1)
|
|
60
|
+
.map( # dummy column created (#1615)
|
|
61
|
+
meta_schema=lambda file: read_schema(file, data_type="json"), output=str
|
|
62
|
+
)
|
|
63
|
+
)
|
|
64
|
+
# dummy executor (#1616)
|
|
65
|
+
chain.save()
|
|
66
|
+
|
|
67
|
+
print("static JSON schema test parsing 7 objects")
|
|
68
|
+
static_json_ds = DataChain.from_json(uri, jmespath="licenses", spec=LicenseFeature)
|
|
69
|
+
print(static_json_ds.to_pandas())
|
|
70
|
+
|
|
71
|
+
print("dynamic JSON schema test parsing 5K objects")
|
|
72
|
+
dynamic_json_ds = DataChain.from_json(uri, jmespath="images", show_schema=True)
|
|
73
|
+
print(dynamic_json_ds.to_pandas())
|
|
74
|
+
|
|
75
|
+
uri = "gs://datachain-demo/chatbot-csv/"
|
|
76
|
+
print("static CSV with header schema test parsing 3.5K objects")
|
|
77
|
+
static_csv_ds = DataChain.from_csv(uri, spec=ChatFeature)
|
|
78
|
+
print(static_csv_ds.to_pandas())
|
|
79
|
+
|
|
80
|
+
uri = "gs://datachain-demo/laion-aesthetics-csv"
|
|
81
|
+
print("dynamic CSV with header schema test parsing 3M objects")
|
|
82
|
+
dynamic_csv_ds = DataChain.from_csv(uri, show_schema=True)
|
|
83
|
+
print(dynamic_csv_ds.to_pandas())
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
if __name__ == "__main__":
|
|
87
|
+
main()
|
|
@@ -6,8 +6,7 @@ from torch.utils.data import DataLoader
|
|
|
6
6
|
from torchvision.transforms import v2
|
|
7
7
|
|
|
8
8
|
from datachain.lib.dc import C, DataChain
|
|
9
|
-
from datachain.lib.
|
|
10
|
-
from datachain.lib.reader import LabelReader
|
|
9
|
+
from datachain.lib.pytorch import label_to_int
|
|
11
10
|
|
|
12
11
|
STORAGE = "gs://dvcx-datalakes/dogs-and-cats/"
|
|
13
12
|
|
|
@@ -45,17 +44,13 @@ class CNN(nn.Module):
|
|
|
45
44
|
|
|
46
45
|
if __name__ == "__main__":
|
|
47
46
|
ds = (
|
|
48
|
-
DataChain(STORAGE)
|
|
47
|
+
DataChain.from_storage(STORAGE, type="image")
|
|
49
48
|
.filter(C.name.glob("*.jpg"))
|
|
50
|
-
.map(lambda name: (name[:3],), output=
|
|
49
|
+
.map(label=lambda name: label_to_int(name[:3], CLASSES), output=int)
|
|
51
50
|
)
|
|
52
51
|
|
|
53
52
|
train_loader = DataLoader(
|
|
54
|
-
ds.to_pytorch(
|
|
55
|
-
ImageReader(),
|
|
56
|
-
LabelReader("label", classes=CLASSES),
|
|
57
|
-
transform=transform,
|
|
58
|
-
),
|
|
53
|
+
ds.to_pytorch(transform=transform),
|
|
59
54
|
batch_size=16,
|
|
60
55
|
num_workers=2,
|
|
61
56
|
)
|
|
@@ -1580,10 +1580,54 @@ class Catalog:
|
|
|
1580
1580
|
|
|
1581
1581
|
return dst
|
|
1582
1582
|
|
|
1583
|
-
def
|
|
1583
|
+
def get_file_signals(
|
|
1584
|
+
self, dataset_name: str, dataset_version: int, row: RowDict
|
|
1585
|
+
) -> Optional[dict]:
|
|
1586
|
+
"""
|
|
1587
|
+
Function that returns file signals from dataset row.
|
|
1588
|
+
Note that signal names are without prefix, so if there was 'laion__file__source'
|
|
1589
|
+
in original row, result will have just 'source'
|
|
1590
|
+
Example output:
|
|
1591
|
+
{
|
|
1592
|
+
"source": "s3://ldb-public",
|
|
1593
|
+
"parent": "animals/dogs",
|
|
1594
|
+
"name": "dog.jpg",
|
|
1595
|
+
...
|
|
1596
|
+
}
|
|
1597
|
+
"""
|
|
1598
|
+
from datachain.lib.signal_schema import SignalSchema
|
|
1599
|
+
|
|
1600
|
+
version = self.get_dataset(dataset_name).get_version(dataset_version)
|
|
1601
|
+
|
|
1602
|
+
file_signals_values = SignalSchema.deserialize(
|
|
1603
|
+
version.feature_schema
|
|
1604
|
+
).get_file_signals_values(row)
|
|
1605
|
+
if not file_signals_values:
|
|
1606
|
+
return None
|
|
1607
|
+
|
|
1608
|
+
# there can be multiple file signals in a schema, but taking the first
|
|
1609
|
+
# one for now. In future we might add ability to choose from which one
|
|
1610
|
+
# to open object
|
|
1611
|
+
return next(iter(file_signals_values.values()))
|
|
1612
|
+
|
|
1613
|
+
def open_object(
|
|
1614
|
+
self,
|
|
1615
|
+
dataset_name: str,
|
|
1616
|
+
dataset_version: int,
|
|
1617
|
+
row: RowDict,
|
|
1618
|
+
use_cache: bool = True,
|
|
1619
|
+
**config: Any,
|
|
1620
|
+
):
|
|
1621
|
+
file_signals = self.get_file_signals(dataset_name, dataset_version, row)
|
|
1622
|
+
if not file_signals:
|
|
1623
|
+
raise RuntimeError("Cannot open object without file signals")
|
|
1624
|
+
|
|
1584
1625
|
config = config or self.client_config
|
|
1585
|
-
client = self.get_client(
|
|
1586
|
-
return client.open_object(
|
|
1626
|
+
client = self.get_client(file_signals["source"], **config)
|
|
1627
|
+
return client.open_object(
|
|
1628
|
+
self._get_row_uid(file_signals), # type: ignore [arg-type]
|
|
1629
|
+
use_cache=use_cache,
|
|
1630
|
+
)
|
|
1587
1631
|
|
|
1588
1632
|
def _get_row_uid(self, row: RowDict) -> UniqueId:
|
|
1589
1633
|
return UniqueId(
|
|
@@ -1142,6 +1142,8 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1142
1142
|
if field == "schema":
|
|
1143
1143
|
dataset_version.update(**{field: DatasetRecord.parse_schema(value)})
|
|
1144
1144
|
values[field] = json.dumps(value) if value else None
|
|
1145
|
+
elif field == "feature_schema":
|
|
1146
|
+
values[field] = json.dumps(value) if value else None
|
|
1145
1147
|
elif field == "preview" and isinstance(value, list):
|
|
1146
1148
|
values[field] = json.dumps(value, cls=JSONSerialize)
|
|
1147
1149
|
else:
|
|
@@ -157,7 +157,7 @@ class DatasetVersion:
|
|
|
157
157
|
dataset_id: int
|
|
158
158
|
version: int
|
|
159
159
|
status: int
|
|
160
|
-
feature_schema:
|
|
160
|
+
feature_schema: dict
|
|
161
161
|
created_at: datetime
|
|
162
162
|
finished_at: Optional[datetime]
|
|
163
163
|
error_message: str
|
|
@@ -199,7 +199,7 @@ class DatasetVersion:
|
|
|
199
199
|
dataset_id,
|
|
200
200
|
version,
|
|
201
201
|
status,
|
|
202
|
-
feature_schema,
|
|
202
|
+
json.loads(feature_schema) if feature_schema else {},
|
|
203
203
|
created_at,
|
|
204
204
|
finished_at,
|
|
205
205
|
error_message,
|
|
@@ -263,9 +263,9 @@ class DatasetRecord:
|
|
|
263
263
|
labels: list[str]
|
|
264
264
|
shadow: bool
|
|
265
265
|
schema: dict[str, Union[SQLType, type[SQLType]]]
|
|
266
|
+
feature_schema: dict
|
|
266
267
|
versions: list[DatasetVersion]
|
|
267
268
|
status: int = DatasetStatus.CREATED
|
|
268
|
-
feature_schema: Optional[dict] = None
|
|
269
269
|
created_at: Optional[datetime] = None
|
|
270
270
|
finished_at: Optional[datetime] = None
|
|
271
271
|
error_message: str = ""
|
|
@@ -320,8 +320,6 @@ class DatasetRecord:
|
|
|
320
320
|
version_job_id: Optional[str] = None,
|
|
321
321
|
version_is_job_result: bool = False,
|
|
322
322
|
) -> "DatasetRecord":
|
|
323
|
-
fr_schema = json.loads(feature_schema) if feature_schema else {}
|
|
324
|
-
|
|
325
323
|
labels_lst: list[str] = json.loads(labels) if labels else []
|
|
326
324
|
schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
|
|
327
325
|
version_schema_dct: dict[str, str] = (
|
|
@@ -333,7 +331,7 @@ class DatasetRecord:
|
|
|
333
331
|
version_dataset_id,
|
|
334
332
|
version,
|
|
335
333
|
version_status,
|
|
336
|
-
|
|
334
|
+
version_feature_schema,
|
|
337
335
|
version_created_at,
|
|
338
336
|
version_finished_at,
|
|
339
337
|
version_error_message,
|
|
@@ -356,9 +354,9 @@ class DatasetRecord:
|
|
|
356
354
|
labels_lst,
|
|
357
355
|
bool(shadow),
|
|
358
356
|
cls.parse_schema(schema_dct), # type: ignore[arg-type]
|
|
357
|
+
json.loads(feature_schema) if feature_schema else {},
|
|
359
358
|
[dataset_version],
|
|
360
359
|
status,
|
|
361
|
-
fr_schema,
|
|
362
360
|
created_at,
|
|
363
361
|
finished_at,
|
|
364
362
|
error_message,
|
|
@@ -6,6 +6,7 @@ import sqlalchemy
|
|
|
6
6
|
from datachain.lib.feature import Feature, FeatureType
|
|
7
7
|
from datachain.lib.feature_utils import features_to_tuples
|
|
8
8
|
from datachain.lib.file import File, get_file
|
|
9
|
+
from datachain.lib.meta_formats import read_meta
|
|
9
10
|
from datachain.lib.settings import Settings
|
|
10
11
|
from datachain.lib.signal_schema import SignalSchema
|
|
11
12
|
from datachain.lib.udf import (
|
|
@@ -219,6 +220,89 @@ class DataChain(DatasetQuery):
|
|
|
219
220
|
"""
|
|
220
221
|
return DataChain(name=name, version=version)
|
|
221
222
|
|
|
223
|
+
@classmethod
|
|
224
|
+
def from_csv(
|
|
225
|
+
cls,
|
|
226
|
+
path,
|
|
227
|
+
type: Literal["binary", "text", "image"] = "text",
|
|
228
|
+
anon: bool = False,
|
|
229
|
+
spec: Optional[FeatureType] = None,
|
|
230
|
+
schema_from: Optional[str] = "auto",
|
|
231
|
+
show_schema: Optional[bool] = False,
|
|
232
|
+
) -> "DataChain":
|
|
233
|
+
"""Get data from CSV. It returns the chain itself.
|
|
234
|
+
|
|
235
|
+
Parameters
|
|
236
|
+
----------
|
|
237
|
+
path : storage URI with directory. URI must start with storage prefix such
|
|
238
|
+
as `s3://`, `gs://`, `az://` or "file:///"
|
|
239
|
+
type : read file as "binary", "text", or "image" data. Default is "text".
|
|
240
|
+
anon : use anonymous mode to access the storage.
|
|
241
|
+
spec : optional Data Model
|
|
242
|
+
schema_from : path to sample to infer spec from
|
|
243
|
+
show_schema : print auto-generated schema
|
|
244
|
+
|
|
245
|
+
Examples
|
|
246
|
+
--------
|
|
247
|
+
|
|
248
|
+
>>> chain = DataChain.from_csv("gs://csv")
|
|
249
|
+
"""
|
|
250
|
+
if schema_from == "auto":
|
|
251
|
+
schema_from = path
|
|
252
|
+
|
|
253
|
+
chain = DataChain.from_storage(path=path, type=type, anon=anon)
|
|
254
|
+
return chain.gen(
|
|
255
|
+
csv=read_meta(
|
|
256
|
+
schema_from=schema_from,
|
|
257
|
+
meta_type="csv",
|
|
258
|
+
spec=spec,
|
|
259
|
+
show_schema=show_schema,
|
|
260
|
+
)
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
@classmethod
|
|
264
|
+
def from_json(
|
|
265
|
+
cls,
|
|
266
|
+
path,
|
|
267
|
+
type: Literal["binary", "text", "image"] = "text",
|
|
268
|
+
anon: bool = False,
|
|
269
|
+
spec: Optional[FeatureType] = None,
|
|
270
|
+
schema_from: Optional[str] = "auto",
|
|
271
|
+
jmespath: Optional[str] = None,
|
|
272
|
+
show_schema: Optional[bool] = False,
|
|
273
|
+
) -> "DataChain":
|
|
274
|
+
"""Get data from JSON. It returns the chain itself.
|
|
275
|
+
|
|
276
|
+
Parameters
|
|
277
|
+
----------
|
|
278
|
+
path : storage URI with directory. URI must start with storage prefix such
|
|
279
|
+
as `s3://`, `gs://`, `az://` or "file:///"
|
|
280
|
+
type : read file as "binary", "text", or "image" data. Default is "text".
|
|
281
|
+
anon : use anonymous mode to access the storage.
|
|
282
|
+
spec : optional Data Model
|
|
283
|
+
schema_from : path to sample to infer spec from
|
|
284
|
+
show_schema : print auto-generated schema
|
|
285
|
+
jmespath : JMESPATH expression to reduce JSON
|
|
286
|
+
name : return object name
|
|
287
|
+
Examples
|
|
288
|
+
--------
|
|
289
|
+
|
|
290
|
+
>>> chain = DataChain.from_json("gs://json")
|
|
291
|
+
"""
|
|
292
|
+
if schema_from == "auto":
|
|
293
|
+
schema_from = path
|
|
294
|
+
|
|
295
|
+
chain = DataChain.from_storage(path=path, type=type, anon=anon)
|
|
296
|
+
return chain.gen(
|
|
297
|
+
json=read_meta(
|
|
298
|
+
schema_from=schema_from,
|
|
299
|
+
meta_type="json",
|
|
300
|
+
spec=spec,
|
|
301
|
+
show_schema=show_schema,
|
|
302
|
+
jmespath=jmespath,
|
|
303
|
+
)
|
|
304
|
+
)
|
|
305
|
+
|
|
222
306
|
def save( # type: ignore[override]
|
|
223
307
|
self, name: Optional[str] = None, version: Optional[int] = None
|
|
224
308
|
) -> "DataChain":
|
|
@@ -408,16 +492,43 @@ class DataChain(DatasetQuery):
|
|
|
408
492
|
chain.signals_schema = new_schema
|
|
409
493
|
return chain
|
|
410
494
|
|
|
411
|
-
def get_values(self) -> Iterator[
|
|
412
|
-
"""Iterate over rows, getting feature values and applying reader calls.
|
|
413
|
-
|
|
414
|
-
|
|
495
|
+
def get_values(self, *cols: str) -> Iterator[list]:
|
|
496
|
+
"""Iterate over rows, getting feature values and applying reader calls.
|
|
497
|
+
If columns are specified - limit them to specified columns.
|
|
498
|
+
"""
|
|
499
|
+
for features in self.iterate(*cols):
|
|
500
|
+
yield [fr.get_value() if isinstance(fr, Feature) else fr for fr in features] # type: ignore[union-attr,call-arg]
|
|
501
|
+
|
|
502
|
+
def get_one_value(self, col: str) -> Iterator:
|
|
503
|
+
for item in self.get_values(col):
|
|
504
|
+
yield item[0]
|
|
415
505
|
|
|
416
|
-
def iterate(self) -> Iterator[
|
|
417
|
-
|
|
506
|
+
def iterate(self, *cols: str) -> Iterator[list[FeatureType]]:
|
|
507
|
+
"""Iterate over rows. If columns are specified - limit them to specified
|
|
508
|
+
columns.
|
|
509
|
+
"""
|
|
510
|
+
chain = self.select(*cols) if cols else self
|
|
511
|
+
|
|
512
|
+
db_signals = chain.signals_schema.db_signals()
|
|
418
513
|
with super().select(*db_signals).as_iterable() as rows_iter:
|
|
419
514
|
for row in rows_iter:
|
|
420
|
-
yield
|
|
515
|
+
yield chain.signals_schema.row_to_features(row, chain.session.catalog)
|
|
516
|
+
|
|
517
|
+
def iterate_one(self, col: str) -> Iterator[FeatureType]:
|
|
518
|
+
for item in self.iterate(col):
|
|
519
|
+
yield item[0]
|
|
520
|
+
|
|
521
|
+
def collect(self, *cols: str) -> list[list[FeatureType]]:
|
|
522
|
+
return list(self.iterate(*cols))
|
|
523
|
+
|
|
524
|
+
def collect_one(self, col: str) -> list[FeatureType]:
|
|
525
|
+
return list(self.iterate_one(col))
|
|
526
|
+
|
|
527
|
+
def collect_values(self, *cols: str) -> list[list]:
|
|
528
|
+
return list(self.get_values(*cols))
|
|
529
|
+
|
|
530
|
+
def collect_one_value(self, col: str) -> list:
|
|
531
|
+
return list(self.get_one_value(col))
|
|
421
532
|
|
|
422
533
|
def to_pytorch(self, **kwargs):
|
|
423
534
|
"""Convert to pytorch dataset format."""
|
|
@@ -607,3 +718,35 @@ class DataChain(DatasetQuery):
|
|
|
607
718
|
|
|
608
719
|
def max(self, fr: FeatureType): # type: ignore[override]
|
|
609
720
|
return self._extend_features("max", fr)
|
|
721
|
+
|
|
722
|
+
@detach
|
|
723
|
+
def gen_random(self) -> "DataChain":
|
|
724
|
+
from random import getrandbits
|
|
725
|
+
|
|
726
|
+
from datachain.data_storage.warehouse import RANDOM_BITS
|
|
727
|
+
|
|
728
|
+
if "random" not in self.signals_schema.values:
|
|
729
|
+
chain = self.map(random=lambda: getrandbits(RANDOM_BITS), output=int).save()
|
|
730
|
+
return chain.select_except("random")
|
|
731
|
+
|
|
732
|
+
return self
|
|
733
|
+
|
|
734
|
+
@detach
|
|
735
|
+
def shuffle(self) -> "DataChain":
|
|
736
|
+
"""Return results in deterministic random order."""
|
|
737
|
+
chain = self.gen_random()
|
|
738
|
+
return DatasetQuery.shuffle(chain)
|
|
739
|
+
|
|
740
|
+
@detach
|
|
741
|
+
def chunk(self, index: int, total: int) -> "DataChain":
|
|
742
|
+
"""Split a query into smaller chunks for e.g. parallelization.
|
|
743
|
+
Example:
|
|
744
|
+
>>> dc = DataChain(...)
|
|
745
|
+
>>> chunk_1 = dc.chunk(0, 2)
|
|
746
|
+
>>> chunk_2 = dc.chunk(1, 2)
|
|
747
|
+
Note:
|
|
748
|
+
Bear in mind that `index` is 0-indexed but `total` isn't.
|
|
749
|
+
Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
|
|
750
|
+
"""
|
|
751
|
+
chain = self.gen_random()
|
|
752
|
+
return DatasetQuery.chunk(chain, index, total)
|
|
@@ -78,16 +78,6 @@ DATACHAIN_TO_TYPE = {
|
|
|
78
78
|
JSON: dict,
|
|
79
79
|
}
|
|
80
80
|
|
|
81
|
-
NAMES_TO_TYPES = {
|
|
82
|
-
"int": int,
|
|
83
|
-
"str": str,
|
|
84
|
-
"float": float,
|
|
85
|
-
"bool": bool,
|
|
86
|
-
"list": list,
|
|
87
|
-
"dict": dict,
|
|
88
|
-
"bytes": bytes,
|
|
89
|
-
"datetime": datetime,
|
|
90
|
-
}
|
|
91
81
|
|
|
92
82
|
NUMPY_TO_DATACHAIN = {
|
|
93
83
|
np.dtype("int8"): Int,
|