datachain 0.3.8__tar.gz → 0.3.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- {datachain-0.3.8 → datachain-0.3.9}/.github/workflows/tests.yml +1 -1
- {datachain-0.3.8 → datachain-0.3.9}/.pre-commit-config.yaml +1 -1
- {datachain-0.3.8/src/datachain.egg-info → datachain-0.3.9}/PKG-INFO +12 -13
- {datachain-0.3.8 → datachain-0.3.9}/README.rst +11 -12
- {datachain-0.3.8 → datachain-0.3.9}/examples/llm_and_nlp/unstructured-text.py +1 -1
- {datachain-0.3.8 → datachain-0.3.9}/examples/multimodal/wds_filtered.py +1 -3
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/catalog/catalog.py +2 -11
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/arrow.py +1 -1
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/dc.py +41 -10
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/webdataset.py +1 -1
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/dataset.py +14 -6
- {datachain-0.3.8 → datachain-0.3.9/src/datachain.egg-info}/PKG-INFO +12 -13
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/test_catalog.py +30 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_datachain.py +35 -0
- {datachain-0.3.8 → datachain-0.3.9}/.cruft.json +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/.gitattributes +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/.github/codecov.yaml +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/.github/dependabot.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/.github/workflows/release.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/.gitignore +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/LICENSE +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/docs/assets/datachain.png +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/docs/index.md +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/docs/references/datachain.md +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/docs/references/datatype.md +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/docs/references/file.md +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/docs/references/index.md +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/docs/references/sql.md +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/docs/references/torch.md +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/docs/references/udf.md +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/mkdocs.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/noxfile.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/pyproject.toml +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/setup.cfg +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/__main__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/asyn.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/cache.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/cli.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/client/local.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/config.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/dataset.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/error.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/job.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/file.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/hf.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/listing.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/listing.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/node.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/progress.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/py.typed +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/builtins.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/params.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/schema.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/session.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/storage.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain/utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/conftest.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/data.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/examples/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/test_client.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/test_datachain.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/test_datasets.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/test_listing.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/test_ls.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/test_pull.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/func/test_query.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_client.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_session.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_udf.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.8 → datachain-0.3.9}/tests/utils.py +0 -0
{datachain-0.3.8/src/datachain.egg-info → datachain-0.3.9}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.8
+Version: 0.3.9
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -115,31 +115,30 @@ AI 🔗 DataChain
 
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
 
 Key Features
 ============
 
 📂 **Storage as a Source of Truth.**
-  - Process unstructured data without redundant copies
+  - Process unstructured data without redundant copies from S3, GCP, Azure, and local
     file systems.
-  - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-
+  - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+  - Unite files and metadata together into persistent, versioned, columnar datasets.
 
 🐍 **Python-friendly data pipelines.**
   - Operate on Python objects and object fields.
-  - Built-in parallelization and out-of-memory compute without
-    Spark jobs.
+  - Built-in parallelization and out-of-memory compute without SQL or Spark.
 
 🧠 **Data Enrichment and Processing.**
-  - Generate metadata
-  - Filter, join, and group by
-  - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+  - Generate metadata using local AI models and LLM APIs.
+  - Filter, join, and group by metadata. Search by vector embeddings.
+  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
 
 🚀 **Efficiency.**
   - Parallelization, out-of-memory workloads and data caching.
   - Vectorized operations on Python object fields: sum, count, avg, etc.
-
+  - Optimized vector search.
 
 
 Quick Start
@@ -164,7 +163,7 @@ where each image has a matching JSON file like `cat.1009.json`:
     "inference": {"class": "dog", "confidence": 0.68}
 }
 
-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:
 
 
 .. code:: py
@@ -234,7 +233,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================
 
-LLMs can work as
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai
 
{datachain-0.3.8 → datachain-0.3.9}/README.rst

@@ -18,31 +18,30 @@ AI 🔗 DataChain
 
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
 
 Key Features
 ============
 
 📂 **Storage as a Source of Truth.**
-  - Process unstructured data without redundant copies
+  - Process unstructured data without redundant copies from S3, GCP, Azure, and local
    file systems.
-  - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-
+  - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+  - Unite files and metadata together into persistent, versioned, columnar datasets.
 
 🐍 **Python-friendly data pipelines.**
   - Operate on Python objects and object fields.
-  - Built-in parallelization and out-of-memory compute without
-    Spark jobs.
+  - Built-in parallelization and out-of-memory compute without SQL or Spark.
 
 🧠 **Data Enrichment and Processing.**
-  - Generate metadata
-  - Filter, join, and group by
-  - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+  - Generate metadata using local AI models and LLM APIs.
+  - Filter, join, and group by metadata. Search by vector embeddings.
+  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
 
 🚀 **Efficiency.**
   - Parallelization, out-of-memory workloads and data caching.
   - Vectorized operations on Python object fields: sum, count, avg, etc.
-
+  - Optimized vector search.
 
 
 Quick Start
@@ -67,7 +66,7 @@ where each image has a matching JSON file like `cat.1009.json`:
     "inference": {"class": "dog", "confidence": 0.68}
 }
 
-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:
 
 
 .. code:: py
@@ -137,7 +136,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================
 
-LLMs can work as
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai
 
{datachain-0.3.8 → datachain-0.3.9}/examples/multimodal/wds_filtered.py

@@ -1,13 +1,11 @@
 import datachain.error
 from datachain import C, DataChain
-from datachain.lib.model_store import ModelStore
 from datachain.lib.webdataset import process_webdataset
-from datachain.lib.webdataset_laion import
+from datachain.lib.webdataset_laion import WDSLaion
 from datachain.sql import literal
 from datachain.sql.functions import array, greatest, least, string
 
 name = "wds"
-ModelStore.register(LaionMeta)
 try:
     wds = DataChain.from_dataset(name=name)
 except datachain.error.DatasetNotFoundError:
{datachain-0.3.8 → datachain-0.3.9}/src/datachain/catalog/catalog.py

@@ -1560,17 +1560,8 @@ class Catalog:
         version = self.get_dataset(dataset_name).get_version(dataset_version)
 
         file_signals_values = {}
-        file_schemas = {}
-        # TODO: To remove after we properly fix deserialization
-        for signal, type_name in version.feature_schema.items():
-            from datachain.lib.model_store import ModelStore
 
-
-            fr = ModelStore.get(type_name_parsed, v)
-            if fr and issubclass(fr, File):
-                file_schemas[signal] = type_name
-
-        schema = SignalSchema.deserialize(file_schemas)
+        schema = SignalSchema.deserialize(version.feature_schema)
         for file_signals in schema.get_signals(File):
             prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
             file_signals_values[file_signals] = {
@@ -1916,7 +1907,7 @@ class Catalog:
         """
         from datachain.query.dataset import ExecutionResult
 
-        feature_file = tempfile.NamedTemporaryFile(
+        feature_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
            dir=os.getcwd(), suffix=".py", delete=False
        )
        _, feature_module = os.path.split(feature_file.name)
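With this change `get_file_signals` deserializes the stored feature schema as a whole (including the `_custom_types` entry exercised by the new test in tests/func/test_catalog.py below) and then walks the `File` signals it contains, stripping each signal's column prefix from the flattened row. A standalone sketch of that prefix handling, assuming the `__` delimiter visible in the new test's row keys (the helper is hypothetical, not part of the Catalog API):

    # Hypothetical helper: flattened rows store nested File fields as
    # "<signal>__<field>" columns, e.g. "f1__source".
    def file_signal_values(row: dict, signal: str, delimiter: str = "__") -> dict:
        prefix = signal.replace(".", delimiter) + delimiter
        return {k[len(prefix):]: v for k, v in row.items() if k.startswith(prefix)}

    row = {
        "name": "Jon",
        "f1__source": "s3://first_bucket",
        "f1__name": "image1.jpg",
    }
    print(file_signal_values(row, "f1"))  # {'source': 's3://first_bucket', 'name': 'image1.jpg'}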
{datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/arrow.py

@@ -131,7 +131,7 @@ def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
 
 
 def _nrows_file(file: File, nrows: int) -> str:
-    tf = NamedTemporaryFile(delete=False)
+    tf = NamedTemporaryFile(delete=False)  # noqa: SIM115
     with file.open(mode="r") as reader:
         with open(tf.name, "a") as writer:
             for row, line in enumerate(reader):
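The `# noqa: SIM115` markers added here and in catalog.py and webdataset.py silence ruff's SIM115 rule ("use a context manager for opening files"). The rule is suppressed because these file objects are deliberately kept open or handed off beyond the enclosing block, as in this illustrative sketch (not datachain code):

    from tempfile import NamedTemporaryFile

    def make_scratch_file() -> str:
        # The file must outlive this function, so it cannot be wrapped in `with`.
        tf = NamedTemporaryFile(delete=False)  # noqa: SIM115
        tf.write(b"scratch data")
        tf.close()
        return tf.name  # the caller removes the file when done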
{datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/dc.py

@@ -1153,17 +1153,35 @@ class DataChain(DatasetQuery):
         self,
         other: "DataChain",
         on: Optional[Union[str, Sequence[str]]] = None,
+        right_on: Optional[Union[str, Sequence[str]]] = None,
     ) -> "Self":
         """Remove rows that appear in another chain.
 
         Parameters:
             other: chain whose rows will be removed from `self`
-            on: columns to consider for determining row equality
-                defaults to all common columns
+            on: columns to consider for determining row equality in `self`.
+                If unspecified, defaults to all common columns
+                between `self` and `other`.
+            right_on: columns to consider for determining row equality in `other`.
+                If unspecified, defaults to the same values as `on`.
         """
         if isinstance(on, str):
+            if not on:
+                raise DataChainParamsError("'on' cannot be an empty string")
             on = [on]
-
+        elif isinstance(on, Sequence):
+            if not on or any(not col for col in on):
+                raise DataChainParamsError("'on' cannot contain empty strings")
+
+        if isinstance(right_on, str):
+            if not right_on:
+                raise DataChainParamsError("'right_on' cannot be an empty string")
+            right_on = [right_on]
+        elif isinstance(right_on, Sequence):
+            if not right_on or any(not col for col in right_on):
+                raise DataChainParamsError("'right_on' cannot contain empty strings")
+
+        if on is None and right_on is None:
             other_columns = set(other._effective_signals_schema.db_signals())
             signals = [
                 c
@@ -1172,16 +1190,29 @@ class DataChain(DatasetQuery):
             ]
             if not signals:
                 raise DataChainParamsError("subtract(): no common columns")
-        elif not
-
-
-
-        elif not on:
+        elif on is not None and right_on is None:
+            right_on = on
+            signals = list(self.signals_schema.resolve(*on).db_signals())
+        elif on is None and right_on is not None:
             raise DataChainParamsError(
-                "'on'
+                "'on' must be specified when 'right_on' is provided"
             )
         else:
-
+            if not isinstance(on, Sequence) or not isinstance(right_on, Sequence):
+                raise TypeError(
+                    "'on' and 'right_on' must be 'str' or 'Sequence' object"
+                )
+            if len(on) != len(right_on):
+                raise DataChainParamsError(
+                    "'on' and 'right_on' must have the same length"
+                )
+            signals = list(
+                zip(
+                    self.signals_schema.resolve(*on).db_signals(),
+                    other.signals_schema.resolve(*right_on).db_signals(),
+                )  # type: ignore[arg-type]
+            )
+
         return super()._subtract(other, signals)  # type: ignore[arg-type]
 
     @classmethod
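Together these two hunks let `subtract` match rows on columns that are named differently in the two chains: `on` names the columns in the left chain, `right_on` the corresponding columns in the right chain. A minimal usage sketch mirroring the new unit test further down (the chains and values are illustrative; `from_values` is assumed to pick up a default session when none is passed):

    from datachain import DataChain

    left = DataChain.from_values(d=[1, 2, 3], e=["x", "y", "z"])
    right = DataChain.from_values(a=[1, 2], b=["x", "y"])

    # Drop rows of `left` whose `d` value also appears as `a` in `right`.
    remaining = left.subtract(right, on="d", right_on="a")
    print(list(remaining.collect()))  # expected: [(3, 'z')], as asserted in the new test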
{datachain-0.3.8 → datachain-0.3.9}/src/datachain/lib/webdataset.py

@@ -222,7 +222,7 @@ class TarStream(File):
         self._tar = None
 
     def open(self):
-        self._tar = tarfile.open(fileobj=super().open())
+        self._tar = tarfile.open(fileobj=super().open())  # noqa: SIM115
         return self
 
     def getmembers(self) -> list[tarfile.TarInfo]:
{datachain-0.3.8 → datachain-0.3.9}/src/datachain/query/dataset.py

@@ -296,15 +296,23 @@ class DatasetDiffOperation(Step):
 
 @frozen
 class Subtract(DatasetDiffOperation):
-    on: Sequence[str]
+    on: Sequence[tuple[str, str]]
 
     def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
         sq = source_query.alias("source_query")
         tq = target_query.alias("target_query")
         where_clause = sa.and_(
-
-
-
+            *[
+                getattr(
+                    sq.c, col_name[0] if isinstance(col_name, tuple) else col_name
+                ).is_not_distinct_from(
+                    getattr(
+                        tq.c, col_name[1] if isinstance(col_name, tuple) else col_name
+                    )
+                )
+                for col_name in self.on
+            ]
+        )
         return sq.select().except_(sq.select().where(where_clause))
 
 
@@ -1571,10 +1579,10 @@ class DatasetQuery:
 
     @detach
     def subtract(self, dq: "DatasetQuery") -> "Self":
-        return self._subtract(dq, on=["source", "path"])
+        return self._subtract(dq, on=[("source", "source"), ("path", "path")])
 
     @detach
-    def _subtract(self, dq: "DatasetQuery", on: Sequence[str]) -> "Self":
+    def _subtract(self, dq: "DatasetQuery", on: Sequence[tuple[str, str]]) -> "Self":
         query = self.clone()
         query.steps.append(Subtract(dq, self.catalog, on=on))
         return query
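The `Subtract` step now stores `(source column, target column)` pairs and builds a NULL-safe anti-join: source rows whose paired columns match some target row are removed via `EXCEPT`. A standalone SQLAlchemy sketch of that pattern (the tables, columns, and `on` pairs below are made up for illustration; only `is_not_distinct_from` and `except_` mirror the code above):

    import sqlalchemy as sa

    metadata = sa.MetaData()
    source = sa.Table("source", metadata, sa.Column("d", sa.Integer), sa.Column("e", sa.String))
    target = sa.Table("target", metadata, sa.Column("a", sa.Integer), sa.Column("b", sa.String))

    on = [("d", "a")]  # (source column, target column) pairs, as Subtract.on now stores them

    sq = source.select().subquery("source_query")
    tq = target.select().subquery("target_query")

    # NULL-safe equality per pair: is_not_distinct_from() treats NULL as equal to NULL,
    # unlike a plain == comparison.
    where_clause = sa.and_(
        *[getattr(sq.c, s).is_not_distinct_from(getattr(tq.c, t)) for s, t in on]
    )

    # Anti-join: every source row EXCEPT those that have a matching target row.
    query = sq.select().except_(sq.select().where(where_clause))
    print(query)  # prints the generated SELECT ... EXCEPT SELECT ... statement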
{datachain-0.3.8 → datachain-0.3.9/src/datachain.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.8
+Version: 0.3.9
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -115,31 +115,30 @@ AI 🔗 DataChain
 
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
 
 Key Features
 ============
 
 📂 **Storage as a Source of Truth.**
-  - Process unstructured data without redundant copies
+  - Process unstructured data without redundant copies from S3, GCP, Azure, and local
    file systems.
-  - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-
+  - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+  - Unite files and metadata together into persistent, versioned, columnar datasets.
 
 🐍 **Python-friendly data pipelines.**
   - Operate on Python objects and object fields.
-  - Built-in parallelization and out-of-memory compute without
-    Spark jobs.
+  - Built-in parallelization and out-of-memory compute without SQL or Spark.
 
 🧠 **Data Enrichment and Processing.**
-  - Generate metadata
-  - Filter, join, and group by
-  - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+  - Generate metadata using local AI models and LLM APIs.
+  - Filter, join, and group by metadata. Search by vector embeddings.
+  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
 
 🚀 **Efficiency.**
   - Parallelization, out-of-memory workloads and data caching.
   - Vectorized operations on Python object fields: sum, count, avg, etc.
-
+  - Optimized vector search.
 
 
 Quick Start
@@ -164,7 +163,7 @@ where each image has a matching JSON file like `cat.1009.json`:
     "inference": {"class": "dog", "confidence": 0.68}
 }
 
-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:
 
 
 .. code:: py
@@ -234,7 +233,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================
 
-LLMs can work as
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai
 
{datachain-0.3.8 → datachain-0.3.9}/tests/func/test_catalog.py

@@ -1151,6 +1151,36 @@ def test_get_file_signals(cloud_test_catalog, dogs_dataset):
     }
 
 
+def test_get_file_signals_with_custom_types(cloud_test_catalog, dogs_dataset):
+    catalog = cloud_test_catalog.catalog
+    catalog.metastore.update_dataset_version(
+        dogs_dataset,
+        1,
+        feature_schema={
+            "name": "str",
+            "age": "str",
+            "f1": "File@v1",
+            "f2": "File@v1",
+            "_custom_types": {
+                "File@v1": {"source": "str", "name": "str"},
+            },
+        },
+    )
+    row = {
+        "name": "Jon",
+        "age": 25,
+        "f1__source": "s3://first_bucket",
+        "f1__name": "image1.jpg",
+        "f2__source": "s3://second_bucket",
+        "f2__name": "image2.jpg",
+    }
+
+    assert catalog.get_file_signals(dogs_dataset.name, 1, row) == {
+        "source": "s3://first_bucket",
+        "name": "image1.jpg",
+    }
+
+
 def test_get_file_signals_no_signals(cloud_test_catalog, dogs_dataset):
     catalog = cloud_test_catalog.catalog
     catalog.metastore.update_dataset_version(
{datachain-0.3.8 → datachain-0.3.9}/tests/unit/lib/test_datachain.py

@@ -1504,6 +1504,11 @@ def test_subtract(test_session):
     assert set(chain1.subtract(chain3, on="a").collect()) == {(2, "z")}
     assert set(chain1.subtract(chain3).collect()) == {(2, "z")}
 
+    chain4 = DataChain.from_values(d=[1, 2, 3], e=["x", "y", "z"], session=test_session)
+    chain5 = DataChain.from_values(a=[1, 2], b=["x", "y"], session=test_session)
+
+    assert set(chain4.subtract(chain5, on="d", right_on="a").collect()) == {(3, "z")}
+
 
 def test_subtract_error(test_session):
     chain1 = DataChain.from_values(a=[1, 1, 2], b=["x", "y", "z"], session=test_session)
@@ -1513,6 +1518,36 @@ def test_subtract_error(test_session):
     with pytest.raises(TypeError):
         chain1.subtract(chain2, on=42)
 
+    with pytest.raises(DataChainParamsError):
+        chain1.subtract(chain2, on="")
+
+    with pytest.raises(DataChainParamsError):
+        chain1.subtract(chain2, on="a", right_on="")
+
+    with pytest.raises(DataChainParamsError):
+        chain1.subtract(chain2, on=["a", "b"], right_on=["c", ""])
+
+    with pytest.raises(DataChainParamsError):
+        chain1.subtract(chain2, on=["a", "b"], right_on=[])
+
+    with pytest.raises(DataChainParamsError):
+        chain1.subtract(chain2, on=["a", "b"], right_on=["d"])
+
+    with pytest.raises(DataChainParamsError):
+        chain1.subtract(chain2, right_on=[])
+
+    with pytest.raises(DataChainParamsError):
+        chain1.subtract(chain2, right_on="")
+
+    with pytest.raises(DataChainParamsError):
+        chain1.subtract(chain2, right_on=42)
+
+    with pytest.raises(DataChainParamsError):
+        chain1.subtract(chain2, right_on=["a"])
+
+    with pytest.raises(TypeError):
+        chain1.subtract(chain2, on=42, right_on=42)
+
     chain3 = DataChain.from_values(c=["foo", "bar"], session=test_session)
     with pytest.raises(DataChainParamsError):
         chain1.subtract(chain3)