datachain 0.3.15__tar.gz → 0.3.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.3.15 → datachain-0.3.17}/.pre-commit-config.yaml +1 -1
- {datachain-0.3.15/src/datachain.egg-info → datachain-0.3.17}/PKG-INFO +1 -1
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/catalog/catalog.py +13 -37
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/cli.py +0 -25
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/metastore.py +7 -66
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/sqlite.py +24 -2
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/warehouse.py +19 -25
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/dc.py +1 -2
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/listing.py +1 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/tar.py +2 -1
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/node.py +17 -3
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/__init__.py +0 -2
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/dataset.py +58 -145
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/schema.py +23 -12
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/udf.py +2 -42
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/utils.py +0 -40
- {datachain-0.3.15 → datachain-0.3.17/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain.egg-info/SOURCES.txt +0 -2
- {datachain-0.3.15 → datachain-0.3.17}/tests/conftest.py +15 -9
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_catalog.py +0 -116
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_datachain.py +628 -12
- datachain-0.3.17/tests/func/test_dataset_query.py +1195 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_datasets.py +101 -88
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_feature_pickling.py +0 -8
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_pull.py +23 -11
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_query.py +16 -10
- {datachain-0.3.15 → datachain-0.3.17}/tests/scripts/name_len_slow.py +9 -15
- {datachain-0.3.15 → datachain-0.3.17}/tests/test_cli_e2e.py +1 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_datachain.py +15 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_datachain_merge.py +98 -1
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_data_storage.py +17 -10
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_utils.py +0 -25
- {datachain-0.3.15 → datachain-0.3.17}/tests/utils.py +22 -63
- datachain-0.3.15/src/datachain/query/builtins.py +0 -96
- datachain-0.3.15/tests/func/test_dataset_query.py +0 -3463
- datachain-0.3.15/tests/unit/test_udf.py +0 -98
- {datachain-0.3.15 → datachain-0.3.17}/.cruft.json +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.gitattributes +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.github/codecov.yaml +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.github/dependabot.yml +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.github/workflows/release.yml +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.github/workflows/tests.yml +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/.gitignore +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/LICENSE +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/README.rst +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/assets/datachain.svg +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/index.md +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/references/datachain.md +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/references/datatype.md +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/references/file.md +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/references/index.md +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/references/sql.md +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/references/torch.md +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/docs/references/udf.md +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/mkdocs.yml +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/noxfile.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/pyproject.toml +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/setup.cfg +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/__main__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/asyn.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/cache.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/hf.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/local.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/config.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/dataset.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/error.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/job.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/file.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/hf.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/listing.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/progress.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/py.typed +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/params.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/session.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/storage.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/data.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/examples/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_client.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_listing.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_ls.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_metrics.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_client.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_session.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_warehouse.py +0 -0
|
@@ -68,8 +68,6 @@ from datachain.utils import (
|
|
|
68
68
|
DataChainDir,
|
|
69
69
|
batched,
|
|
70
70
|
datachain_paths_join,
|
|
71
|
-
import_object,
|
|
72
|
-
parse_params_string,
|
|
73
71
|
)
|
|
74
72
|
|
|
75
73
|
from .datasource import DataSource
|
|
@@ -843,7 +841,7 @@ class Catalog:
|
|
|
843
841
|
from datachain.query import DatasetQuery
|
|
844
842
|
|
|
845
843
|
def _row_to_node(d: dict[str, Any]) -> Node:
|
|
846
|
-
del d["
|
|
844
|
+
del d["file__source"]
|
|
847
845
|
return Node.from_dict(d)
|
|
848
846
|
|
|
849
847
|
enlisted_sources: list[tuple[bool, bool, Any]] = []
|
|
@@ -1148,30 +1146,28 @@ class Catalog:
|
|
|
1148
1146
|
if not sources:
|
|
1149
1147
|
raise ValueError("Sources needs to be non empty list")
|
|
1150
1148
|
|
|
1151
|
-
from datachain.
|
|
1149
|
+
from datachain.lib.dc import DataChain
|
|
1150
|
+
from datachain.query.session import Session
|
|
1151
|
+
|
|
1152
|
+
session = Session.get(catalog=self, client_config=client_config)
|
|
1152
1153
|
|
|
1153
|
-
|
|
1154
|
+
chains = []
|
|
1154
1155
|
for source in sources:
|
|
1155
1156
|
if source.startswith(DATASET_PREFIX):
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
catalog=self,
|
|
1159
|
-
client_config=client_config,
|
|
1157
|
+
dc = DataChain.from_dataset(
|
|
1158
|
+
source[len(DATASET_PREFIX) :], session=session
|
|
1160
1159
|
)
|
|
1161
1160
|
else:
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
catalog=self,
|
|
1165
|
-
client_config=client_config,
|
|
1166
|
-
recursive=recursive,
|
|
1161
|
+
dc = DataChain.from_storage(
|
|
1162
|
+
source, session=session, recursive=recursive
|
|
1167
1163
|
)
|
|
1168
1164
|
|
|
1169
|
-
|
|
1165
|
+
chains.append(dc)
|
|
1170
1166
|
|
|
1171
1167
|
# create union of all dataset queries created from sources
|
|
1172
|
-
|
|
1168
|
+
dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
|
|
1173
1169
|
try:
|
|
1174
|
-
|
|
1170
|
+
dc.save(name)
|
|
1175
1171
|
except Exception as e: # noqa: BLE001
|
|
1176
1172
|
try:
|
|
1177
1173
|
ds = self.get_dataset(name)
|
|
@@ -1731,26 +1727,6 @@ class Catalog:
|
|
|
1731
1727
|
output, sources, client_config=client_config, recursive=recursive
|
|
1732
1728
|
)
|
|
1733
1729
|
|
|
1734
|
-
def apply_udf(
|
|
1735
|
-
self,
|
|
1736
|
-
udf_location: str,
|
|
1737
|
-
source: str,
|
|
1738
|
-
target_name: str,
|
|
1739
|
-
parallel: Optional[int] = None,
|
|
1740
|
-
params: Optional[str] = None,
|
|
1741
|
-
):
|
|
1742
|
-
from datachain.query import DatasetQuery
|
|
1743
|
-
|
|
1744
|
-
if source.startswith(DATASET_PREFIX):
|
|
1745
|
-
ds = DatasetQuery(name=source[len(DATASET_PREFIX) :], catalog=self)
|
|
1746
|
-
else:
|
|
1747
|
-
ds = DatasetQuery(path=source, catalog=self)
|
|
1748
|
-
udf = import_object(udf_location)
|
|
1749
|
-
if params:
|
|
1750
|
-
args, kwargs = parse_params_string(params)
|
|
1751
|
-
udf = udf(*args, **kwargs)
|
|
1752
|
-
ds.add_signals(udf, parallel=parallel).save(target_name)
|
|
1753
|
-
|
|
1754
1730
|
def query(
|
|
1755
1731
|
self,
|
|
1756
1732
|
query_script: str,
|
|
@@ -494,27 +494,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
|
494
494
|
help="Query parameters",
|
|
495
495
|
)
|
|
496
496
|
|
|
497
|
-
apply_udf_parser = subp.add_parser(
|
|
498
|
-
"apply-udf", parents=[parent_parser], description="Apply UDF"
|
|
499
|
-
)
|
|
500
|
-
apply_udf_parser.add_argument("udf", type=str, help="UDF location")
|
|
501
|
-
apply_udf_parser.add_argument("source", type=str, help="Source storage or dataset")
|
|
502
|
-
apply_udf_parser.add_argument("target", type=str, help="Target dataset name")
|
|
503
|
-
apply_udf_parser.add_argument(
|
|
504
|
-
"--parallel",
|
|
505
|
-
nargs="?",
|
|
506
|
-
type=int,
|
|
507
|
-
const=-1,
|
|
508
|
-
default=None,
|
|
509
|
-
metavar="N",
|
|
510
|
-
help=(
|
|
511
|
-
"Use multiprocessing to run the UDF with N worker processes. "
|
|
512
|
-
"N defaults to the CPU count."
|
|
513
|
-
),
|
|
514
|
-
)
|
|
515
|
-
apply_udf_parser.add_argument(
|
|
516
|
-
"--udf-params", type=str, default=None, help="UDF class parameters"
|
|
517
|
-
)
|
|
518
497
|
subp.add_parser(
|
|
519
498
|
"clear-cache", parents=[parent_parser], description="Clear the local file cache"
|
|
520
499
|
)
|
|
@@ -1016,10 +995,6 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
1016
995
|
parallel=args.parallel,
|
|
1017
996
|
params=args.param,
|
|
1018
997
|
)
|
|
1019
|
-
elif args.command == "apply-udf":
|
|
1020
|
-
catalog.apply_udf(
|
|
1021
|
-
args.udf, args.source, args.target, args.parallel, args.udf_params
|
|
1022
|
-
)
|
|
1023
998
|
elif args.command == "clear-cache":
|
|
1024
999
|
clear_cache(catalog)
|
|
1025
1000
|
elif args.command == "gc":
|
|
@@ -297,39 +297,6 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
297
297
|
#
|
|
298
298
|
# Dataset dependencies
|
|
299
299
|
#
|
|
300
|
-
|
|
301
|
-
def add_dependency(
|
|
302
|
-
self,
|
|
303
|
-
dependency: DatasetDependency,
|
|
304
|
-
source_dataset_name: str,
|
|
305
|
-
source_dataset_version: int,
|
|
306
|
-
) -> None:
|
|
307
|
-
"""Add dependency to dataset or storage."""
|
|
308
|
-
if dependency.is_dataset:
|
|
309
|
-
self.add_dataset_dependency(
|
|
310
|
-
source_dataset_name,
|
|
311
|
-
source_dataset_version,
|
|
312
|
-
dependency.dataset_name,
|
|
313
|
-
int(dependency.version),
|
|
314
|
-
)
|
|
315
|
-
else:
|
|
316
|
-
self.add_storage_dependency(
|
|
317
|
-
source_dataset_name,
|
|
318
|
-
source_dataset_version,
|
|
319
|
-
StorageURI(dependency.name),
|
|
320
|
-
dependency.version,
|
|
321
|
-
)
|
|
322
|
-
|
|
323
|
-
@abstractmethod
|
|
324
|
-
def add_storage_dependency(
|
|
325
|
-
self,
|
|
326
|
-
source_dataset_name: str,
|
|
327
|
-
source_dataset_version: int,
|
|
328
|
-
storage_uri: StorageURI,
|
|
329
|
-
storage_timestamp_str: Optional[str] = None,
|
|
330
|
-
) -> None:
|
|
331
|
-
"""Adds storage dependency to dataset."""
|
|
332
|
-
|
|
333
300
|
@abstractmethod
|
|
334
301
|
def add_dataset_dependency(
|
|
335
302
|
self,
|
|
@@ -1268,32 +1235,6 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1268
1235
|
#
|
|
1269
1236
|
# Dataset dependencies
|
|
1270
1237
|
#
|
|
1271
|
-
|
|
1272
|
-
def _insert_dataset_dependency(self, data: dict[str, Any]) -> None:
|
|
1273
|
-
"""Method for inserting dependencies."""
|
|
1274
|
-
self.db.execute(self._datasets_dependencies_insert().values(**data))
|
|
1275
|
-
|
|
1276
|
-
def add_storage_dependency(
|
|
1277
|
-
self,
|
|
1278
|
-
source_dataset_name: str,
|
|
1279
|
-
source_dataset_version: int,
|
|
1280
|
-
storage_uri: StorageURI,
|
|
1281
|
-
storage_timestamp_str: Optional[str] = None,
|
|
1282
|
-
) -> None:
|
|
1283
|
-
source_dataset = self.get_dataset(source_dataset_name)
|
|
1284
|
-
storage = self.get_storage(storage_uri)
|
|
1285
|
-
|
|
1286
|
-
self._insert_dataset_dependency(
|
|
1287
|
-
{
|
|
1288
|
-
"source_dataset_id": source_dataset.id,
|
|
1289
|
-
"source_dataset_version_id": (
|
|
1290
|
-
source_dataset.get_version(source_dataset_version).id
|
|
1291
|
-
),
|
|
1292
|
-
"bucket_id": storage.id,
|
|
1293
|
-
"bucket_version": storage_timestamp_str,
|
|
1294
|
-
}
|
|
1295
|
-
)
|
|
1296
|
-
|
|
1297
1238
|
def add_dataset_dependency(
|
|
1298
1239
|
self,
|
|
1299
1240
|
source_dataset_name: str,
|
|
@@ -1305,15 +1246,15 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1305
1246
|
source_dataset = self.get_dataset(source_dataset_name)
|
|
1306
1247
|
dataset = self.get_dataset(dataset_name)
|
|
1307
1248
|
|
|
1308
|
-
self.
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1249
|
+
self.db.execute(
|
|
1250
|
+
self._datasets_dependencies_insert().values(
|
|
1251
|
+
source_dataset_id=source_dataset.id,
|
|
1252
|
+
source_dataset_version_id=(
|
|
1312
1253
|
source_dataset.get_version(source_dataset_version).id
|
|
1313
1254
|
),
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1255
|
+
dataset_id=dataset.id,
|
|
1256
|
+
dataset_version_id=dataset.get_version(dataset_version).id,
|
|
1257
|
+
)
|
|
1317
1258
|
)
|
|
1318
1259
|
|
|
1319
1260
|
def update_dataset_dependency_source(
|
|
@@ -40,7 +40,9 @@ if TYPE_CHECKING:
|
|
|
40
40
|
from sqlalchemy.dialects.sqlite import Insert
|
|
41
41
|
from sqlalchemy.engine.base import Engine
|
|
42
42
|
from sqlalchemy.schema import SchemaItem
|
|
43
|
+
from sqlalchemy.sql._typing import _FromClauseArgument, _OnClauseArgument
|
|
43
44
|
from sqlalchemy.sql.elements import ColumnElement
|
|
45
|
+
from sqlalchemy.sql.selectable import Join
|
|
44
46
|
from sqlalchemy.types import TypeEngine
|
|
45
47
|
|
|
46
48
|
from datachain.lib.file import File
|
|
@@ -649,11 +651,14 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
649
651
|
self, dataset: DatasetRecord, version: int
|
|
650
652
|
) -> list[StorageURI]:
|
|
651
653
|
dr = self.dataset_rows(dataset, version)
|
|
652
|
-
query = dr.select(dr.c.
|
|
654
|
+
query = dr.select(dr.c.file__source).distinct()
|
|
653
655
|
cur = self.db.cursor()
|
|
654
656
|
cur.row_factory = sqlite3.Row # type: ignore[assignment]
|
|
655
657
|
|
|
656
|
-
return [
|
|
658
|
+
return [
|
|
659
|
+
StorageURI(row["file__source"])
|
|
660
|
+
for row in self.db.execute(query, cursor=cur)
|
|
661
|
+
]
|
|
657
662
|
|
|
658
663
|
def merge_dataset_rows(
|
|
659
664
|
self,
|
|
@@ -788,6 +793,23 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
788
793
|
if progress_cb:
|
|
789
794
|
progress_cb(len(batch_ids))
|
|
790
795
|
|
|
796
|
+
def join(
|
|
797
|
+
self,
|
|
798
|
+
left: "_FromClauseArgument",
|
|
799
|
+
right: "_FromClauseArgument",
|
|
800
|
+
onclause: "_OnClauseArgument",
|
|
801
|
+
inner: bool = True,
|
|
802
|
+
) -> "Join":
|
|
803
|
+
"""
|
|
804
|
+
Join two tables together.
|
|
805
|
+
"""
|
|
806
|
+
return sqlalchemy.join(
|
|
807
|
+
left,
|
|
808
|
+
right,
|
|
809
|
+
onclause,
|
|
810
|
+
isouter=not inner,
|
|
811
|
+
)
|
|
812
|
+
|
|
791
813
|
def create_pre_udf_table(self, query: "Select") -> "Table":
|
|
792
814
|
"""
|
|
793
815
|
Create a temporary table from a query for use in a UDF.
|
|
@@ -27,8 +27,12 @@ from datachain.storage import StorageURI
|
|
|
27
27
|
from datachain.utils import sql_escape_like
|
|
28
28
|
|
|
29
29
|
if TYPE_CHECKING:
|
|
30
|
-
from sqlalchemy.sql._typing import
|
|
31
|
-
|
|
30
|
+
from sqlalchemy.sql._typing import (
|
|
31
|
+
_ColumnsClauseArgument,
|
|
32
|
+
_FromClauseArgument,
|
|
33
|
+
_OnClauseArgument,
|
|
34
|
+
)
|
|
35
|
+
from sqlalchemy.sql.selectable import Join, Select
|
|
32
36
|
from sqlalchemy.types import TypeEngine
|
|
33
37
|
|
|
34
38
|
from datachain.data_storage import AbstractIDGenerator, schema
|
|
@@ -894,6 +898,18 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
894
898
|
Copy the results of a query into a table.
|
|
895
899
|
"""
|
|
896
900
|
|
|
901
|
+
@abstractmethod
|
|
902
|
+
def join(
|
|
903
|
+
self,
|
|
904
|
+
left: "_FromClauseArgument",
|
|
905
|
+
right: "_FromClauseArgument",
|
|
906
|
+
onclause: "_OnClauseArgument",
|
|
907
|
+
inner: bool = True,
|
|
908
|
+
) -> "Join":
|
|
909
|
+
"""
|
|
910
|
+
Join two tables together.
|
|
911
|
+
"""
|
|
912
|
+
|
|
897
913
|
@abstractmethod
|
|
898
914
|
def create_pre_udf_table(self, query: "Select") -> "Table":
|
|
899
915
|
"""
|
|
@@ -922,32 +938,10 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
922
938
|
are cleaned up as soon as they are no longer needed.
|
|
923
939
|
"""
|
|
924
940
|
with tqdm(desc="Cleanup", unit=" tables") as pbar:
|
|
925
|
-
for name in names:
|
|
941
|
+
for name in set(names):
|
|
926
942
|
self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
|
|
927
943
|
pbar.update(1)
|
|
928
944
|
|
|
929
|
-
def changed_query(
|
|
930
|
-
self,
|
|
931
|
-
source_query: sa.sql.selectable.Select,
|
|
932
|
-
target_query: sa.sql.selectable.Select,
|
|
933
|
-
) -> sa.sql.selectable.Select:
|
|
934
|
-
sq = source_query.alias("source_query")
|
|
935
|
-
tq = target_query.alias("target_query")
|
|
936
|
-
|
|
937
|
-
source_target_join = sa.join(
|
|
938
|
-
sq, tq, (sq.c.source == tq.c.source) & (sq.c.path == tq.c.path)
|
|
939
|
-
)
|
|
940
|
-
|
|
941
|
-
return (
|
|
942
|
-
select(*sq.c)
|
|
943
|
-
.select_from(source_target_join)
|
|
944
|
-
.where(
|
|
945
|
-
(sq.c.last_modified > tq.c.last_modified)
|
|
946
|
-
& (sq.c.is_latest == true())
|
|
947
|
-
& (tq.c.is_latest == true())
|
|
948
|
-
)
|
|
949
|
-
)
|
|
950
|
-
|
|
951
945
|
|
|
952
946
|
def _random_string(length: int) -> str:
|
|
953
947
|
return "".join(
|
|
@@ -1337,8 +1337,7 @@ class DataChain(DatasetQuery):
|
|
|
1337
1337
|
other.signals_schema.resolve(*right_on).db_signals(),
|
|
1338
1338
|
) # type: ignore[arg-type]
|
|
1339
1339
|
)
|
|
1340
|
-
|
|
1341
|
-
return super()._subtract(other, signals) # type: ignore[arg-type]
|
|
1340
|
+
return super().subtract(other, signals) # type: ignore[arg-type]
|
|
1342
1341
|
|
|
1343
1342
|
@classmethod
|
|
1344
1343
|
def from_values(
|
|
@@ -77,6 +77,7 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
|
|
|
77
77
|
"""
|
|
78
78
|
Parsing uri and returns listing dataset name, listing uri and listing path
|
|
79
79
|
"""
|
|
80
|
+
client_config = client_config or {}
|
|
80
81
|
client = Client.get_client(uri, cache, **client_config)
|
|
81
82
|
storage_uri, path = Client.parse_url(uri)
|
|
82
83
|
|
|
@@ -114,9 +114,23 @@ class Node:
|
|
|
114
114
|
)
|
|
115
115
|
|
|
116
116
|
@classmethod
|
|
117
|
-
def from_dict(cls, d: dict[str, Any]) -> "Self":
|
|
118
|
-
|
|
119
|
-
|
|
117
|
+
def from_dict(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
|
|
118
|
+
def _dval(field_name: str):
|
|
119
|
+
return d.get(f"{file_prefix}__{field_name}")
|
|
120
|
+
|
|
121
|
+
return cls(
|
|
122
|
+
sys__id=d["sys__id"],
|
|
123
|
+
sys__rand=d["sys__rand"],
|
|
124
|
+
source=_dval("source"),
|
|
125
|
+
path=_dval("path"),
|
|
126
|
+
etag=_dval("etag"),
|
|
127
|
+
is_latest=_dval("is_latest"),
|
|
128
|
+
size=_dval("size"),
|
|
129
|
+
last_modified=_dval("last_modified"),
|
|
130
|
+
version=_dval("version"),
|
|
131
|
+
location=_dval("location"),
|
|
132
|
+
dir_type=DirType.FILE,
|
|
133
|
+
)
|
|
120
134
|
|
|
121
135
|
@classmethod
|
|
122
136
|
def from_dir(cls, path, **kwargs) -> "Node":
|
|
@@ -2,7 +2,6 @@ from .dataset import DatasetQuery
|
|
|
2
2
|
from .params import param
|
|
3
3
|
from .schema import C, DatasetRow, LocalFilename, Object, Stream
|
|
4
4
|
from .session import Session
|
|
5
|
-
from .udf import udf
|
|
6
5
|
|
|
7
6
|
__all__ = [
|
|
8
7
|
"C",
|
|
@@ -13,5 +12,4 @@ __all__ = [
|
|
|
13
12
|
"Session",
|
|
14
13
|
"Stream",
|
|
15
14
|
"param",
|
|
16
|
-
"udf",
|
|
17
15
|
]
|