datachain 0.3.16__tar.gz → 0.3.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- {datachain-0.3.16 → datachain-0.3.17}/.pre-commit-config.yaml +1 -1
- {datachain-0.3.16/src/datachain.egg-info → datachain-0.3.17}/PKG-INFO +1 -1
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/catalog/catalog.py +13 -37
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/cli.py +0 -25
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/data_storage/metastore.py +7 -66
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/data_storage/sqlite.py +5 -2
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/data_storage/warehouse.py +0 -22
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/dc.py +1 -2
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/listing.py +1 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/tar.py +2 -1
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/node.py +17 -3
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/query/__init__.py +0 -2
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/query/dataset.py +20 -126
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/query/schema.py +23 -12
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/query/udf.py +2 -42
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/utils.py +0 -40
- {datachain-0.3.16 → datachain-0.3.17/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain.egg-info/SOURCES.txt +0 -2
- {datachain-0.3.16 → datachain-0.3.17}/tests/conftest.py +15 -9
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_catalog.py +0 -116
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_datachain.py +628 -12
- datachain-0.3.17/tests/func/test_dataset_query.py +1195 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_datasets.py +101 -88
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_feature_pickling.py +0 -8
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_pull.py +23 -11
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_query.py +16 -10
- {datachain-0.3.16 → datachain-0.3.17}/tests/scripts/name_len_slow.py +9 -15
- {datachain-0.3.16 → datachain-0.3.17}/tests/test_cli_e2e.py +1 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_datachain.py +15 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_datachain_merge.py +98 -1
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_data_storage.py +17 -10
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_utils.py +0 -25
- {datachain-0.3.16 → datachain-0.3.17}/tests/utils.py +22 -63
- datachain-0.3.16/src/datachain/query/builtins.py +0 -96
- datachain-0.3.16/tests/func/test_dataset_query.py +0 -3580
- datachain-0.3.16/tests/unit/test_udf.py +0 -98
- {datachain-0.3.16 → datachain-0.3.17}/.cruft.json +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.gitattributes +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.github/codecov.yaml +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.github/dependabot.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.github/workflows/release.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.github/workflows/tests.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/.gitignore +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/LICENSE +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/README.rst +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/assets/datachain.svg +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/index.md +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/references/datachain.md +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/references/datatype.md +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/references/file.md +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/references/index.md +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/references/sql.md +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/references/torch.md +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/docs/references/udf.md +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/mkdocs.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/noxfile.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/pyproject.toml +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/setup.cfg +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/__main__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/asyn.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/cache.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/client/hf.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/client/local.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/config.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/dataset.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/error.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/job.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/file.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/hf.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/listing.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/progress.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/py.typed +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/query/params.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/query/session.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/storage.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/data.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/examples/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_client.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_listing.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_ls.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_metrics.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_client.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_session.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.16 → datachain-0.3.17}/tests/unit/test_warehouse.py +0 -0
--- a/src/datachain/catalog/catalog.py
+++ b/src/datachain/catalog/catalog.py
@@ -68,8 +68,6 @@ from datachain.utils import (
     DataChainDir,
     batched,
     datachain_paths_join,
-    import_object,
-    parse_params_string,
 )
 
 from .datasource import DataSource
@@ -843,7 +841,7 @@ class Catalog:
         from datachain.query import DatasetQuery
 
         def _row_to_node(d: dict[str, Any]) -> Node:
-            del d["source"]
+            del d["file__source"]
             return Node.from_dict(d)
 
         enlisted_sources: list[tuple[bool, bool, Any]] = []
@@ -1148,30 +1146,28 @@ class Catalog:
         if not sources:
             raise ValueError("Sources needs to be non empty list")
 
-        from datachain.query import DatasetQuery
+        from datachain.lib.dc import DataChain
+        from datachain.query.session import Session
+
+        session = Session.get(catalog=self, client_config=client_config)
 
-
+        chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
-
-
-                    catalog=self,
-                    client_config=client_config,
+                dc = DataChain.from_dataset(
+                    source[len(DATASET_PREFIX) :], session=session
                 )
             else:
-
-
-                    catalog=self,
-                    client_config=client_config,
-                    recursive=recursive,
+                dc = DataChain.from_storage(
+                    source, session=session, recursive=recursive
                 )
 
-
+            chains.append(dc)
 
         # create union of all dataset queries created from sources
-
+        dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
         try:
-
+            dc.save(name)
         except Exception as e:  # noqa: BLE001
             try:
                 ds = self.get_dataset(name)
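For reference, the new implementation builds one chain per source and unions them. A minimal standalone sketch of that pattern (bucket and dataset names are invented):

    from functools import reduce

    from datachain.lib.dc import DataChain
    from datachain.query.session import Session

    session = Session.get()
    chains = [
        DataChain.from_storage("s3://my-bucket/", session=session, recursive=True),
        DataChain.from_dataset("my-dataset", session=session),
    ]
    # fold all chains into a single union, then persist it as a named dataset
    combined = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
    combined.save("my-union")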
@@ -1731,26 +1727,6 @@ class Catalog:
             output, sources, client_config=client_config, recursive=recursive
         )
 
-    def apply_udf(
-        self,
-        udf_location: str,
-        source: str,
-        target_name: str,
-        parallel: Optional[int] = None,
-        params: Optional[str] = None,
-    ):
-        from datachain.query import DatasetQuery
-
-        if source.startswith(DATASET_PREFIX):
-            ds = DatasetQuery(name=source[len(DATASET_PREFIX) :], catalog=self)
-        else:
-            ds = DatasetQuery(path=source, catalog=self)
-        udf = import_object(udf_location)
-        if params:
-            args, kwargs = parse_params_string(params)
-            udf = udf(*args, **kwargs)
-        ds.add_signals(udf, parallel=parallel).save(target_name)
-
     def query(
         self,
         query_script: str,
--- a/src/datachain/cli.py
+++ b/src/datachain/cli.py
@@ -494,27 +494,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Query parameters",
     )
 
-    apply_udf_parser = subp.add_parser(
-        "apply-udf", parents=[parent_parser], description="Apply UDF"
-    )
-    apply_udf_parser.add_argument("udf", type=str, help="UDF location")
-    apply_udf_parser.add_argument("source", type=str, help="Source storage or dataset")
-    apply_udf_parser.add_argument("target", type=str, help="Target dataset name")
-    apply_udf_parser.add_argument(
-        "--parallel",
-        nargs="?",
-        type=int,
-        const=-1,
-        default=None,
-        metavar="N",
-        help=(
-            "Use multiprocessing to run the UDF with N worker processes. "
-            "N defaults to the CPU count."
-        ),
-    )
-    apply_udf_parser.add_argument(
-        "--udf-params", type=str, default=None, help="UDF class parameters"
-    )
     subp.add_parser(
         "clear-cache", parents=[parent_parser], description="Clear the local file cache"
     )
@@ -1016,10 +995,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             parallel=args.parallel,
             params=args.param,
         )
-    elif args.command == "apply-udf":
-        catalog.apply_udf(
-            args.udf, args.source, args.target, args.parallel, args.udf_params
-        )
     elif args.command == "clear-cache":
         clear_cache(catalog)
     elif args.command == "gc":
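With the apply-udf subcommand and Catalog.apply_udf gone, the equivalent flow is the Python API. A hypothetical sketch (dataset names and the UDF are invented, and the map() call is an assumption based on the DataChain API rather than anything shown in this diff):

    from datachain.lib.dc import DataChain

    def name_len(path: str) -> int:
        # toy UDF: derive a signal from the file path
        return len(path)

    (
        DataChain.from_dataset("source-dataset")  # hypothetical source
        .map(name_len, params=["file.path"], output={"name_len": int})
        .save("target-dataset")  # hypothetical target
    )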
--- a/src/datachain/data_storage/metastore.py
+++ b/src/datachain/data_storage/metastore.py
@@ -297,39 +297,6 @@ class AbstractMetastore(ABC, Serializable):
     #
     # Dataset dependencies
     #
-
-    def add_dependency(
-        self,
-        dependency: DatasetDependency,
-        source_dataset_name: str,
-        source_dataset_version: int,
-    ) -> None:
-        """Add dependency to dataset or storage."""
-        if dependency.is_dataset:
-            self.add_dataset_dependency(
-                source_dataset_name,
-                source_dataset_version,
-                dependency.dataset_name,
-                int(dependency.version),
-            )
-        else:
-            self.add_storage_dependency(
-                source_dataset_name,
-                source_dataset_version,
-                StorageURI(dependency.name),
-                dependency.version,
-            )
-
-    @abstractmethod
-    def add_storage_dependency(
-        self,
-        source_dataset_name: str,
-        source_dataset_version: int,
-        storage_uri: StorageURI,
-        storage_timestamp_str: Optional[str] = None,
-    ) -> None:
-        """Adds storage dependency to dataset."""
-
     @abstractmethod
     def add_dataset_dependency(
         self,
@@ -1268,32 +1235,6 @@ class AbstractDBMetastore(AbstractMetastore):
     #
     # Dataset dependencies
     #
-
-    def _insert_dataset_dependency(self, data: dict[str, Any]) -> None:
-        """Method for inserting dependencies."""
-        self.db.execute(self._datasets_dependencies_insert().values(**data))
-
-    def add_storage_dependency(
-        self,
-        source_dataset_name: str,
-        source_dataset_version: int,
-        storage_uri: StorageURI,
-        storage_timestamp_str: Optional[str] = None,
-    ) -> None:
-        source_dataset = self.get_dataset(source_dataset_name)
-        storage = self.get_storage(storage_uri)
-
-        self._insert_dataset_dependency(
-            {
-                "source_dataset_id": source_dataset.id,
-                "source_dataset_version_id": (
-                    source_dataset.get_version(source_dataset_version).id
-                ),
-                "bucket_id": storage.id,
-                "bucket_version": storage_timestamp_str,
-            }
-        )
-
     def add_dataset_dependency(
         self,
         source_dataset_name: str,
@@ -1305,15 +1246,15 @@
         source_dataset = self.get_dataset(source_dataset_name)
         dataset = self.get_dataset(dataset_name)
 
-        self._insert_dataset_dependency(
-            {
-                "source_dataset_id": source_dataset.id,
-                "source_dataset_version_id": (
+        self.db.execute(
+            self._datasets_dependencies_insert().values(
+                source_dataset_id=source_dataset.id,
+                source_dataset_version_id=(
                     source_dataset.get_version(source_dataset_version).id
                 ),
-                "dataset_id": dataset.id,
-                "dataset_version_id": dataset.get_version(dataset_version).id,
-            }
+                dataset_id=dataset.id,
+                dataset_version_id=dataset.get_version(dataset_version).id,
+            )
         )
 
     def update_dataset_dependency_source(
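The rewritten method above inlines the row insert instead of passing a dict through the removed _insert_dataset_dependency helper. A generic SQLAlchemy sketch of the insert().values() pattern it relies on; the table definition here is illustrative, not the real metastore schema:

    import sqlalchemy as sa

    metadata = sa.MetaData()
    deps = sa.Table(
        "datasets_dependencies",  # illustrative name; the real table lives in the metastore
        metadata,
        sa.Column("source_dataset_id", sa.Integer),
        sa.Column("source_dataset_version_id", sa.Integer),
        sa.Column("dataset_id", sa.Integer),
        sa.Column("dataset_version_id", sa.Integer),
    )
    # keyword form of values() replaces building an intermediate dict
    stmt = deps.insert().values(
        source_dataset_id=1,
        source_dataset_version_id=10,
        dataset_id=2,
        dataset_version_id=20,
    )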
--- a/src/datachain/data_storage/sqlite.py
+++ b/src/datachain/data_storage/sqlite.py
@@ -651,11 +651,14 @@ class SQLiteWarehouse(AbstractWarehouse):
         self, dataset: DatasetRecord, version: int
     ) -> list[StorageURI]:
         dr = self.dataset_rows(dataset, version)
-        query = dr.select(dr.c.source).distinct()
+        query = dr.select(dr.c.file__source).distinct()
         cur = self.db.cursor()
         cur.row_factory = sqlite3.Row  # type: ignore[assignment]
 
-        return [
+        return [
+            StorageURI(row["file__source"])
+            for row in self.db.execute(query, cursor=cur)
+        ]
 
     def merge_dataset_rows(
         self,
--- a/src/datachain/data_storage/warehouse.py
+++ b/src/datachain/data_storage/warehouse.py
@@ -942,28 +942,6 @@ class AbstractWarehouse(ABC, Serializable):
             self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
             pbar.update(1)
 
-    def changed_query(
-        self,
-        source_query: sa.sql.selectable.Select,
-        target_query: sa.sql.selectable.Select,
-    ) -> sa.sql.selectable.Select:
-        sq = source_query.alias("source_query")
-        tq = target_query.alias("target_query")
-
-        source_target_join = sa.join(
-            sq, tq, (sq.c.source == tq.c.source) & (sq.c.path == tq.c.path)
-        )
-
-        return (
-            select(*sq.c)
-            .select_from(source_target_join)
-            .where(
-                (sq.c.last_modified > tq.c.last_modified)
-                & (sq.c.is_latest == true())
-                & (tq.c.is_latest == true())
-            )
-        )
-
 
 def _random_string(length: int) -> str:
     return "".join(
--- a/src/datachain/lib/dc.py
+++ b/src/datachain/lib/dc.py
@@ -1337,8 +1337,7 @@ class DataChain(DatasetQuery):
                 other.signals_schema.resolve(*right_on).db_signals(),
             )  # type: ignore[arg-type]
         )
-
-        return super()._subtract(other, signals)  # type: ignore[arg-type]
+        return super().subtract(other, signals)  # type: ignore[arg-type]
 
     @classmethod
     def from_values(
--- a/src/datachain/lib/listing.py
+++ b/src/datachain/lib/listing.py
@@ -77,6 +77,7 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
    """
+    client_config = client_config or {}
     client = Client.get_client(uri, cache, **client_config)
     storage_uri, path = Client.parse_url(uri)
 
--- a/src/datachain/node.py
+++ b/src/datachain/node.py
@@ -114,9 +114,23 @@ class Node:
     )
 
     @classmethod
-    def from_dict(cls, d: dict[str, Any]) -> "Self":
-
-
+    def from_dict(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
+        def _dval(field_name: str):
+            return d.get(f"{file_prefix}__{field_name}")
+
+        return cls(
+            sys__id=d["sys__id"],
+            sys__rand=d["sys__rand"],
+            source=_dval("source"),
+            path=_dval("path"),
+            etag=_dval("etag"),
+            is_latest=_dval("is_latest"),
+            size=_dval("size"),
+            last_modified=_dval("last_modified"),
+            version=_dval("version"),
+            location=_dval("location"),
+            dir_type=DirType.FILE,
+        )
 
     @classmethod
     def from_dir(cls, path, **kwargs) -> "Node":
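The new from_dict reads prefix-qualified keys straight from a database row and strips the prefix per field. A small sketch with a made-up row:

    from datachain.node import Node

    row = {  # hypothetical row; keys follow the new "file__" column convention
        "sys__id": 1,
        "sys__rand": 12345,
        "file__source": "s3://my-bucket",
        "file__path": "images/cat.jpg",
        "file__etag": "abc123",
        "file__is_latest": True,
        "file__size": 1024,
        "file__last_modified": None,
        "file__version": "",
        "file__location": None,
    }
    node = Node.from_dict(row)  # file_prefix defaults to "file"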
--- a/src/datachain/query/__init__.py
+++ b/src/datachain/query/__init__.py
@@ -2,7 +2,6 @@ from .dataset import DatasetQuery
 from .params import param
 from .schema import C, DatasetRow, LocalFilename, Object, Stream
 from .session import Session
-from .udf import udf
 
 __all__ = [
     "C",
@@ -13,5 +12,4 @@ __all__ = [
     "Session",
     "Stream",
     "param",
-    "udf",
 ]
--- a/src/datachain/query/dataset.py
+++ b/src/datachain/query/dataset.py
@@ -3,7 +3,6 @@ import inspect
 import logging
 import os
 import random
-import re
 import string
 import subprocess
 import sys
@@ -36,7 +35,6 @@ from sqlalchemy.sql.selectable import Select
 
 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
 from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
-from datachain.client import Client
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -46,7 +44,6 @@ from datachain.dataset import DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.progress import CombinedDownloadCallback
 from datachain.sql.functions import rand
-from datachain.storage import Storage, StorageURI
 from datachain.utils import (
     batched,
     determine_processes,
@@ -77,9 +74,7 @@ INSERT_BATCH_SIZE = 10000
 
 PartitionByType = Union[ColumnElement, Sequence[ColumnElement]]
 JoinPredicateType = Union[str, ColumnClause, ColumnElement]
-
-# depending what type of dependency we are adding
-DatasetDependencyType = Union[tuple[str, int], StorageURI]
+DatasetDependencyType = tuple[str, int]
 
 logger = logging.getLogger("datachain")
 
@@ -185,38 +180,6 @@ class QueryStep(StartingStep):
         )
 
 
-@frozen
-class IndexingStep(StartingStep):
-    path: str
-    catalog: "Catalog"
-    kwargs: dict[str, Any]
-    recursive: Optional[bool] = True
-
-    def apply(self):
-        self.catalog.index([self.path], **self.kwargs)
-        uri, path = Client.parse_url(self.path)
-        _partial_id, partial_path = self.catalog.metastore.get_valid_partial_id(
-            uri, path
-        )
-        dataset = self.catalog.get_dataset(Storage.dataset_name(uri, partial_path))
-        dataset_rows = self.catalog.warehouse.dataset_rows(
-            dataset, dataset.latest_version
-        )
-
-        def q(*columns):
-            col_names = [c.name for c in columns]
-            return self.catalog.warehouse.nodes_dataset_query(
-                dataset_rows,
-                column_names=col_names,
-                path=path,
-                recursive=self.recursive,
-            )
-
-        storage = self.catalog.metastore.get_storage(uri)
-
-        return step_result(q, dataset_rows.c, dependencies=[storage.uri])
-
-
 def generator_then_call(generator, func: Callable):
     """
     Yield items from generator then execute a function and yield
@@ -230,7 +193,7 @@ def generator_then_call(generator, func: Callable):
 class DatasetDiffOperation(Step):
     """
     Abstract class for operations that are calculation some kind of diff between
-    datasets queries like subtract
+    datasets queries like subtract etc.
     """
 
     dq: "DatasetQuery"
@@ -304,28 +267,6 @@ class Subtract(DatasetDiffOperation):
         return sq.select().except_(sq.select().where(where_clause))
 
 
-@frozen
-class Changed(DatasetDiffOperation):
-    """
-    Calculates rows that are changed in a source query compared to target query
-    Changed means it has same source + path but different last_modified
-    Example:
-        >>> ds = DatasetQuery(name="dogs_cats") # some older dataset with embeddings
-        >>> ds_updated = (
-            DatasetQuery("gs://dvcx-datalakes/dogs-and-cats")
-            .filter(C.size > 1000) # we can also filter out source query
-            .changed(ds)
-            .add_signals(calc_embeddings) # calculae embeddings only on changed rows
-            .union(ds) # union with old dataset that's missing updated rows
-            .save("dogs_cats_updated")
-        )
-
-    """
-
-    def query(self, source_query: Select, target_query: Select) -> Select:
-        return self.catalog.warehouse.changed_query(source_query, target_query)
-
-
 def adjust_outputs(
     warehouse: "AbstractWarehouse", row: dict[str, Any], udf_col_types: list[tuple]
 ) -> dict[str, Any]:
@@ -1096,28 +1037,14 @@ class ResultIter:
 class DatasetQuery:
     def __init__(
         self,
-        path: str = "",
-        name: str = "",
+        name: str,
         version: Optional[int] = None,
         catalog: Optional["Catalog"] = None,
-        client_config=None,
-        recursive: Optional[bool] = True,
         session: Optional[Session] = None,
-        anon: bool = False,
-        indexing_feature_schema: Optional[dict] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,
-        update: Optional[bool] = False,
         in_memory: bool = False,
     ):
-
-        client_config = {}
-
-        if anon:
-            client_config["anon"] = True
-
-        self.session = Session.get(
-            session, catalog=catalog, client_config=client_config, in_memory=in_memory
-        )
+        self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
         self.catalog = catalog or self.session.catalog
         self.steps: list[Step] = []
         self._chunk_index: Optional[int] = None
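After this change a DatasetQuery can only start from a saved dataset; storage paths, client_config, anon and the other indexing options are gone. A short sketch (the dataset name is made up):

    from datachain.query import DatasetQuery

    dq = DatasetQuery("cats")  # the dataset name is now the first, required argument
    dq_v1 = DatasetQuery("cats", version=1)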
@@ -1131,26 +1058,14 @@ class DatasetQuery:
         self.feature_schema: Optional[dict] = None
         self.column_types: Optional[dict[str, Any]] = None
 
-
-
-
-
-
-
-            self.
-
-            self.version = version or ds.latest_version
-            self.feature_schema = ds.get_version(self.version).feature_schema
-            self.column_types = copy(ds.schema)
-            if "sys__id" in self.column_types:
-                self.column_types.pop("sys__id")
-            self.starting_step = QueryStep(self.catalog, name, self.version)
-        else:
-            raise ValueError("must provide path or name")
-
-    @staticmethod
-    def is_storage_path(path):
-        return bool(re.compile(r"^[a-zA-Z0-9]+://").match(path))
+        self.name = name
+        ds = self.catalog.get_dataset(name)
+        self.version = version or ds.latest_version
+        self.feature_schema = ds.get_version(self.version).feature_schema
+        self.column_types = copy(ds.schema)
+        if "sys__id" in self.column_types:
+            self.column_types.pop("sys__id")
+        self.starting_step = QueryStep(self.catalog, name, self.version)
 
     def __iter__(self):
         return iter(self.db_results())
@@ -1595,21 +1510,11 @@ class DatasetQuery:
         return query
 
     @detach
-    def subtract(self, dq: "DatasetQuery") -> "Self":
-        return self._subtract(dq, on=[("source", "source"), ("path", "path")])
-
-    @detach
-    def _subtract(self, dq: "DatasetQuery", on: Sequence[tuple[str, str]]) -> "Self":
+    def subtract(self, dq: "DatasetQuery", on: Sequence[tuple[str, str]]) -> "Self":
         query = self.clone()
         query.steps.append(Subtract(dq, self.catalog, on=on))
         return query
 
-    @detach
-    def changed(self, dq: "DatasetQuery") -> "Self":
-        query = self.clone()
-        query.steps.append(Changed(dq, self.catalog))
-        return query
-
     @detach
     def generate(
         self,
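subtract now always takes explicit pairs of left/right column names to match rows on. A sketch with hypothetical dataset names; the prefixed column pairs follow the file__ convention used elsewhere in this release, but the exact names a caller should pass are an assumption here:

    from datachain.query import DatasetQuery

    clean = DatasetQuery("clean")
    raw = DatasetQuery("raw")
    # keep rows of "clean" that have no match in "raw" on the given columns
    remaining = clean.subtract(
        raw, on=[("file__source", "file__source"), ("file__path", "file__path")]
    )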
@@ -1640,24 +1545,13 @@
 
     def _add_dependencies(self, dataset: "DatasetRecord", version: int):
         for dependency in self.dependencies:
-
-
-
-
-
-
-                    ds_dependency_version,
-                )
-            else:
-                # storage dependency - its name is a valid StorageURI
-                storage = self.catalog.metastore.get_storage(dependency)
-                self.catalog.metastore.add_storage_dependency(
-                    StorageURI(dataset.name),
-                    version,
-                    storage.uri,
-                    storage.timestamp_str,
-                )
-
+            ds_dependency_name, ds_dependency_version = dependency
+            self.catalog.metastore.add_dataset_dependency(
+                dataset.name,
+                version,
+                ds_dependency_name,
+                ds_dependency_version,
+            )
 
     def exec(self) -> "Self":
         """Execute the query."""
--- a/src/datachain/query/schema.py
+++ b/src/datachain/query/schema.py
@@ -19,6 +19,17 @@ if TYPE_CHECKING:
 DEFAULT_DELIMITER = "__"
 
 
+def file_signals(row, signal_name="file"):
+    # TODO this is workaround until we decide what to do with these classes
+    prefix = f"{signal_name}{DEFAULT_DELIMITER}"
+    return {
+        c_name.removeprefix(prefix): c_value
+        for c_name, c_value in row.items()
+        if c_name.startswith(prefix)
+        and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
+    }
+
+
 class ColumnMeta(type):
     @staticmethod
     def to_db_name(name: str) -> str:
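file_signals keeps only the top-level columns under one signal prefix and strips that prefix, dropping nested signals. For a made-up row:

    row = {  # hypothetical flat row of database columns
        "sys__id": 1,
        "file__source": "s3://my-bucket",
        "file__path": "images/cat.jpg",
        "file__size": 1024,
        "file__location__offset": 0,  # nested under the prefix, so it is filtered out
    }

    file_signals(row)
    # -> {"source": "s3://my-bucket", "path": "images/cat.jpg", "size": 1024}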
@@ -86,8 +97,8 @@ class Object(UDFParameter):
         cb: Callback = DEFAULT_CALLBACK,
         **kwargs,
     ) -> Any:
-        client = catalog.get_client(row["source"])
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         if cache:
             client.download(uid, callback=cb)
         with client.open_object(uid, use_cache=cache, cb=cb) as f:
@@ -103,8 +114,8 @@ class Object(UDFParameter):
         cb: Callback = DEFAULT_CALLBACK,
         **kwargs,
     ) -> Any:
-        client = catalog.get_client(row["source"])
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         if cache:
             await client._download(uid, callback=cb)
         obj = await mapper.to_thread(
@@ -129,8 +140,8 @@ class Stream(UDFParameter):
         cb: Callback = DEFAULT_CALLBACK,
         **kwargs,
     ) -> Any:
-        client = catalog.get_client(row["source"])
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         if cache:
             client.download(uid, callback=cb)
         return client.open_object(uid, use_cache=cache, cb=cb)
@@ -145,8 +156,8 @@ class Stream(UDFParameter):
         cb: Callback = DEFAULT_CALLBACK,
         **kwargs,
     ) -> Any:
-        client = catalog.get_client(row["source"])
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         if cache:
             await client._download(uid, callback=cb)
         return await mapper.to_thread(
@@ -178,8 +189,8 @@ class LocalFilename(UDFParameter):
                 # If the glob pattern is specified and the row filename
                 # does not match it, then return None
                 return None
-        client = catalog.get_client(row["source"])
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         client.download(uid, callback=cb)
         return client.cache.get_path(uid)
 
@@ -197,8 +208,8 @@ class LocalFilename(UDFParameter):
                 # If the glob pattern is specified and the row filename
                 # does not match it, then return None
                 return None
-        client = catalog.get_client(row["source"])
-        uid = catalog._get_row_uid(row)
+        client = catalog.get_client(row["file__source"])
+        uid = catalog._get_row_uid(file_signals(row))
         await client._download(uid, callback=cb)
         return client.cache.get_path(uid)
 