datachain 0.3.12__tar.gz → 0.3.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.3.12/src/datachain.egg-info → datachain-0.3.14}/PKG-INFO +1 -1
- {datachain-0.3.12 → datachain-0.3.14}/examples/computer_vision/iptc_exif_xmp_lib.py +7 -1
- {datachain-0.3.12 → datachain-0.3.14}/examples/computer_vision/llava2_image_desc_lib.py +7 -1
- {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/json-csv-reader.py +0 -2
- {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/torch-loader.py +6 -1
- {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/udfs/stateful.py +2 -2
- {datachain-0.3.12 → datachain-0.3.14}/noxfile.py +1 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/asyn.py +4 -9
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/cache.py +0 -1
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/catalog/catalog.py +3 -12
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/cli.py +4 -6
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/azure.py +1 -13
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/fsspec.py +7 -8
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/gcs.py +2 -13
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/hf.py +0 -10
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/local.py +3 -12
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/s3.py +9 -23
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/schema.py +4 -8
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/sqlite.py +10 -1
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/warehouse.py +17 -34
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/dc.py +0 -1
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/file.py +0 -3
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/listing.py +1 -2
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/model_store.py +2 -2
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/pytorch.py +32 -26
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/signal_schema.py +146 -58
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/listing.py +8 -10
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/node.py +3 -68
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/builtins.py +0 -14
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/schema.py +1 -16
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/utils.py +0 -3
- {datachain-0.3.12 → datachain-0.3.14/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.3.12 → datachain-0.3.14}/tests/conftest.py +35 -3
- {datachain-0.3.12 → datachain-0.3.14}/tests/data.py +11 -31
- {datachain-0.3.12 → datachain-0.3.14}/tests/examples/test_wds_e2e.py +10 -8
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_catalog.py +32 -9
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_datachain.py +164 -4
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_dataset_query.py +23 -228
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_feature_pickling.py +66 -1
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_ls.py +0 -15
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_pull.py +1 -11
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_query.py +3 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/test_cli_e2e.py +10 -3
- {datachain-0.3.12 → datachain-0.3.14}/tests/test_query_e2e.py +10 -3
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_datachain.py +1 -1
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_file.py +3 -7
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_signal_schema.py +244 -8
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_cache.py +3 -7
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_client_s3.py +0 -1
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_data_storage.py +28 -32
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_dataset.py +0 -6
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_id_generator.py +3 -1
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_listing.py +3 -2
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_udf.py +0 -3
- {datachain-0.3.12 → datachain-0.3.14}/tests/utils.py +1 -15
- {datachain-0.3.12 → datachain-0.3.14}/.cruft.json +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.gitattributes +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.github/codecov.yaml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.github/dependabot.yml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.github/workflows/release.yml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.github/workflows/tests.yml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.gitignore +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/.pre-commit-config.yaml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/LICENSE +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/README.rst +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/assets/datachain.svg +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/index.md +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/references/datachain.md +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/references/datatype.md +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/references/file.md +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/references/index.md +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/references/sql.md +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/references/torch.md +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/docs/references/udf.md +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/mkdocs.yml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/pyproject.toml +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/setup.cfg +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/__main__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/config.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/dataset.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/error.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/job.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/hf.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/progress.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/py.typed +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/dataset.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/params.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/session.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/storage.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/examples/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_client.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_datasets.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_listing.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_metrics.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_client.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_session.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_warehouse.py +0 -0
|
@@ -1,14 +1,8 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
-
from collections.abc import Awaitable, Coroutine, Iterable
|
|
2
|
+
from collections.abc import AsyncIterable, Awaitable, Coroutine, Iterable, Iterator
|
|
3
3
|
from concurrent.futures import ThreadPoolExecutor
|
|
4
4
|
from heapq import heappop, heappush
|
|
5
|
-
from typing import
|
|
6
|
-
Any,
|
|
7
|
-
Callable,
|
|
8
|
-
Generic,
|
|
9
|
-
Optional,
|
|
10
|
-
TypeVar,
|
|
11
|
-
)
|
|
5
|
+
from typing import Any, Callable, Generic, Optional, TypeVar
|
|
12
6
|
|
|
13
7
|
from fsspec.asyn import get_loop
|
|
14
8
|
|
|
@@ -16,6 +10,7 @@ ASYNC_WORKERS = 20
|
|
|
16
10
|
|
|
17
11
|
InputT = TypeVar("InputT", contravariant=True) # noqa: PLC0105
|
|
18
12
|
ResultT = TypeVar("ResultT", covariant=True) # noqa: PLC0105
|
|
13
|
+
T = TypeVar("T")
|
|
19
14
|
|
|
20
15
|
|
|
21
16
|
class AsyncMapper(Generic[InputT, ResultT]):
|
|
@@ -226,7 +221,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
|
|
|
226
221
|
self._push_result(self._next_yield, None)
|
|
227
222
|
|
|
228
223
|
|
|
229
|
-
def iter_over_async(ait, loop):
|
|
224
|
+
def iter_over_async(ait: AsyncIterable[T], loop) -> Iterator[T]:
|
|
230
225
|
"""Wrap an asynchronous iterator into a synchronous one"""
|
|
231
226
|
ait = ait.__aiter__()
|
|
232
227
|
|
|
@@ -62,7 +62,7 @@ from datachain.listing import Listing
|
|
|
62
62
|
from datachain.node import DirType, Node, NodeWithPath
|
|
63
63
|
from datachain.nodes_thread_pool import NodesThreadPool
|
|
64
64
|
from datachain.remote.studio import StudioClient
|
|
65
|
-
from datachain.sql.types import JSON, Boolean, DateTime,
|
|
65
|
+
from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String
|
|
66
66
|
from datachain.storage import Storage, StorageStatus, StorageURI
|
|
67
67
|
from datachain.utils import (
|
|
68
68
|
DataChainDir,
|
|
@@ -513,8 +513,6 @@ def find_column_to_str( # noqa: PLR0911
|
|
|
513
513
|
)
|
|
514
514
|
if column == "name":
|
|
515
515
|
return posixpath.basename(row[field_lookup["path"]]) or ""
|
|
516
|
-
if column == "owner":
|
|
517
|
-
return row[field_lookup["owner_name"]] or ""
|
|
518
516
|
if column == "path":
|
|
519
517
|
is_dir = row[field_lookup["dir_type"]] == DirType.DIR
|
|
520
518
|
path = row[field_lookup["path"]]
|
|
@@ -666,16 +664,12 @@ class Catalog:
|
|
|
666
664
|
source_metastore = self.metastore.clone(client.uri)
|
|
667
665
|
|
|
668
666
|
columns = [
|
|
669
|
-
Column("vtype", String),
|
|
670
|
-
Column("dir_type", Int),
|
|
671
667
|
Column("path", String),
|
|
672
668
|
Column("etag", String),
|
|
673
669
|
Column("version", String),
|
|
674
670
|
Column("is_latest", Boolean),
|
|
675
671
|
Column("last_modified", DateTime(timezone=True)),
|
|
676
672
|
Column("size", Int64),
|
|
677
|
-
Column("owner_name", String),
|
|
678
|
-
Column("owner_id", String),
|
|
679
673
|
Column("location", JSON),
|
|
680
674
|
Column("source", String),
|
|
681
675
|
]
|
|
@@ -1396,12 +1390,12 @@ class Catalog:
|
|
|
1396
1390
|
dataset = self.get_dataset(name)
|
|
1397
1391
|
return self.warehouse.dataset_table_export_file_names(dataset, version)
|
|
1398
1392
|
|
|
1399
|
-
def dataset_stats(self, name: str, version: int) -> DatasetStats:
|
|
1393
|
+
def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
|
|
1400
1394
|
"""
|
|
1401
1395
|
Returns tuple with dataset stats: total number of rows and total dataset size.
|
|
1402
1396
|
"""
|
|
1403
1397
|
dataset = self.get_dataset(name)
|
|
1404
|
-
dataset_version = dataset.get_version(version)
|
|
1398
|
+
dataset_version = dataset.get_version(version or dataset.latest_version)
|
|
1405
1399
|
return DatasetStats(
|
|
1406
1400
|
num_objects=dataset_version.num_objects,
|
|
1407
1401
|
size=dataset_version.size,
|
|
@@ -1516,7 +1510,6 @@ class Catalog:
|
|
|
1516
1510
|
row["etag"],
|
|
1517
1511
|
row["version"],
|
|
1518
1512
|
row["is_latest"],
|
|
1519
|
-
row["vtype"],
|
|
1520
1513
|
row["location"],
|
|
1521
1514
|
row["last_modified"],
|
|
1522
1515
|
)
|
|
@@ -1987,8 +1980,6 @@ class Catalog:
|
|
|
1987
1980
|
field_set.add("path")
|
|
1988
1981
|
elif column == "name":
|
|
1989
1982
|
field_set.add("path")
|
|
1990
|
-
elif column == "owner":
|
|
1991
|
-
field_set.add("owner_name")
|
|
1992
1983
|
elif column == "path":
|
|
1993
1984
|
field_set.add("dir_type")
|
|
1994
1985
|
field_set.add("path")
|
|
@@ -24,7 +24,7 @@ logger = logging.getLogger("datachain")
|
|
|
24
24
|
|
|
25
25
|
TTL_HUMAN = "4h"
|
|
26
26
|
TTL_INT = 4 * 60 * 60
|
|
27
|
-
FIND_COLUMNS = ["du", "name", "
|
|
27
|
+
FIND_COLUMNS = ["du", "name", "path", "size", "type"]
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
def human_time_type(value_str: str, can_be_none: bool = False) -> Optional[int]:
|
|
@@ -579,9 +579,8 @@ def _node_data_to_ls_values(row, long_format=False):
|
|
|
579
579
|
value = name + ending
|
|
580
580
|
if long_format:
|
|
581
581
|
last_modified = row[2]
|
|
582
|
-
owner_name = row[3]
|
|
583
582
|
timestamp = last_modified if not is_dir else None
|
|
584
|
-
return long_line_str(value, timestamp
|
|
583
|
+
return long_line_str(value, timestamp)
|
|
585
584
|
return value
|
|
586
585
|
|
|
587
586
|
|
|
@@ -599,7 +598,7 @@ def _ls_urls_flat(
|
|
|
599
598
|
if client_cls.is_root_url(source):
|
|
600
599
|
buckets = client_cls.ls_buckets(**catalog.client_config)
|
|
601
600
|
if long:
|
|
602
|
-
values = (long_line_str(b.name, b.created
|
|
601
|
+
values = (long_line_str(b.name, b.created) for b in buckets)
|
|
603
602
|
else:
|
|
604
603
|
values = (b.name for b in buckets)
|
|
605
604
|
yield source, values
|
|
@@ -607,7 +606,7 @@ def _ls_urls_flat(
|
|
|
607
606
|
found = False
|
|
608
607
|
fields = ["name", "dir_type"]
|
|
609
608
|
if long:
|
|
610
|
-
fields.
|
|
609
|
+
fields.append("last_modified")
|
|
611
610
|
for data_source, results in catalog.ls([source], fields=fields, **kwargs):
|
|
612
611
|
values = (_node_data_to_ls_values(r, long) for r in results)
|
|
613
612
|
found = True
|
|
@@ -683,7 +682,6 @@ def ls_remote(
|
|
|
683
682
|
entry = long_line_str(
|
|
684
683
|
row["name"] + ("/" if row["dir_type"] else ""),
|
|
685
684
|
row["last_modified"],
|
|
686
|
-
row["owner_name"],
|
|
687
685
|
)
|
|
688
686
|
print(format_ls_entry(entry))
|
|
689
687
|
else:
|
|
@@ -4,7 +4,6 @@ from adlfs import AzureBlobFileSystem
|
|
|
4
4
|
from tqdm import tqdm
|
|
5
5
|
|
|
6
6
|
from datachain.lib.file import File
|
|
7
|
-
from datachain.node import Entry
|
|
8
7
|
|
|
9
8
|
from .fsspec import DELIMITER, Client, ResultQueue
|
|
10
9
|
|
|
@@ -14,17 +13,6 @@ class AzureClient(Client):
|
|
|
14
13
|
PREFIX = "az://"
|
|
15
14
|
protocol = "az"
|
|
16
15
|
|
|
17
|
-
def convert_info(self, v: dict[str, Any], path: str) -> Entry:
|
|
18
|
-
version_id = v.get("version_id")
|
|
19
|
-
return Entry.from_file(
|
|
20
|
-
path=path,
|
|
21
|
-
etag=v.get("etag", "").strip('"'),
|
|
22
|
-
version=version_id or "",
|
|
23
|
-
is_latest=version_id is None or bool(v.get("is_current_version")),
|
|
24
|
-
last_modified=v["last_modified"],
|
|
25
|
-
size=v.get("size", ""),
|
|
26
|
-
)
|
|
27
|
-
|
|
28
16
|
def info_to_file(self, v: dict[str, Any], path: str) -> File:
|
|
29
17
|
version_id = v.get("version_id")
|
|
30
18
|
return File(
|
|
@@ -57,7 +45,7 @@ class AzureClient(Client):
|
|
|
57
45
|
continue
|
|
58
46
|
info = (await self.fs._details([b]))[0]
|
|
59
47
|
entries.append(
|
|
60
|
-
self.
|
|
48
|
+
self.info_to_file(info, self.rel_path(info["name"]))
|
|
61
49
|
)
|
|
62
50
|
if entries:
|
|
63
51
|
await result_queue.put(entries)
|
|
@@ -29,7 +29,7 @@ from tqdm import tqdm
|
|
|
29
29
|
from datachain.cache import DataChainCache, UniqueId
|
|
30
30
|
from datachain.client.fileslice import FileSlice, FileWrapper
|
|
31
31
|
from datachain.error import ClientError as DataChainClientError
|
|
32
|
-
from datachain.
|
|
32
|
+
from datachain.lib.file import File
|
|
33
33
|
from datachain.nodes_fetcher import NodesFetcher
|
|
34
34
|
from datachain.nodes_thread_pool import NodeChunk
|
|
35
35
|
from datachain.storage import StorageURI
|
|
@@ -45,7 +45,7 @@ DELIMITER = "/" # Path delimiter.
|
|
|
45
45
|
|
|
46
46
|
DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
|
|
47
47
|
|
|
48
|
-
ResultQueue = asyncio.Queue[Optional[Sequence[
|
|
48
|
+
ResultQueue = asyncio.Queue[Optional[Sequence[File]]]
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
def _is_win_local_path(uri: str) -> bool:
|
|
@@ -188,7 +188,7 @@ class Client(ABC):
|
|
|
188
188
|
|
|
189
189
|
async def get_current_etag(self, uid: UniqueId) -> str:
|
|
190
190
|
info = await self.fs._info(self.get_full_path(uid.path))
|
|
191
|
-
return self.
|
|
191
|
+
return self.info_to_file(info, "").etag
|
|
192
192
|
|
|
193
193
|
async def get_size(self, path: str) -> int:
|
|
194
194
|
return await self.fs._size(path)
|
|
@@ -198,7 +198,7 @@ class Client(ABC):
|
|
|
198
198
|
|
|
199
199
|
async def scandir(
|
|
200
200
|
self, start_prefix: str, method: str = "default"
|
|
201
|
-
) -> AsyncIterator[Sequence[
|
|
201
|
+
) -> AsyncIterator[Sequence[File]]:
|
|
202
202
|
try:
|
|
203
203
|
impl = getattr(self, f"_fetch_{method}")
|
|
204
204
|
except AttributeError:
|
|
@@ -264,7 +264,7 @@ class Client(ABC):
|
|
|
264
264
|
) -> None:
|
|
265
265
|
await self._fetch_nested(start_prefix, result_queue)
|
|
266
266
|
|
|
267
|
-
async def _fetch_dir(self, prefix, pbar, result_queue) -> set[str]:
|
|
267
|
+
async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]:
|
|
268
268
|
path = f"{self.name}/{prefix}"
|
|
269
269
|
infos = await self.ls_dir(path)
|
|
270
270
|
files = []
|
|
@@ -277,7 +277,7 @@ class Client(ABC):
|
|
|
277
277
|
if info["type"] == "directory":
|
|
278
278
|
subdirs.add(subprefix)
|
|
279
279
|
else:
|
|
280
|
-
files.append(self.
|
|
280
|
+
files.append(self.info_to_file(info, subprefix))
|
|
281
281
|
if files:
|
|
282
282
|
await result_queue.put(files)
|
|
283
283
|
found_count = len(subdirs) + len(files)
|
|
@@ -303,7 +303,7 @@ class Client(ABC):
|
|
|
303
303
|
return f"{self.PREFIX}{self.name}/{rel_path}"
|
|
304
304
|
|
|
305
305
|
@abstractmethod
|
|
306
|
-
def
|
|
306
|
+
def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...
|
|
307
307
|
|
|
308
308
|
def fetch_nodes(
|
|
309
309
|
self,
|
|
@@ -363,7 +363,6 @@ class Client(ABC):
|
|
|
363
363
|
parent["path"],
|
|
364
364
|
parent["size"],
|
|
365
365
|
parent["etag"],
|
|
366
|
-
vtype=parent["vtype"],
|
|
367
366
|
location=parent["location"],
|
|
368
367
|
)
|
|
369
368
|
f = self.open_object(parent_uid, use_cache=use_cache)
|
|
@@ -10,7 +10,6 @@ from gcsfs import GCSFileSystem
|
|
|
10
10
|
from tqdm import tqdm
|
|
11
11
|
|
|
12
12
|
from datachain.lib.file import File
|
|
13
|
-
from datachain.node import Entry
|
|
14
13
|
|
|
15
14
|
from .fsspec import DELIMITER, Client, ResultQueue
|
|
16
15
|
|
|
@@ -108,19 +107,9 @@ class GCSClient(Client):
|
|
|
108
107
|
finally:
|
|
109
108
|
await page_queue.put(None)
|
|
110
109
|
|
|
111
|
-
def _entry_from_dict(self, d: dict[str, Any]) ->
|
|
110
|
+
def _entry_from_dict(self, d: dict[str, Any]) -> File:
|
|
112
111
|
info = self.fs._process_object(self.name, d)
|
|
113
|
-
return self.
|
|
114
|
-
|
|
115
|
-
def convert_info(self, v: dict[str, Any], path: str) -> Entry:
|
|
116
|
-
return Entry.from_file(
|
|
117
|
-
path=path,
|
|
118
|
-
etag=v.get("etag", ""),
|
|
119
|
-
version=v.get("generation", ""),
|
|
120
|
-
is_latest=not v.get("timeDeleted"),
|
|
121
|
-
last_modified=self.parse_timestamp(v["updated"]),
|
|
122
|
-
size=v.get("size", ""),
|
|
123
|
-
)
|
|
112
|
+
return self.info_to_file(info, self.rel_path(info["name"]))
|
|
124
113
|
|
|
125
114
|
def info_to_file(self, v: dict[str, Any], path: str) -> File:
|
|
126
115
|
return File(
|
|
@@ -5,7 +5,6 @@ from typing import Any, cast
|
|
|
5
5
|
from huggingface_hub import HfFileSystem
|
|
6
6
|
|
|
7
7
|
from datachain.lib.file import File
|
|
8
|
-
from datachain.node import Entry
|
|
9
8
|
|
|
10
9
|
from .fsspec import Client
|
|
11
10
|
|
|
@@ -22,15 +21,6 @@ class HfClient(Client):
|
|
|
22
21
|
|
|
23
22
|
return cast(HfFileSystem, super().create_fs(**kwargs))
|
|
24
23
|
|
|
25
|
-
def convert_info(self, v: dict[str, Any], path: str) -> Entry:
|
|
26
|
-
return Entry.from_file(
|
|
27
|
-
path=path,
|
|
28
|
-
size=v["size"],
|
|
29
|
-
version=v["last_commit"].oid,
|
|
30
|
-
etag=v.get("blob_id", ""),
|
|
31
|
-
last_modified=v["last_commit"].date,
|
|
32
|
-
)
|
|
33
|
-
|
|
34
24
|
def info_to_file(self, v: dict[str, Any], path: str) -> File:
|
|
35
25
|
return File(
|
|
36
26
|
path=path,
|
|
@@ -7,8 +7,8 @@ from urllib.parse import urlparse
|
|
|
7
7
|
|
|
8
8
|
from fsspec.implementations.local import LocalFileSystem
|
|
9
9
|
|
|
10
|
+
from datachain.cache import UniqueId
|
|
10
11
|
from datachain.lib.file import File
|
|
11
|
-
from datachain.node import Entry
|
|
12
12
|
from datachain.storage import StorageURI
|
|
13
13
|
|
|
14
14
|
from .fsspec import Client
|
|
@@ -114,9 +114,9 @@ class FileClient(Client):
|
|
|
114
114
|
use_symlinks=use_symlinks,
|
|
115
115
|
)
|
|
116
116
|
|
|
117
|
-
async def get_current_etag(self, uid) -> str:
|
|
117
|
+
async def get_current_etag(self, uid: UniqueId) -> str:
|
|
118
118
|
info = self.fs.info(self.get_full_path(uid.path))
|
|
119
|
-
return self.
|
|
119
|
+
return self.info_to_file(info, "").etag
|
|
120
120
|
|
|
121
121
|
async def get_size(self, path: str) -> int:
|
|
122
122
|
return self.fs.size(path)
|
|
@@ -136,15 +136,6 @@ class FileClient(Client):
|
|
|
136
136
|
full_path += "/"
|
|
137
137
|
return full_path
|
|
138
138
|
|
|
139
|
-
def convert_info(self, v: dict[str, Any], path: str) -> Entry:
|
|
140
|
-
return Entry.from_file(
|
|
141
|
-
path=path,
|
|
142
|
-
etag=v["mtime"].hex(),
|
|
143
|
-
is_latest=True,
|
|
144
|
-
last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
|
|
145
|
-
size=v.get("size", ""),
|
|
146
|
-
)
|
|
147
|
-
|
|
148
139
|
def info_to_file(self, v: dict[str, Any], path: str) -> File:
|
|
149
140
|
return File(
|
|
150
141
|
source=self.uri,
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
-
from typing import Any, cast
|
|
2
|
+
from typing import Any, Optional, cast
|
|
3
3
|
|
|
4
4
|
from botocore.exceptions import NoCredentialsError
|
|
5
5
|
from s3fs import S3FileSystem
|
|
6
6
|
from tqdm import tqdm
|
|
7
7
|
|
|
8
8
|
from datachain.lib.file import File
|
|
9
|
-
from datachain.node import Entry
|
|
10
9
|
|
|
11
10
|
from .fsspec import DELIMITER, Client, ResultQueue
|
|
12
11
|
|
|
@@ -111,24 +110,23 @@ class ClientS3(Client):
|
|
|
111
110
|
) -> None:
|
|
112
111
|
await self._fetch_flat(start_prefix, result_queue)
|
|
113
112
|
|
|
114
|
-
def _entry_from_boto(self, v, bucket, versions=False):
|
|
115
|
-
return
|
|
113
|
+
def _entry_from_boto(self, v, bucket, versions=False) -> File:
|
|
114
|
+
return File(
|
|
115
|
+
source=self.uri,
|
|
116
116
|
path=v["Key"],
|
|
117
117
|
etag=v.get("ETag", "").strip('"'),
|
|
118
118
|
version=ClientS3.clean_s3_version(v.get("VersionId", "")),
|
|
119
119
|
is_latest=v.get("IsLatest", True),
|
|
120
120
|
last_modified=v.get("LastModified", ""),
|
|
121
121
|
size=v["Size"],
|
|
122
|
-
owner_name=v.get("Owner", {}).get("DisplayName", ""),
|
|
123
|
-
owner_id=v.get("Owner", {}).get("ID", ""),
|
|
124
122
|
)
|
|
125
123
|
|
|
126
124
|
async def _fetch_dir(
|
|
127
125
|
self,
|
|
128
126
|
prefix,
|
|
129
127
|
pbar,
|
|
130
|
-
result_queue,
|
|
131
|
-
):
|
|
128
|
+
result_queue: ResultQueue,
|
|
129
|
+
) -> set[str]:
|
|
132
130
|
if prefix:
|
|
133
131
|
prefix = prefix.lstrip(DELIMITER) + DELIMITER
|
|
134
132
|
files = []
|
|
@@ -143,7 +141,7 @@ class ClientS3(Client):
|
|
|
143
141
|
if info["type"] == "directory":
|
|
144
142
|
subdirs.add(subprefix)
|
|
145
143
|
else:
|
|
146
|
-
files.append(self.
|
|
144
|
+
files.append(self.info_to_file(info, subprefix))
|
|
147
145
|
pbar.update()
|
|
148
146
|
found = True
|
|
149
147
|
if not found:
|
|
@@ -154,20 +152,8 @@ class ClientS3(Client):
|
|
|
154
152
|
return subdirs
|
|
155
153
|
|
|
156
154
|
@staticmethod
|
|
157
|
-
def clean_s3_version(ver):
|
|
158
|
-
return ver if ver != "null" else ""
|
|
159
|
-
|
|
160
|
-
def convert_info(self, v: dict[str, Any], path: str) -> Entry:
|
|
161
|
-
return Entry.from_file(
|
|
162
|
-
path=path,
|
|
163
|
-
etag=v.get("ETag", "").strip('"'),
|
|
164
|
-
version=ClientS3.clean_s3_version(v.get("VersionId", "")),
|
|
165
|
-
is_latest=v.get("IsLatest", True),
|
|
166
|
-
last_modified=v.get("LastModified", ""),
|
|
167
|
-
size=v["size"],
|
|
168
|
-
owner_name=v.get("Owner", {}).get("DisplayName", ""),
|
|
169
|
-
owner_id=v.get("Owner", {}).get("ID", ""),
|
|
170
|
-
)
|
|
155
|
+
def clean_s3_version(ver: Optional[str]) -> str:
|
|
156
|
+
return ver if (ver is not None and ver != "null") else ""
|
|
171
157
|
|
|
172
158
|
def info_to_file(self, v: dict[str, Any], path: str) -> File:
|
|
173
159
|
return File(
|
|
@@ -10,9 +10,8 @@ from typing import (
|
|
|
10
10
|
|
|
11
11
|
import sqlalchemy as sa
|
|
12
12
|
from sqlalchemy.sql import func as f
|
|
13
|
-
from sqlalchemy.sql.expression import null, true
|
|
13
|
+
from sqlalchemy.sql.expression import false, null, true
|
|
14
14
|
|
|
15
|
-
from datachain.node import DirType
|
|
16
15
|
from datachain.sql.functions import path
|
|
17
16
|
from datachain.sql.types import Int, SQLType, UInt64
|
|
18
17
|
|
|
@@ -81,8 +80,7 @@ class DirExpansion:
|
|
|
81
80
|
def base_select(q):
|
|
82
81
|
return sa.select(
|
|
83
82
|
q.c.sys__id,
|
|
84
|
-
|
|
85
|
-
(q.c.dir_type == DirType.DIR).label("is_dir"),
|
|
83
|
+
false().label("is_dir"),
|
|
86
84
|
q.c.source,
|
|
87
85
|
q.c.path,
|
|
88
86
|
q.c.version,
|
|
@@ -94,7 +92,6 @@ class DirExpansion:
|
|
|
94
92
|
return (
|
|
95
93
|
sa.select(
|
|
96
94
|
f.min(q.c.sys__id).label("sys__id"),
|
|
97
|
-
q.c.vtype,
|
|
98
95
|
q.c.is_dir,
|
|
99
96
|
q.c.source,
|
|
100
97
|
q.c.path,
|
|
@@ -102,8 +99,8 @@ class DirExpansion:
|
|
|
102
99
|
f.max(q.c.location).label("location"),
|
|
103
100
|
)
|
|
104
101
|
.select_from(q)
|
|
105
|
-
.group_by(q.c.source, q.c.path, q.c.
|
|
106
|
-
.order_by(q.c.source, q.c.path, q.c.
|
|
102
|
+
.group_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
|
|
103
|
+
.order_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
|
|
107
104
|
)
|
|
108
105
|
|
|
109
106
|
@classmethod
|
|
@@ -113,7 +110,6 @@ class DirExpansion:
|
|
|
113
110
|
q = q.union_all(
|
|
114
111
|
sa.select(
|
|
115
112
|
sa.literal(-1).label("sys__id"),
|
|
116
|
-
sa.literal("").label("vtype"),
|
|
117
113
|
true().label("is_dir"),
|
|
118
114
|
q.c.source,
|
|
119
115
|
parent.label("path"),
|
|
@@ -43,6 +43,8 @@ if TYPE_CHECKING:
|
|
|
43
43
|
from sqlalchemy.sql.elements import ColumnElement
|
|
44
44
|
from sqlalchemy.types import TypeEngine
|
|
45
45
|
|
|
46
|
+
from datachain.lib.file import File
|
|
47
|
+
|
|
46
48
|
|
|
47
49
|
logger = logging.getLogger("datachain")
|
|
48
50
|
|
|
@@ -58,6 +60,10 @@ quote_schema = sqlite_dialect.identifier_preparer.quote_schema
|
|
|
58
60
|
quote = sqlite_dialect.identifier_preparer.quote
|
|
59
61
|
|
|
60
62
|
|
|
63
|
+
def _get_in_memory_uri():
|
|
64
|
+
return "file::memory:?cache=shared"
|
|
65
|
+
|
|
66
|
+
|
|
61
67
|
def get_retry_sleep_sec(retry_count: int) -> int:
|
|
62
68
|
return RETRY_START_SEC * (RETRY_FACTOR**retry_count)
|
|
63
69
|
|
|
@@ -119,7 +125,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
119
125
|
if db_file == ":memory:":
|
|
120
126
|
# Enable multithreaded usage of the same in-memory db
|
|
121
127
|
db = sqlite3.connect(
|
|
122
|
-
|
|
128
|
+
_get_in_memory_uri(), uri=True, detect_types=DETECT_TYPES
|
|
123
129
|
)
|
|
124
130
|
else:
|
|
125
131
|
db = sqlite3.connect(
|
|
@@ -704,6 +710,9 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
704
710
|
|
|
705
711
|
self.db.execute(insert_query)
|
|
706
712
|
|
|
713
|
+
def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
|
|
714
|
+
return (e.model_dump() for e in entries)
|
|
715
|
+
|
|
707
716
|
def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
|
|
708
717
|
rows = list(rows)
|
|
709
718
|
if not rows:
|