datachain 0.3.13__tar.gz → 0.3.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.3.13/src/datachain.egg-info → datachain-0.3.15}/PKG-INFO +1 -1
- {datachain-0.3.13 → datachain-0.3.15}/examples/computer_vision/iptc_exif_xmp_lib.py +7 -1
- {datachain-0.3.13 → datachain-0.3.15}/examples/computer_vision/llava2_image_desc_lib.py +7 -1
- {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/json-csv-reader.py +0 -2
- {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/torch-loader.py +6 -1
- {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/udfs/stateful.py +2 -2
- {datachain-0.3.13 → datachain-0.3.15}/noxfile.py +1 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/asyn.py +4 -9
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/catalog/catalog.py +20 -31
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/azure.py +1 -13
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/fsspec.py +16 -15
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/gcs.py +2 -13
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/hf.py +0 -10
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/local.py +3 -12
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/s3.py +9 -19
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/sqlite.py +10 -1
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/warehouse.py +11 -17
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/dataset.py +1 -1
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/arrow.py +51 -16
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/dc.py +7 -2
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/file.py +76 -2
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/hf.py +23 -6
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/listing.py +8 -7
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/listing_info.py +2 -2
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/model_store.py +2 -2
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/pytorch.py +32 -26
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/signal_schema.py +157 -60
- datachain-0.3.15/src/datachain/lib/tar.py +33 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/webdataset.py +3 -59
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/listing.py +6 -8
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/node.py +0 -43
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/dataset.py +2 -6
- {datachain-0.3.13 → datachain-0.3.15/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/conftest.py +35 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/data.py +11 -11
- {datachain-0.3.13 → datachain-0.3.15}/tests/examples/test_wds_e2e.py +10 -8
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_catalog.py +28 -3
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_datachain.py +198 -5
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_dataset_query.py +6 -205
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_datasets.py +4 -3
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_feature_pickling.py +66 -1
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_listing.py +2 -1
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_pull.py +1 -2
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_query.py +3 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/test_cli_e2e.py +10 -3
- {datachain-0.3.13 → datachain-0.3.15}/tests/test_query_e2e.py +10 -3
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_arrow.py +24 -5
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_datachain.py +3 -2
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_datachain_bootstrap.py +38 -19
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_file.py +84 -1
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_hf.py +8 -8
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_signal_schema.py +260 -8
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_client.py +32 -24
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_id_generator.py +3 -1
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_listing.py +3 -2
- {datachain-0.3.13 → datachain-0.3.15}/.cruft.json +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.gitattributes +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.github/codecov.yaml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.github/dependabot.yml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.github/workflows/release.yml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.github/workflows/tests.yml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.gitignore +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/.pre-commit-config.yaml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/LICENSE +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/README.rst +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/assets/datachain.svg +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/index.md +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/references/datachain.md +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/references/datatype.md +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/references/file.md +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/references/index.md +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/references/sql.md +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/references/torch.md +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/docs/references/udf.md +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/mkdocs.yml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/pyproject.toml +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/setup.cfg +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/__main__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/cache.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/cli.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/config.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/error.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/job.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/progress.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/py.typed +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/builtins.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/params.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/schema.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/session.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/storage.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain/utils.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/examples/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_client.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_ls.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_metrics.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_session.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_udf.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.13 → datachain-0.3.15}/tests/utils.py +0 -0
|
@@ -1,14 +1,8 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
-
from collections.abc import Awaitable, Coroutine, Iterable
|
|
2
|
+
from collections.abc import AsyncIterable, Awaitable, Coroutine, Iterable, Iterator
|
|
3
3
|
from concurrent.futures import ThreadPoolExecutor
|
|
4
4
|
from heapq import heappop, heappush
|
|
5
|
-
from typing import (
|
|
6
|
-
Any,
|
|
7
|
-
Callable,
|
|
8
|
-
Generic,
|
|
9
|
-
Optional,
|
|
10
|
-
TypeVar,
|
|
11
|
-
)
|
|
5
|
+
from typing import Any, Callable, Generic, Optional, TypeVar
|
|
12
6
|
|
|
13
7
|
from fsspec.asyn import get_loop
|
|
14
8
|
|
|
@@ -16,6 +10,7 @@ ASYNC_WORKERS = 20
|
|
|
16
10
|
|
|
17
11
|
InputT = TypeVar("InputT", contravariant=True) # noqa: PLC0105
|
|
18
12
|
ResultT = TypeVar("ResultT", covariant=True) # noqa: PLC0105
|
|
13
|
+
T = TypeVar("T")
|
|
19
14
|
|
|
20
15
|
|
|
21
16
|
class AsyncMapper(Generic[InputT, ResultT]):
|
|
@@ -226,7 +221,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
|
|
|
226
221
|
self._push_result(self._next_yield, None)
|
|
227
222
|
|
|
228
223
|
|
|
229
|
-
def iter_over_async(ait, loop):
|
|
224
|
+
def iter_over_async(ait: AsyncIterable[T], loop) -> Iterator[T]:
|
|
230
225
|
"""Wrap an asynchronous iterator into a synchronous one"""
|
|
231
226
|
ait = ait.__aiter__()
|
|
232
227
|
|
|
@@ -621,10 +621,6 @@ class Catalog:
|
|
|
621
621
|
code_ast.body[-1:] = new_expressions
|
|
622
622
|
return code_ast
|
|
623
623
|
|
|
624
|
-
def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
|
|
625
|
-
config = config or self.client_config
|
|
626
|
-
return Client.parse_url(uri, self.cache, **config)
|
|
627
|
-
|
|
628
624
|
def get_client(self, uri: StorageURI, **config: Any) -> Client:
|
|
629
625
|
"""
|
|
630
626
|
Return the client corresponding to the given source `uri`.
|
|
@@ -651,17 +647,16 @@ class Catalog:
|
|
|
651
647
|
partial_path: Optional[str]
|
|
652
648
|
|
|
653
649
|
client_config = client_config or self.client_config
|
|
654
|
-
|
|
650
|
+
uri, path = Client.parse_url(source)
|
|
651
|
+
client = Client.get_client(source, self.cache, **client_config)
|
|
655
652
|
stem = os.path.basename(os.path.normpath(path))
|
|
656
653
|
prefix = (
|
|
657
654
|
posixpath.dirname(path)
|
|
658
655
|
if glob.has_magic(stem) or client.fs.isfile(source)
|
|
659
656
|
else path
|
|
660
657
|
)
|
|
661
|
-
storage_dataset_name = Storage.dataset_name(
|
|
662
|
-
|
|
663
|
-
)
|
|
664
|
-
source_metastore = self.metastore.clone(client.uri)
|
|
658
|
+
storage_dataset_name = Storage.dataset_name(uri, posixpath.join(prefix, ""))
|
|
659
|
+
source_metastore = self.metastore.clone(uri)
|
|
665
660
|
|
|
666
661
|
columns = [
|
|
667
662
|
Column("path", String),
|
|
@@ -675,15 +670,13 @@ class Catalog:
|
|
|
675
670
|
]
|
|
676
671
|
|
|
677
672
|
if skip_indexing:
|
|
678
|
-
source_metastore.create_storage_if_not_registered(
|
|
679
|
-
storage = source_metastore.get_storage(
|
|
680
|
-
source_metastore.init_partial_id(
|
|
681
|
-
partial_id = source_metastore.get_next_partial_id(
|
|
673
|
+
source_metastore.create_storage_if_not_registered(uri)
|
|
674
|
+
storage = source_metastore.get_storage(uri)
|
|
675
|
+
source_metastore.init_partial_id(uri)
|
|
676
|
+
partial_id = source_metastore.get_next_partial_id(uri)
|
|
682
677
|
|
|
683
|
-
source_metastore = self.metastore.clone(
|
|
684
|
-
|
|
685
|
-
)
|
|
686
|
-
source_metastore.init(client.uri)
|
|
678
|
+
source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
|
|
679
|
+
source_metastore.init(uri)
|
|
687
680
|
|
|
688
681
|
source_warehouse = self.warehouse.clone()
|
|
689
682
|
dataset = self.create_dataset(
|
|
@@ -701,20 +694,16 @@ class Catalog:
|
|
|
701
694
|
in_progress,
|
|
702
695
|
partial_id,
|
|
703
696
|
partial_path,
|
|
704
|
-
) = source_metastore.register_storage_for_indexing(
|
|
705
|
-
client.uri, force_update, prefix
|
|
706
|
-
)
|
|
697
|
+
) = source_metastore.register_storage_for_indexing(uri, force_update, prefix)
|
|
707
698
|
if in_progress:
|
|
708
699
|
raise PendingIndexingError(f"Pending indexing operation: uri={storage.uri}")
|
|
709
700
|
|
|
710
701
|
if not need_index:
|
|
711
702
|
assert partial_id is not None
|
|
712
703
|
assert partial_path is not None
|
|
713
|
-
source_metastore = self.metastore.clone(
|
|
714
|
-
uri=client.uri, partial_id=partial_id
|
|
715
|
-
)
|
|
704
|
+
source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
|
|
716
705
|
source_warehouse = self.warehouse.clone()
|
|
717
|
-
dataset = self.get_dataset(Storage.dataset_name(
|
|
706
|
+
dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
|
|
718
707
|
lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
|
|
719
708
|
logger.debug(
|
|
720
709
|
"Using cached listing %s. Valid till: %s",
|
|
@@ -731,11 +720,11 @@ class Catalog:
|
|
|
731
720
|
|
|
732
721
|
return lst, path
|
|
733
722
|
|
|
734
|
-
source_metastore.init_partial_id(
|
|
735
|
-
partial_id = source_metastore.get_next_partial_id(
|
|
723
|
+
source_metastore.init_partial_id(uri)
|
|
724
|
+
partial_id = source_metastore.get_next_partial_id(uri)
|
|
736
725
|
|
|
737
|
-
source_metastore.init(
|
|
738
|
-
source_metastore = self.metastore.clone(uri=
|
|
726
|
+
source_metastore.init(uri)
|
|
727
|
+
source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
|
|
739
728
|
|
|
740
729
|
source_warehouse = self.warehouse.clone()
|
|
741
730
|
|
|
@@ -1370,7 +1359,7 @@ class Catalog:
|
|
|
1370
1359
|
|
|
1371
1360
|
def signed_url(self, source: str, path: str, client_config=None) -> str:
|
|
1372
1361
|
client_config = client_config or self.client_config
|
|
1373
|
-
client
|
|
1362
|
+
client = Client.get_client(source, self.cache, **client_config)
|
|
1374
1363
|
return client.url(path)
|
|
1375
1364
|
|
|
1376
1365
|
def export_dataset_table(
|
|
@@ -1390,12 +1379,12 @@ class Catalog:
|
|
|
1390
1379
|
dataset = self.get_dataset(name)
|
|
1391
1380
|
return self.warehouse.dataset_table_export_file_names(dataset, version)
|
|
1392
1381
|
|
|
1393
|
-
def dataset_stats(self, name: str, version: int) -> DatasetStats:
|
|
1382
|
+
def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
|
|
1394
1383
|
"""
|
|
1395
1384
|
Returns tuple with dataset stats: total number of rows and total dataset size.
|
|
1396
1385
|
"""
|
|
1397
1386
|
dataset = self.get_dataset(name)
|
|
1398
|
-
dataset_version = dataset.get_version(version)
|
|
1387
|
+
dataset_version = dataset.get_version(version or dataset.latest_version)
|
|
1399
1388
|
return DatasetStats(
|
|
1400
1389
|
num_objects=dataset_version.num_objects,
|
|
1401
1390
|
size=dataset_version.size,
|
|
@@ -4,7 +4,6 @@ from adlfs import AzureBlobFileSystem
|
|
|
4
4
|
from tqdm import tqdm
|
|
5
5
|
|
|
6
6
|
from datachain.lib.file import File
|
|
7
|
-
from datachain.node import Entry
|
|
8
7
|
|
|
9
8
|
from .fsspec import DELIMITER, Client, ResultQueue
|
|
10
9
|
|
|
@@ -14,17 +13,6 @@ class AzureClient(Client):
|
|
|
14
13
|
PREFIX = "az://"
|
|
15
14
|
protocol = "az"
|
|
16
15
|
|
|
17
|
-
def convert_info(self, v: dict[str, Any], path: str) -> Entry:
|
|
18
|
-
version_id = v.get("version_id")
|
|
19
|
-
return Entry.from_file(
|
|
20
|
-
path=path,
|
|
21
|
-
etag=v.get("etag", "").strip('"'),
|
|
22
|
-
version=version_id or "",
|
|
23
|
-
is_latest=version_id is None or bool(v.get("is_current_version")),
|
|
24
|
-
last_modified=v["last_modified"],
|
|
25
|
-
size=v.get("size", ""),
|
|
26
|
-
)
|
|
27
|
-
|
|
28
16
|
def info_to_file(self, v: dict[str, Any], path: str) -> File:
|
|
29
17
|
version_id = v.get("version_id")
|
|
30
18
|
return File(
|
|
@@ -57,7 +45,7 @@ class AzureClient(Client):
|
|
|
57
45
|
continue
|
|
58
46
|
info = (await self.fs._details([b]))[0]
|
|
59
47
|
entries.append(
|
|
60
|
-
self.convert_info(info, self.rel_path(info["name"]))
|
|
48
|
+
self.info_to_file(info, self.rel_path(info["name"]))
|
|
61
49
|
)
|
|
62
50
|
if entries:
|
|
63
51
|
await result_queue.put(entries)
|
|
@@ -29,7 +29,7 @@ from tqdm import tqdm
|
|
|
29
29
|
from datachain.cache import DataChainCache, UniqueId
|
|
30
30
|
from datachain.client.fileslice import FileSlice, FileWrapper
|
|
31
31
|
from datachain.error import ClientError as DataChainClientError
|
|
32
|
-
from datachain.node import Entry
|
|
32
|
+
from datachain.lib.file import File
|
|
33
33
|
from datachain.nodes_fetcher import NodesFetcher
|
|
34
34
|
from datachain.nodes_thread_pool import NodeChunk
|
|
35
35
|
from datachain.storage import StorageURI
|
|
@@ -45,7 +45,7 @@ DELIMITER = "/" # Path delimiter.
|
|
|
45
45
|
|
|
46
46
|
DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
|
|
47
47
|
|
|
48
|
-
ResultQueue = asyncio.Queue[Optional[Sequence[Entry]]]
|
|
48
|
+
ResultQueue = asyncio.Queue[Optional[Sequence[File]]]
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
def _is_win_local_path(uri: str) -> bool:
|
|
@@ -116,15 +116,16 @@ class Client(ABC):
|
|
|
116
116
|
return DATA_SOURCE_URI_PATTERN.match(name) is not None
|
|
117
117
|
|
|
118
118
|
@staticmethod
|
|
119
|
-
def parse_url(
|
|
120
|
-
source
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
119
|
+
def parse_url(source: str) -> tuple[StorageURI, str]:
|
|
120
|
+
cls = Client.get_implementation(source)
|
|
121
|
+
storage_name, rel_path = cls.split_url(source)
|
|
122
|
+
return cls.get_uri(storage_name), rel_path
|
|
123
|
+
|
|
124
|
+
@staticmethod
|
|
125
|
+
def get_client(source: str, cache: DataChainCache, **kwargs) -> "Client":
|
|
124
126
|
cls = Client.get_implementation(source)
|
|
125
|
-
storage_url,
|
|
126
|
-
|
|
127
|
-
return client, rel_path
|
|
127
|
+
storage_url, _ = cls.split_url(source)
|
|
128
|
+
return cls.from_name(storage_url, cache, kwargs)
|
|
128
129
|
|
|
129
130
|
@classmethod
|
|
130
131
|
def create_fs(cls, **kwargs) -> "AbstractFileSystem":
|
|
@@ -188,7 +189,7 @@ class Client(ABC):
|
|
|
188
189
|
|
|
189
190
|
async def get_current_etag(self, uid: UniqueId) -> str:
|
|
190
191
|
info = await self.fs._info(self.get_full_path(uid.path))
|
|
191
|
-
return self.
|
|
192
|
+
return self.info_to_file(info, "").etag
|
|
192
193
|
|
|
193
194
|
async def get_size(self, path: str) -> int:
|
|
194
195
|
return await self.fs._size(path)
|
|
@@ -198,7 +199,7 @@ class Client(ABC):
|
|
|
198
199
|
|
|
199
200
|
async def scandir(
|
|
200
201
|
self, start_prefix: str, method: str = "default"
|
|
201
|
-
) -> AsyncIterator[Sequence[Entry]]:
|
|
202
|
+
) -> AsyncIterator[Sequence[File]]:
|
|
202
203
|
try:
|
|
203
204
|
impl = getattr(self, f"_fetch_{method}")
|
|
204
205
|
except AttributeError:
|
|
@@ -264,7 +265,7 @@ class Client(ABC):
|
|
|
264
265
|
) -> None:
|
|
265
266
|
await self._fetch_nested(start_prefix, result_queue)
|
|
266
267
|
|
|
267
|
-
async def _fetch_dir(self, prefix, pbar, result_queue) -> set[str]:
|
|
268
|
+
async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]:
|
|
268
269
|
path = f"{self.name}/{prefix}"
|
|
269
270
|
infos = await self.ls_dir(path)
|
|
270
271
|
files = []
|
|
@@ -277,7 +278,7 @@ class Client(ABC):
|
|
|
277
278
|
if info["type"] == "directory":
|
|
278
279
|
subdirs.add(subprefix)
|
|
279
280
|
else:
|
|
280
|
-
files.append(self.convert_info(info, subprefix))
|
|
281
|
+
files.append(self.info_to_file(info, subprefix))
|
|
281
282
|
if files:
|
|
282
283
|
await result_queue.put(files)
|
|
283
284
|
found_count = len(subdirs) + len(files)
|
|
@@ -303,7 +304,7 @@ class Client(ABC):
|
|
|
303
304
|
return f"{self.PREFIX}{self.name}/{rel_path}"
|
|
304
305
|
|
|
305
306
|
@abstractmethod
|
|
306
|
-
def
|
|
307
|
+
def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...
|
|
307
308
|
|
|
308
309
|
def fetch_nodes(
|
|
309
310
|
self,
|
|
@@ -10,7 +10,6 @@ from gcsfs import GCSFileSystem
|
|
|
10
10
|
from tqdm import tqdm
|
|
11
11
|
|
|
12
12
|
from datachain.lib.file import File
|
|
13
|
-
from datachain.node import Entry
|
|
14
13
|
|
|
15
14
|
from .fsspec import DELIMITER, Client, ResultQueue
|
|
16
15
|
|
|
@@ -108,19 +107,9 @@ class GCSClient(Client):
|
|
|
108
107
|
finally:
|
|
109
108
|
await page_queue.put(None)
|
|
110
109
|
|
|
111
|
-
def _entry_from_dict(self, d: dict[str, Any]) -> Entry:
|
|
110
|
+
def _entry_from_dict(self, d: dict[str, Any]) -> File:
|
|
112
111
|
info = self.fs._process_object(self.name, d)
|
|
113
|
-
return self.convert_info(info, self.rel_path(info["name"]))
|
|
114
|
-
|
|
115
|
-
def convert_info(self, v: dict[str, Any], path: str) -> Entry:
|
|
116
|
-
return Entry.from_file(
|
|
117
|
-
path=path,
|
|
118
|
-
etag=v.get("etag", ""),
|
|
119
|
-
version=v.get("generation", ""),
|
|
120
|
-
is_latest=not v.get("timeDeleted"),
|
|
121
|
-
last_modified=self.parse_timestamp(v["updated"]),
|
|
122
|
-
size=v.get("size", ""),
|
|
123
|
-
)
|
|
112
|
+
return self.info_to_file(info, self.rel_path(info["name"]))
|
|
124
113
|
|
|
125
114
|
def info_to_file(self, v: dict[str, Any], path: str) -> File:
|
|
126
115
|
return File(
|
|
@@ -5,7 +5,6 @@ from typing import Any, cast
|
|
|
5
5
|
from huggingface_hub import HfFileSystem
|
|
6
6
|
|
|
7
7
|
from datachain.lib.file import File
|
|
8
|
-
from datachain.node import Entry
|
|
9
8
|
|
|
10
9
|
from .fsspec import Client
|
|
11
10
|
|
|
@@ -22,15 +21,6 @@ class HfClient(Client):
|
|
|
22
21
|
|
|
23
22
|
return cast(HfFileSystem, super().create_fs(**kwargs))
|
|
24
23
|
|
|
25
|
-
def convert_info(self, v: dict[str, Any], path: str) -> Entry:
|
|
26
|
-
return Entry.from_file(
|
|
27
|
-
path=path,
|
|
28
|
-
size=v["size"],
|
|
29
|
-
version=v["last_commit"].oid,
|
|
30
|
-
etag=v.get("blob_id", ""),
|
|
31
|
-
last_modified=v["last_commit"].date,
|
|
32
|
-
)
|
|
33
|
-
|
|
34
24
|
def info_to_file(self, v: dict[str, Any], path: str) -> File:
|
|
35
25
|
return File(
|
|
36
26
|
path=path,
|
|
@@ -7,8 +7,8 @@ from urllib.parse import urlparse
|
|
|
7
7
|
|
|
8
8
|
from fsspec.implementations.local import LocalFileSystem
|
|
9
9
|
|
|
10
|
+
from datachain.cache import UniqueId
|
|
10
11
|
from datachain.lib.file import File
|
|
11
|
-
from datachain.node import Entry
|
|
12
12
|
from datachain.storage import StorageURI
|
|
13
13
|
|
|
14
14
|
from .fsspec import Client
|
|
@@ -114,9 +114,9 @@ class FileClient(Client):
|
|
|
114
114
|
use_symlinks=use_symlinks,
|
|
115
115
|
)
|
|
116
116
|
|
|
117
|
-
async def get_current_etag(self, uid) -> str:
|
|
117
|
+
async def get_current_etag(self, uid: UniqueId) -> str:
|
|
118
118
|
info = self.fs.info(self.get_full_path(uid.path))
|
|
119
|
-
return self.
|
|
119
|
+
return self.info_to_file(info, "").etag
|
|
120
120
|
|
|
121
121
|
async def get_size(self, path: str) -> int:
|
|
122
122
|
return self.fs.size(path)
|
|
@@ -136,15 +136,6 @@ class FileClient(Client):
|
|
|
136
136
|
full_path += "/"
|
|
137
137
|
return full_path
|
|
138
138
|
|
|
139
|
-
def convert_info(self, v: dict[str, Any], path: str) -> Entry:
|
|
140
|
-
return Entry.from_file(
|
|
141
|
-
path=path,
|
|
142
|
-
etag=v["mtime"].hex(),
|
|
143
|
-
is_latest=True,
|
|
144
|
-
last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
|
|
145
|
-
size=v.get("size", ""),
|
|
146
|
-
)
|
|
147
|
-
|
|
148
139
|
def info_to_file(self, v: dict[str, Any], path: str) -> File:
|
|
149
140
|
return File(
|
|
150
141
|
source=self.uri,
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
-
from typing import Any, cast
|
|
2
|
+
from typing import Any, Optional, cast
|
|
3
3
|
|
|
4
4
|
from botocore.exceptions import NoCredentialsError
|
|
5
5
|
from s3fs import S3FileSystem
|
|
6
6
|
from tqdm import tqdm
|
|
7
7
|
|
|
8
8
|
from datachain.lib.file import File
|
|
9
|
-
from datachain.node import Entry
|
|
10
9
|
|
|
11
10
|
from .fsspec import DELIMITER, Client, ResultQueue
|
|
12
11
|
|
|
@@ -111,8 +110,9 @@ class ClientS3(Client):
|
|
|
111
110
|
) -> None:
|
|
112
111
|
await self._fetch_flat(start_prefix, result_queue)
|
|
113
112
|
|
|
114
|
-
def _entry_from_boto(self, v, bucket, versions=False):
|
|
115
|
-
return
|
|
113
|
+
def _entry_from_boto(self, v, bucket, versions=False) -> File:
|
|
114
|
+
return File(
|
|
115
|
+
source=self.uri,
|
|
116
116
|
path=v["Key"],
|
|
117
117
|
etag=v.get("ETag", "").strip('"'),
|
|
118
118
|
version=ClientS3.clean_s3_version(v.get("VersionId", "")),
|
|
@@ -125,8 +125,8 @@ class ClientS3(Client):
|
|
|
125
125
|
self,
|
|
126
126
|
prefix,
|
|
127
127
|
pbar,
|
|
128
|
-
result_queue,
|
|
129
|
-
):
|
|
128
|
+
result_queue: ResultQueue,
|
|
129
|
+
) -> set[str]:
|
|
130
130
|
if prefix:
|
|
131
131
|
prefix = prefix.lstrip(DELIMITER) + DELIMITER
|
|
132
132
|
files = []
|
|
@@ -141,7 +141,7 @@ class ClientS3(Client):
|
|
|
141
141
|
if info["type"] == "directory":
|
|
142
142
|
subdirs.add(subprefix)
|
|
143
143
|
else:
|
|
144
|
-
files.append(self.
|
|
144
|
+
files.append(self.info_to_file(info, subprefix))
|
|
145
145
|
pbar.update()
|
|
146
146
|
found = True
|
|
147
147
|
if not found:
|
|
@@ -152,18 +152,8 @@ class ClientS3(Client):
|
|
|
152
152
|
return subdirs
|
|
153
153
|
|
|
154
154
|
@staticmethod
|
|
155
|
-
def clean_s3_version(ver):
|
|
156
|
-
return ver if ver != "null" else ""
|
|
157
|
-
|
|
158
|
-
def convert_info(self, v: dict[str, Any], path: str) -> Entry:
|
|
159
|
-
return Entry.from_file(
|
|
160
|
-
path=path,
|
|
161
|
-
etag=v.get("ETag", "").strip('"'),
|
|
162
|
-
version=ClientS3.clean_s3_version(v.get("VersionId", "")),
|
|
163
|
-
is_latest=v.get("IsLatest", True),
|
|
164
|
-
last_modified=v.get("LastModified", ""),
|
|
165
|
-
size=v["size"],
|
|
166
|
-
)
|
|
155
|
+
def clean_s3_version(ver: Optional[str]) -> str:
|
|
156
|
+
return ver if (ver is not None and ver != "null") else ""
|
|
167
157
|
|
|
168
158
|
def info_to_file(self, v: dict[str, Any], path: str) -> File:
|
|
169
159
|
return File(
|
|
@@ -43,6 +43,8 @@ if TYPE_CHECKING:
|
|
|
43
43
|
from sqlalchemy.sql.elements import ColumnElement
|
|
44
44
|
from sqlalchemy.types import TypeEngine
|
|
45
45
|
|
|
46
|
+
from datachain.lib.file import File
|
|
47
|
+
|
|
46
48
|
|
|
47
49
|
logger = logging.getLogger("datachain")
|
|
48
50
|
|
|
@@ -58,6 +60,10 @@ quote_schema = sqlite_dialect.identifier_preparer.quote_schema
|
|
|
58
60
|
quote = sqlite_dialect.identifier_preparer.quote
|
|
59
61
|
|
|
60
62
|
|
|
63
|
+
def _get_in_memory_uri():
|
|
64
|
+
return "file::memory:?cache=shared"
|
|
65
|
+
|
|
66
|
+
|
|
61
67
|
def get_retry_sleep_sec(retry_count: int) -> int:
|
|
62
68
|
return RETRY_START_SEC * (RETRY_FACTOR**retry_count)
|
|
63
69
|
|
|
@@ -119,7 +125,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
119
125
|
if db_file == ":memory:":
|
|
120
126
|
# Enable multithreaded usage of the same in-memory db
|
|
121
127
|
db = sqlite3.connect(
|
|
122
|
-
|
|
128
|
+
_get_in_memory_uri(), uri=True, detect_types=DETECT_TYPES
|
|
123
129
|
)
|
|
124
130
|
else:
|
|
125
131
|
db = sqlite3.connect(
|
|
@@ -704,6 +710,9 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
704
710
|
|
|
705
711
|
self.db.execute(insert_query)
|
|
706
712
|
|
|
713
|
+
def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
|
|
714
|
+
return (e.model_dump() for e in entries)
|
|
715
|
+
|
|
707
716
|
def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
|
|
708
717
|
rows = list(rows)
|
|
709
718
|
if not rows:
|
|
@@ -20,7 +20,7 @@ from datachain.client import Client
|
|
|
20
20
|
from datachain.data_storage.schema import convert_rows_custom_column_types
|
|
21
21
|
from datachain.data_storage.serializer import Serializable
|
|
22
22
|
from datachain.dataset import DatasetRecord
|
|
23
|
-
from datachain.node import DirType, DirTypeGroup,
|
|
23
|
+
from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
|
|
24
24
|
from datachain.sql.functions import path as pathfunc
|
|
25
25
|
from datachain.sql.types import Int, SQLType
|
|
26
26
|
from datachain.storage import StorageURI
|
|
@@ -34,6 +34,7 @@ if TYPE_CHECKING:
|
|
|
34
34
|
from datachain.data_storage import AbstractIDGenerator, schema
|
|
35
35
|
from datachain.data_storage.db_engine import DatabaseEngine
|
|
36
36
|
from datachain.data_storage.schema import DataTable
|
|
37
|
+
from datachain.lib.file import File
|
|
37
38
|
|
|
38
39
|
try:
|
|
39
40
|
import numpy as np
|
|
@@ -401,25 +402,18 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
401
402
|
expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
|
|
402
403
|
sa.func.count(table.c.sys__id),
|
|
403
404
|
)
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
405
|
+
size_columns = [
|
|
406
|
+
c for c in table.columns if c.name == "size" or c.name.endswith("__size")
|
|
407
|
+
]
|
|
408
|
+
if size_columns:
|
|
409
|
+
expressions = (*expressions, sa.func.sum(sum(size_columns)))
|
|
408
410
|
query = select(*expressions)
|
|
409
411
|
((nrows, *rest),) = self.db.execute(query)
|
|
410
|
-
return nrows, rest[0] if rest else
|
|
411
|
-
|
|
412
|
-
def prepare_entries(
|
|
413
|
-
self, uri: str, entries: Iterable[Entry]
|
|
414
|
-
) -> list[dict[str, Any]]:
|
|
415
|
-
"""
|
|
416
|
-
Prepares bucket listing entry (row) for inserting into database
|
|
417
|
-
"""
|
|
418
|
-
|
|
419
|
-
def _prepare_entry(entry: Entry):
|
|
420
|
-
return attrs.asdict(entry) | {"source": uri}
|
|
412
|
+
return nrows, rest[0] if rest else 0
|
|
421
413
|
|
|
422
|
-
|
|
414
|
+
@abstractmethod
|
|
415
|
+
def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
|
|
416
|
+
"""Convert File entries so they can be passed on to `insert_rows()`"""
|
|
423
417
|
|
|
424
418
|
@abstractmethod
|
|
425
419
|
def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
|
|
@@ -112,7 +112,7 @@ class DatasetDependency:
|
|
|
112
112
|
|
|
113
113
|
if is_listing_dataset(dataset_name):
|
|
114
114
|
dependency_type = DatasetDependencyType.STORAGE # type: ignore[arg-type]
|
|
115
|
-
dependency_name = listing_uri_from_name(dataset_name)
|
|
115
|
+
dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))
|
|
116
116
|
|
|
117
117
|
return cls(
|
|
118
118
|
id,
|