datachain 0.3.17__tar.gz → 0.3.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.3.17 → datachain-0.3.19}/.github/workflows/benchmarks.yml +11 -8
- {datachain-0.3.17 → datachain-0.3.19}/.github/workflows/release.yml +6 -7
- {datachain-0.3.17 → datachain-0.3.19}/.github/workflows/tests-studio.yml +8 -5
- {datachain-0.3.17 → datachain-0.3.19}/.github/workflows/tests.yml +27 -18
- {datachain-0.3.17/src/datachain.egg-info → datachain-0.3.19}/PKG-INFO +5 -4
- {datachain-0.3.17 → datachain-0.3.19}/docs/references/file.md +2 -2
- datachain-0.3.19/examples/llm_and_nlp/unstructured-embeddings-gen.py +76 -0
- datachain-0.3.17/examples/llm_and_nlp/unstructured-text.py → datachain-0.3.19/examples/llm_and_nlp/unstructured-summary-map.py +7 -3
- {datachain-0.3.17 → datachain-0.3.19}/examples/multimodal/hf_pipeline.py +7 -1
- {datachain-0.3.17 → datachain-0.3.19}/examples/multimodal/openai_image_desc_lib.py +0 -2
- {datachain-0.3.17 → datachain-0.3.19}/noxfile.py +2 -2
- {datachain-0.3.17 → datachain-0.3.19}/pyproject.toml +6 -5
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/__init__.py +5 -2
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/cache.py +14 -55
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/catalog/catalog.py +17 -97
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/cli.py +7 -2
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/fsspec.py +29 -63
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/local.py +2 -3
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/dataset.py +7 -2
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/error.py +6 -4
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/arrow.py +10 -4
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/dc.py +6 -2
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/file.py +64 -28
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/listing.py +2 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/listing.py +4 -4
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/node.py +6 -6
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/nodes_fetcher.py +12 -5
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/nodes_thread_pool.py +1 -1
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/progress.py +2 -12
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/dataset.py +6 -40
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/dispatch.py +2 -15
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/schema.py +25 -24
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/udf.py +0 -106
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/types.py +4 -2
- datachain-0.3.19/src/datachain/telemetry.py +37 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/utils.py +11 -0
- {datachain-0.3.17 → datachain-0.3.19/src/datachain.egg-info}/PKG-INFO +5 -4
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain.egg-info/SOURCES.txt +5 -1
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain.egg-info/requires.txt +4 -3
- {datachain-0.3.17 → datachain-0.3.19}/tests/conftest.py +6 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/examples/test_examples.py +38 -30
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_catalog.py +2 -108
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_datachain.py +46 -5
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_dataset_query.py +6 -2
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_datasets.py +12 -10
- datachain-0.3.19/tests/func/test_query.py +112 -0
- datachain-0.3.19/tests/test_telemetry.py +20 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_arrow.py +8 -9
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_file.py +3 -26
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_cache.py +9 -4
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_data_storage.py +1 -1
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_module_exports.py +2 -2
- datachain-0.3.19/tests/unit/test_query.py +65 -0
- datachain-0.3.17/tests/func/test_query.py +0 -182
- {datachain-0.3.17 → datachain-0.3.19}/.cruft.json +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/.gitattributes +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/.github/codecov.yaml +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/.github/dependabot.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/.gitignore +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/.pre-commit-config.yaml +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/LICENSE +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/README.rst +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/docs/assets/datachain.svg +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/docs/index.md +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/docs/references/datachain.md +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/docs/references/datatype.md +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/docs/references/index.md +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/docs/references/sql.md +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/docs/references/torch.md +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/docs/references/udf.md +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/mkdocs.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/setup.cfg +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/__main__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/asyn.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/hf.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/config.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/job.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/hf.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/tar.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/py.typed +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/params.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/session.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/storage.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/data.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/examples/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_client.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_listing.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_ls.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_metrics.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_pull.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_client.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_session.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.17 → datachain-0.3.19}/tests/utils.py +0 -0
|
@@ -23,15 +23,18 @@ jobs:
|
|
|
23
23
|
uses: actions/setup-python@v5
|
|
24
24
|
with:
|
|
25
25
|
python-version: '3.12'
|
|
26
|
-
cache: 'pip'
|
|
27
26
|
|
|
28
|
-
- name:
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
27
|
+
- name: Setup uv
|
|
28
|
+
uses: astral-sh/setup-uv@v3
|
|
29
|
+
with:
|
|
30
|
+
enable-cache: true
|
|
31
|
+
cache-suffix: benchmarks
|
|
32
|
+
cache-dependency-glob: pyproject.toml
|
|
33
|
+
|
|
34
|
+
- name: Install nox and dvc
|
|
35
|
+
run: uv pip install dvc[gs] nox --system
|
|
33
36
|
|
|
34
|
-
-
|
|
35
|
-
|
|
37
|
+
- name: Pull dataset
|
|
38
|
+
run: dvc --cd tests/benchmarks/datasets pull
|
|
36
39
|
- name: Run benchmarks
|
|
37
40
|
run: nox -s bench
|
|
@@ -21,17 +21,16 @@ jobs:
|
|
|
21
21
|
with:
|
|
22
22
|
fetch-depth: 0
|
|
23
23
|
|
|
24
|
-
- name: Set up Python 3.
|
|
24
|
+
- name: Set up Python 3.12
|
|
25
25
|
uses: actions/setup-python@v5
|
|
26
26
|
with:
|
|
27
|
-
python-version: '3.
|
|
27
|
+
python-version: '3.12'
|
|
28
28
|
|
|
29
|
-
- name:
|
|
30
|
-
|
|
31
|
-
python -m pip install --upgrade 'nox[uv]'
|
|
32
|
-
nox --version
|
|
33
|
-
uv --version
|
|
29
|
+
- name: Setup uv
|
|
30
|
+
uses: astral-sh/setup-uv@v3
|
|
34
31
|
|
|
32
|
+
- name: Install nox
|
|
33
|
+
run: uv pip install nox --system
|
|
35
34
|
- name: Build package
|
|
36
35
|
run: nox -s build
|
|
37
36
|
|
|
@@ -82,12 +82,15 @@ jobs:
|
|
|
82
82
|
uses: actions/setup-python@v5
|
|
83
83
|
with:
|
|
84
84
|
python-version: ${{ matrix.pyv }}
|
|
85
|
-
cache: 'pip'
|
|
86
85
|
|
|
87
|
-
- name:
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
86
|
+
- name: Setup uv
|
|
87
|
+
uses: astral-sh/setup-uv@v3
|
|
88
|
+
with:
|
|
89
|
+
enable-cache: true
|
|
90
|
+
cache-suffix: studio
|
|
91
|
+
cache-dependency-glob: |
|
|
92
|
+
backend/datachain_server/pyproject.toml
|
|
93
|
+
backend/datachain/pyproject.toml
|
|
91
94
|
|
|
92
95
|
- name: Install dependencies
|
|
93
96
|
run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
|
|
@@ -26,13 +26,16 @@ jobs:
|
|
|
26
26
|
uses: actions/setup-python@v5
|
|
27
27
|
with:
|
|
28
28
|
python-version: '3.9'
|
|
29
|
-
cache: 'pip'
|
|
30
29
|
|
|
31
|
-
- name:
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
30
|
+
- name: Setup uv
|
|
31
|
+
uses: astral-sh/setup-uv@v3
|
|
32
|
+
with:
|
|
33
|
+
enable-cache: true
|
|
34
|
+
cache-suffix: lint
|
|
35
|
+
cache-dependency-glob: pyproject.toml
|
|
36
|
+
|
|
37
|
+
- name: Install nox
|
|
38
|
+
run: uv pip install nox --system
|
|
36
39
|
|
|
37
40
|
- name: Cache mypy
|
|
38
41
|
uses: actions/cache@v4
|
|
@@ -77,13 +80,16 @@ jobs:
|
|
|
77
80
|
uses: actions/setup-python@v5
|
|
78
81
|
with:
|
|
79
82
|
python-version: ${{ matrix.pyv }}
|
|
80
|
-
cache: 'pip'
|
|
81
83
|
|
|
82
|
-
- name:
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
84
|
+
- name: Setup uv
|
|
85
|
+
uses: astral-sh/setup-uv@v3
|
|
86
|
+
with:
|
|
87
|
+
enable-cache: true
|
|
88
|
+
cache-suffix: tests-${{ matrix.pyv }}
|
|
89
|
+
cache-dependency-glob: pyproject.toml
|
|
90
|
+
|
|
91
|
+
- name: Install nox
|
|
92
|
+
run: uv pip install nox --system
|
|
87
93
|
|
|
88
94
|
- name: Skip flaky azure, gs remotes on macOS
|
|
89
95
|
if: runner.os == 'macOS'
|
|
@@ -134,13 +140,16 @@ jobs:
|
|
|
134
140
|
uses: actions/setup-python@v5
|
|
135
141
|
with:
|
|
136
142
|
python-version: ${{ matrix.pyv }}
|
|
137
|
-
cache: 'pip'
|
|
138
143
|
|
|
139
|
-
- name:
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
+
- name: Setup uv
|
|
145
|
+
uses: astral-sh/setup-uv@v3
|
|
146
|
+
with:
|
|
147
|
+
enable-cache: true
|
|
148
|
+
cache-suffix: examples-${{ matrix.pyv }}
|
|
149
|
+
cache-dependency-glob: pyproject.toml
|
|
150
|
+
|
|
151
|
+
- name: Install nox
|
|
152
|
+
run: uv pip install nox --system
|
|
144
153
|
|
|
145
154
|
- name: Run examples
|
|
146
155
|
run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.19
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -34,7 +34,6 @@ Requires-Dist: dvc-objects<6,>=4
|
|
|
34
34
|
Requires-Dist: shtab<2,>=1.3.4
|
|
35
35
|
Requires-Dist: sqlalchemy>=2
|
|
36
36
|
Requires-Dist: multiprocess==0.70.16
|
|
37
|
-
Requires-Dist: dill==0.3.8
|
|
38
37
|
Requires-Dist: cloudpickle
|
|
39
38
|
Requires-Dist: orjson>=3.10.5
|
|
40
39
|
Requires-Dist: pydantic<3,>=2
|
|
@@ -44,6 +43,7 @@ Requires-Dist: Pillow<11,>=10.0.0
|
|
|
44
43
|
Requires-Dist: msgpack<2,>=1.0.4
|
|
45
44
|
Requires-Dist: psutil
|
|
46
45
|
Requires-Dist: huggingface_hub
|
|
46
|
+
Requires-Dist: iterative-telemetry>=0.0.9
|
|
47
47
|
Provides-Extra: docs
|
|
48
48
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
49
49
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
@@ -69,7 +69,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
|
69
69
|
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
70
70
|
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
71
71
|
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
72
|
-
Requires-Dist: pytest-servers[all]>=0.5.
|
|
72
|
+
Requires-Dist: pytest-servers[all]>=0.5.7; extra == "tests"
|
|
73
73
|
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
74
74
|
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
75
75
|
Requires-Dist: virtualenv; extra == "tests"
|
|
@@ -91,9 +91,10 @@ Requires-Dist: datachain[tests]; extra == "examples"
|
|
|
91
91
|
Requires-Dist: numpy<2,>=1; extra == "examples"
|
|
92
92
|
Requires-Dist: defusedxml; extra == "examples"
|
|
93
93
|
Requires-Dist: accelerate; extra == "examples"
|
|
94
|
-
Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
94
|
+
Requires-Dist: unstructured[embed-huggingface,pdf]; extra == "examples"
|
|
95
95
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
96
96
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
97
|
+
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
97
98
|
|
|
98
99
|
================
|
|
99
100
|
|logo| DataChain
|
|
@@ -7,6 +7,8 @@ automatically when creating a `DataChain` from files, like in
|
|
|
7
7
|
classes include various metadata fields about the underlying file as well as methods to
|
|
8
8
|
read from the files and otherwise work with the file contents.
|
|
9
9
|
|
|
10
|
+
::: datachain.lib.file.ArrowRow
|
|
11
|
+
|
|
10
12
|
::: datachain.lib.file.ExportPlacement
|
|
11
13
|
|
|
12
14
|
::: datachain.lib.file.File
|
|
@@ -15,8 +17,6 @@ read from the files and otherwise work with the file contents.
|
|
|
15
17
|
|
|
16
18
|
::: datachain.lib.file.ImageFile
|
|
17
19
|
|
|
18
|
-
::: datachain.lib.file.IndexedFile
|
|
19
|
-
|
|
20
20
|
::: datachain.lib.file.TarVFile
|
|
21
21
|
|
|
22
22
|
::: datachain.lib.file.TextFile
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""
|
|
2
|
+
To install the required dependencies:
|
|
3
|
+
|
|
4
|
+
pip install datachain[examples]
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from collections.abc import Iterator
|
|
9
|
+
|
|
10
|
+
from unstructured.cleaners.core import (
|
|
11
|
+
clean,
|
|
12
|
+
group_broken_paragraphs,
|
|
13
|
+
replace_unicode_quotes,
|
|
14
|
+
)
|
|
15
|
+
from unstructured.embed.huggingface import (
|
|
16
|
+
HuggingFaceEmbeddingConfig,
|
|
17
|
+
HuggingFaceEmbeddingEncoder,
|
|
18
|
+
)
|
|
19
|
+
from unstructured.partition.pdf import partition_pdf
|
|
20
|
+
|
|
21
|
+
from datachain import C, DataChain, DataModel, File
|
|
22
|
+
|
|
23
|
+
source = "gs://datachain-demo/neurips/1987/"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Define the output as a DataModel class
|
|
27
|
+
class Chunk(DataModel):
|
|
28
|
+
key: str
|
|
29
|
+
text: str
|
|
30
|
+
embeddings: list[float]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Define embedding encoder
|
|
34
|
+
|
|
35
|
+
embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Use signatures to define UDF input/output
|
|
39
|
+
# these can be pydantic model or regular Python types
|
|
40
|
+
def process_pdf(file: File) -> Iterator[Chunk]:
|
|
41
|
+
# Ingest the file
|
|
42
|
+
with file.open() as f:
|
|
43
|
+
chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
|
|
44
|
+
|
|
45
|
+
# Clean the chunks and add new columns
|
|
46
|
+
for chunk in chunks:
|
|
47
|
+
chunk.apply(
|
|
48
|
+
lambda text: clean(
|
|
49
|
+
text, bullets=True, extra_whitespace=True, trailing_punctuation=True
|
|
50
|
+
)
|
|
51
|
+
)
|
|
52
|
+
chunk.apply(replace_unicode_quotes)
|
|
53
|
+
chunk.apply(group_broken_paragraphs)
|
|
54
|
+
|
|
55
|
+
# create embeddings
|
|
56
|
+
chunks_embedded = embedding_encoder.embed_documents(chunks)
|
|
57
|
+
|
|
58
|
+
# Add new rows to DataChain
|
|
59
|
+
for chunk in chunks_embedded:
|
|
60
|
+
yield Chunk(
|
|
61
|
+
key=file.path,
|
|
62
|
+
text=chunk.text,
|
|
63
|
+
embeddings=chunk.embeddings,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
dc = (
|
|
68
|
+
DataChain.from_storage(source)
|
|
69
|
+
.settings(parallel=-1)
|
|
70
|
+
.filter(C.file.path.glob("*.pdf"))
|
|
71
|
+
.gen(document=process_pdf)
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
dc.save("embedded-documents")
|
|
75
|
+
|
|
76
|
+
DataChain.from_dataset("embedded-documents").show()
|
|
@@ -57,8 +57,8 @@ def lint(session: nox.Session) -> None:
|
|
|
57
57
|
|
|
58
58
|
@nox.session
|
|
59
59
|
def build(session: nox.Session) -> None:
|
|
60
|
-
session.install("
|
|
61
|
-
session.run("
|
|
60
|
+
session.install("twine", "uv")
|
|
61
|
+
session.run("uv", "build")
|
|
62
62
|
dists = glob.glob("dist/*")
|
|
63
63
|
session.run("twine", "check", *dists, silent=True)
|
|
64
64
|
|
|
@@ -37,7 +37,6 @@ dependencies = [
|
|
|
37
37
|
"shtab>=1.3.4,<2",
|
|
38
38
|
"sqlalchemy>=2",
|
|
39
39
|
"multiprocess==0.70.16",
|
|
40
|
-
"dill==0.3.8",
|
|
41
40
|
"cloudpickle",
|
|
42
41
|
"orjson>=3.10.5",
|
|
43
42
|
"pydantic>=2,<3",
|
|
@@ -46,7 +45,8 @@ dependencies = [
|
|
|
46
45
|
"Pillow>=10.0.0,<11",
|
|
47
46
|
"msgpack>=1.0.4,<2",
|
|
48
47
|
"psutil",
|
|
49
|
-
"huggingface_hub"
|
|
48
|
+
"huggingface_hub",
|
|
49
|
+
"iterative-telemetry>=0.0.9"
|
|
50
50
|
]
|
|
51
51
|
|
|
52
52
|
[project.optional-dependencies]
|
|
@@ -80,7 +80,7 @@ tests = [
|
|
|
80
80
|
"pytest-sugar>=0.9.6",
|
|
81
81
|
"pytest-cov>=4.1.0",
|
|
82
82
|
"pytest-mock>=3.12.0",
|
|
83
|
-
"pytest-servers[all]>=0.5.
|
|
83
|
+
"pytest-servers[all]>=0.5.7",
|
|
84
84
|
"pytest-benchmark[histogram]",
|
|
85
85
|
"pytest-xdist>=3.3.1",
|
|
86
86
|
"virtualenv",
|
|
@@ -104,9 +104,10 @@ examples = [
|
|
|
104
104
|
"numpy>=1,<2",
|
|
105
105
|
"defusedxml",
|
|
106
106
|
"accelerate",
|
|
107
|
-
"unstructured[pdf]",
|
|
107
|
+
"unstructured[pdf, embed-huggingface]",
|
|
108
108
|
"pdfplumber==0.11.4",
|
|
109
|
-
"huggingface_hub[hf_transfer]"
|
|
109
|
+
"huggingface_hub[hf_transfer]",
|
|
110
|
+
"onnx==1.16.1"
|
|
110
111
|
]
|
|
111
112
|
|
|
112
113
|
[project.urls]
|
|
@@ -1,21 +1,23 @@
|
|
|
1
1
|
from datachain.lib.data_model import DataModel, DataType, is_chain_type
|
|
2
2
|
from datachain.lib.dc import C, Column, DataChain, Sys
|
|
3
3
|
from datachain.lib.file import (
|
|
4
|
+
ArrowRow,
|
|
4
5
|
File,
|
|
5
6
|
FileError,
|
|
6
7
|
ImageFile,
|
|
7
|
-
IndexedFile,
|
|
8
8
|
TarVFile,
|
|
9
9
|
TextFile,
|
|
10
10
|
)
|
|
11
11
|
from datachain.lib.model_store import ModelStore
|
|
12
12
|
from datachain.lib.udf import Aggregator, Generator, Mapper
|
|
13
13
|
from datachain.lib.utils import AbstractUDF, DataChainError
|
|
14
|
+
from datachain.query import metrics, param
|
|
14
15
|
from datachain.query.session import Session
|
|
15
16
|
|
|
16
17
|
__all__ = [
|
|
17
18
|
"AbstractUDF",
|
|
18
19
|
"Aggregator",
|
|
20
|
+
"ArrowRow",
|
|
19
21
|
"C",
|
|
20
22
|
"Column",
|
|
21
23
|
"DataChain",
|
|
@@ -26,7 +28,6 @@ __all__ = [
|
|
|
26
28
|
"FileError",
|
|
27
29
|
"Generator",
|
|
28
30
|
"ImageFile",
|
|
29
|
-
"IndexedFile",
|
|
30
31
|
"Mapper",
|
|
31
32
|
"ModelStore",
|
|
32
33
|
"Session",
|
|
@@ -34,4 +35,6 @@ __all__ = [
|
|
|
34
35
|
"TarVFile",
|
|
35
36
|
"TextFile",
|
|
36
37
|
"is_chain_type",
|
|
38
|
+
"metrics",
|
|
39
|
+
"param",
|
|
37
40
|
]
|
|
@@ -1,56 +1,15 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
import json
|
|
3
1
|
import os
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from functools import partial
|
|
6
2
|
from typing import TYPE_CHECKING, Optional
|
|
7
3
|
|
|
8
|
-
import attrs
|
|
9
4
|
from dvc_data.hashfile.db.local import LocalHashFileDB
|
|
10
5
|
from dvc_objects.fs.local import LocalFileSystem
|
|
11
6
|
from fsspec.callbacks import Callback, TqdmCallback
|
|
12
7
|
|
|
13
|
-
from datachain.utils import TIME_ZERO
|
|
14
|
-
|
|
15
8
|
from .progress import Tqdm
|
|
16
9
|
|
|
17
10
|
if TYPE_CHECKING:
|
|
18
11
|
from datachain.client import Client
|
|
19
|
-
from datachain.
|
|
20
|
-
|
|
21
|
-
sha256 = partial(hashlib.sha256, usedforsecurity=False)
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
@attrs.frozen
|
|
25
|
-
class UniqueId:
|
|
26
|
-
storage: "StorageURI"
|
|
27
|
-
path: str
|
|
28
|
-
size: int
|
|
29
|
-
etag: str
|
|
30
|
-
version: str = ""
|
|
31
|
-
is_latest: bool = True
|
|
32
|
-
location: Optional[str] = None
|
|
33
|
-
last_modified: datetime = TIME_ZERO
|
|
34
|
-
|
|
35
|
-
def get_parsed_location(self) -> Optional[dict]:
|
|
36
|
-
if not self.location:
|
|
37
|
-
return None
|
|
38
|
-
|
|
39
|
-
loc_stack = (
|
|
40
|
-
json.loads(self.location)
|
|
41
|
-
if isinstance(self.location, str)
|
|
42
|
-
else self.location
|
|
43
|
-
)
|
|
44
|
-
if len(loc_stack) > 1:
|
|
45
|
-
raise NotImplementedError("Nested v-objects are not supported yet.")
|
|
46
|
-
|
|
47
|
-
return loc_stack[0]
|
|
48
|
-
|
|
49
|
-
def get_hash(self) -> str:
|
|
50
|
-
fingerprint = f"{self.storage}/{self.path}/{self.version}/{self.etag}"
|
|
51
|
-
if self.location:
|
|
52
|
-
fingerprint += f"/{self.location}"
|
|
53
|
-
return sha256(fingerprint.encode()).hexdigest()
|
|
12
|
+
from datachain.lib.file import File
|
|
54
13
|
|
|
55
14
|
|
|
56
15
|
def try_scandir(path):
|
|
@@ -77,30 +36,30 @@ class DataChainCache:
|
|
|
77
36
|
def tmp_dir(self):
|
|
78
37
|
return self.odb.tmp_dir
|
|
79
38
|
|
|
80
|
-
def get_path(self,
|
|
81
|
-
if self.contains(
|
|
82
|
-
return self.path_from_checksum(
|
|
39
|
+
def get_path(self, file: "File") -> Optional[str]:
|
|
40
|
+
if self.contains(file):
|
|
41
|
+
return self.path_from_checksum(file.get_hash())
|
|
83
42
|
return None
|
|
84
43
|
|
|
85
|
-
def contains(self,
|
|
86
|
-
return self.odb.exists(
|
|
44
|
+
def contains(self, file: "File") -> bool:
|
|
45
|
+
return self.odb.exists(file.get_hash())
|
|
87
46
|
|
|
88
47
|
def path_from_checksum(self, checksum: str) -> str:
|
|
89
48
|
assert checksum
|
|
90
49
|
return self.odb.oid_to_path(checksum)
|
|
91
50
|
|
|
92
|
-
def remove(self,
|
|
93
|
-
self.odb.delete(
|
|
51
|
+
def remove(self, file: "File") -> None:
|
|
52
|
+
self.odb.delete(file.get_hash())
|
|
94
53
|
|
|
95
54
|
async def download(
|
|
96
|
-
self,
|
|
55
|
+
self, file: "File", client: "Client", callback: Optional[Callback] = None
|
|
97
56
|
) -> None:
|
|
98
|
-
from_path = f"{
|
|
57
|
+
from_path = f"{file.source}/{file.path}"
|
|
99
58
|
from dvc_objects.fs.utils import tmp_fname
|
|
100
59
|
|
|
101
60
|
odb_fs = self.odb.fs
|
|
102
61
|
tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
|
|
103
|
-
size =
|
|
62
|
+
size = file.size
|
|
104
63
|
if size < 0:
|
|
105
64
|
size = await client.get_size(from_path)
|
|
106
65
|
cb = callback or TqdmCallback(
|
|
@@ -115,13 +74,13 @@ class DataChainCache:
|
|
|
115
74
|
cb.close()
|
|
116
75
|
|
|
117
76
|
try:
|
|
118
|
-
oid =
|
|
77
|
+
oid = file.get_hash()
|
|
119
78
|
self.odb.add(tmp_info, self.odb.fs, oid)
|
|
120
79
|
finally:
|
|
121
80
|
os.unlink(tmp_info)
|
|
122
81
|
|
|
123
|
-
def store_data(self,
|
|
124
|
-
checksum =
|
|
82
|
+
def store_data(self, file: "File", contents: bytes) -> None:
|
|
83
|
+
checksum = file.get_hash()
|
|
125
84
|
dst = self.path_from_checksum(checksum)
|
|
126
85
|
if not os.path.exists(dst):
|
|
127
86
|
# Create the file only if it's not already in cache
|