datachain 0.8.4__tar.gz → 0.8.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.8.4 → datachain-0.8.6}/.gitignore +3 -0
- {datachain-0.8.4 → datachain-0.8.6}/PKG-INFO +6 -6
- {datachain-0.8.4 → datachain-0.8.6}/README.rst +2 -2
- {datachain-0.8.4 → datachain-0.8.6}/docs/index.md +1 -1
- {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/ultralytics-bbox.py +5 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/ultralytics-pose.py +5 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/ultralytics-segment.py +5 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/torch-loader.py +2 -2
- {datachain-0.8.4 → datachain-0.8.6}/pyproject.toml +2 -2
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/asyn.py +16 -6
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cache.py +32 -10
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/catalog/catalog.py +17 -1
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/azure.py +6 -2
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/fsspec.py +1 -1
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/gcs.py +6 -2
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/s3.py +22 -4
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/db_engine.py +9 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/schema.py +4 -10
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/sqlite.py +7 -1
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/warehouse.py +6 -4
- datachain-0.8.4/src/datachain/lib/diff.py → datachain-0.8.6/src/datachain/diff/__init__.py +116 -12
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/__init__.py +2 -1
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/conditional.py +31 -9
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/arrow.py +3 -1
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/dc.py +5 -3
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/file.py +15 -4
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/hf.py +1 -1
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/pytorch.py +57 -13
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/udf.py +82 -40
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/listing.py +1 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/progress.py +18 -1
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/dataset.py +122 -93
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/dispatch.py +22 -16
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/utils.py +13 -2
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/PKG-INFO +6 -6
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/SOURCES.txt +3 -1
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/requires.txt +2 -2
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_datachain.py +83 -1
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_pytorch.py +41 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_datachain.py +15 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_diff.py +49 -43
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_conditional.py +21 -4
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_asyn.py +33 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_cache.py +27 -1
- datachain-0.8.6/tests/unit/test_diff.py +70 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_func.py +18 -0
- datachain-0.8.6/tests/unit/test_pytorch.py +58 -0
- {datachain-0.8.4 → datachain-0.8.6}/.cruft.json +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.gitattributes +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.github/codecov.yaml +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.github/dependabot.yml +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.github/workflows/release.yml +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.github/workflows/tests.yml +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/.pre-commit-config.yaml +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/LICENSE +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/assets/datachain.svg +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/contributing.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/examples.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/overrides/main.html +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/quick-start.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/references/datachain.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/references/datatype.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/references/file.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/references/index.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/references/sql.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/references/torch.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/references/udf.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/docs/tutorials.md +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/multimodal/wds.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/mkdocs.yml +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/noxfile.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/setup.cfg +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/__main__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/utils.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/hf.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/local.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/config.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/dataset.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/error.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/array.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/base.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/func.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/numeric.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/path.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/random.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/string.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/window.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/job.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/clip.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/image.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/listing.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/settings.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/tar.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/text.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/utils.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/bbox.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/pose.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/segment.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/node.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/py.typed +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/batch.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/metrics.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/params.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/queue.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/schema.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/session.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/udf.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/utils.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/remote/studio.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/types.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/utils.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/studio.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/telemetry.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/conftest.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/data.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/examples/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/examples/test_examples.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/examples/wds_data.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_catalog.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_client.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_datasets.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_listing.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_ls.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_metrics.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_pull.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_query.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_session.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_toolkit.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/scripts/feature_class.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/test_atomicity.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/test_cli_e2e.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/test_cli_studio.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/test_query_e2e.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/test_telemetry.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_catalog.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_client.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_config.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_dataset.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_listing.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_metastore.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_query.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_query_params.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_serializer.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_session.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_utils.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.8.4 → datachain-0.8.6}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.8.4
|
|
3
|
+
Version: 0.8.6
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -96,10 +96,10 @@ Requires-Dist: defusedxml; extra == "examples"
|
|
|
96
96
|
Requires-Dist: accelerate; extra == "examples"
|
|
97
97
|
Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
|
|
98
98
|
Requires-Dist: unstructured[pdf]<0.16.12; extra == "examples"
|
|
99
|
-
Requires-Dist: pdfplumber==0.11.
|
|
99
|
+
Requires-Dist: pdfplumber==0.11.5; extra == "examples"
|
|
100
100
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
101
101
|
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
102
|
-
Requires-Dist: ultralytics==8.3.
|
|
102
|
+
Requires-Dist: ultralytics==8.3.58; extra == "examples"
|
|
103
103
|
|
|
104
104
|
================
|
|
105
105
|
|logo| DataChain
|
|
@@ -134,7 +134,7 @@ Use Cases
|
|
|
134
134
|
1. **ETL.** Pythonic framework for describing and running unstructured data transformations
|
|
135
135
|
and enrichments, applying models to data, including LLMs.
|
|
136
136
|
2. **Analytics.** DataChain dataset is a table that combines all the information about data
|
|
137
|
-
objects in one place + it provides dataframe-like API and
|
|
137
|
+
objects in one place + it provides dataframe-like API and vectorized engine to do analytics
|
|
138
138
|
on these tables at scale.
|
|
139
139
|
3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
|
|
140
140
|
Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
|
|
@@ -270,7 +270,7 @@ DataChain Studio Platform
|
|
|
270
270
|
|
|
271
271
|
`DataChain Studio`_ is a proprietary solution for teams that offers:
|
|
272
272
|
|
|
273
|
-
- **Centralized dataset registry** to manage data, code and
|
|
273
|
+
- **Centralized dataset registry** to manage data, code and
|
|
274
274
|
dependencies in one place.
|
|
275
275
|
- **Data Lineage** for data sources as well as derivative dataset.
|
|
276
276
|
- **UI for Multimodal Data** like images, videos, and PDFs.
|
|
@@ -31,7 +31,7 @@ Use Cases
|
|
|
31
31
|
1. **ETL.** Pythonic framework for describing and running unstructured data transformations
|
|
32
32
|
and enrichments, applying models to data, including LLMs.
|
|
33
33
|
2. **Analytics.** DataChain dataset is a table that combines all the information about data
|
|
34
|
-
objects in one place + it provides dataframe-like API and
|
|
34
|
+
objects in one place + it provides dataframe-like API and vectorized engine to do analytics
|
|
35
35
|
on these tables at scale.
|
|
36
36
|
3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
|
|
37
37
|
Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
|
|
@@ -167,7 +167,7 @@ DataChain Studio Platform
|
|
|
167
167
|
|
|
168
168
|
`DataChain Studio`_ is a proprietary solution for teams that offers:
|
|
169
169
|
|
|
170
|
-
- **Centralized dataset registry** to manage data, code and
|
|
170
|
+
- **Centralized dataset registry** to manage data, code and
|
|
171
171
|
dependencies in one place.
|
|
172
172
|
- **Data Lineage** for data sources as well as derivative dataset.
|
|
173
173
|
- **UI for Multimodal Data** like images, videos, and PDFs.
|
|
@@ -42,7 +42,7 @@ database for easy and efficient querying.
|
|
|
42
42
|
including LLMs.
|
|
43
43
|
2. **Analytics.** DataChain dataset is a table that combines all the
|
|
44
44
|
information about data objects in one place + it provides
|
|
45
|
-
dataframe-like API and
|
|
45
|
+
dataframe-like API and vectorized engine to do analytics on these
|
|
46
46
|
tables at scale.
|
|
47
47
|
3. **Versioning.** DataChain doesn't store, require moving or copying
|
|
48
48
|
data (unlike DVC). Perfect use case is a bucket with thousands or
|
|
@@ -56,7 +56,7 @@ class CNN(nn.Module):
|
|
|
56
56
|
if __name__ == "__main__":
|
|
57
57
|
ds = (
|
|
58
58
|
DataChain.from_storage(STORAGE, type="image")
|
|
59
|
-
.settings(
|
|
59
|
+
.settings(prefetch=25)
|
|
60
60
|
.filter(C("file.path").glob("*.jpg"))
|
|
61
61
|
.map(
|
|
62
62
|
label=lambda path: label_to_int(basename(path)[:3], CLASSES),
|
|
@@ -68,7 +68,7 @@ if __name__ == "__main__":
|
|
|
68
68
|
train_loader = DataLoader(
|
|
69
69
|
ds.to_pytorch(transform=transform),
|
|
70
70
|
batch_size=25,
|
|
71
|
-
num_workers=
|
|
71
|
+
num_workers=min(4, os.cpu_count() or 2),
|
|
72
72
|
persistent_workers=True,
|
|
73
73
|
multiprocessing_context=multiprocessing.get_context("spawn"),
|
|
74
74
|
)
|
|
@@ -109,10 +109,10 @@ examples = [
|
|
|
109
109
|
"accelerate",
|
|
110
110
|
"unstructured_ingest[embed-huggingface]",
|
|
111
111
|
"unstructured[pdf]<0.16.12",
|
|
112
|
-
"pdfplumber==0.11.
|
|
112
|
+
"pdfplumber==0.11.5",
|
|
113
113
|
"huggingface_hub[hf_transfer]",
|
|
114
114
|
"onnx==1.16.1",
|
|
115
|
-
"ultralytics==8.3.
|
|
115
|
+
"ultralytics==8.3.58"
|
|
116
116
|
]
|
|
117
117
|
|
|
118
118
|
[project.urls]
|
|
@@ -8,12 +8,14 @@ from collections.abc import (
|
|
|
8
8
|
Iterable,
|
|
9
9
|
Iterator,
|
|
10
10
|
)
|
|
11
|
-
from concurrent.futures import ThreadPoolExecutor
|
|
11
|
+
from concurrent.futures import ThreadPoolExecutor, wait
|
|
12
12
|
from heapq import heappop, heappush
|
|
13
13
|
from typing import Any, Callable, Generic, Optional, TypeVar
|
|
14
14
|
|
|
15
15
|
from fsspec.asyn import get_loop
|
|
16
16
|
|
|
17
|
+
from datachain.utils import safe_closing
|
|
18
|
+
|
|
17
19
|
ASYNC_WORKERS = 20
|
|
18
20
|
|
|
19
21
|
InputT = TypeVar("InputT", contravariant=True) # noqa: PLC0105
|
|
@@ -56,6 +58,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
|
|
|
56
58
|
self.pool = ThreadPoolExecutor(workers)
|
|
57
59
|
self._tasks: set[asyncio.Task] = set()
|
|
58
60
|
self._shutdown_producer = threading.Event()
|
|
61
|
+
self._producer_is_shutdown = threading.Event()
|
|
59
62
|
|
|
60
63
|
def start_task(self, coro: Coroutine) -> asyncio.Task:
|
|
61
64
|
task = self.loop.create_task(coro)
|
|
@@ -64,11 +67,16 @@ class AsyncMapper(Generic[InputT, ResultT]):
|
|
|
64
67
|
return task
|
|
65
68
|
|
|
66
69
|
def _produce(self) -> None:
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
70
|
+
try:
|
|
71
|
+
with safe_closing(self.iterable):
|
|
72
|
+
for item in self.iterable:
|
|
73
|
+
if self._shutdown_producer.is_set():
|
|
74
|
+
return
|
|
75
|
+
coro = self.work_queue.put(item)
|
|
76
|
+
fut = asyncio.run_coroutine_threadsafe(coro, self.loop)
|
|
77
|
+
fut.result() # wait until the item is in the queue
|
|
78
|
+
finally:
|
|
79
|
+
self._producer_is_shutdown.set()
|
|
72
80
|
|
|
73
81
|
async def produce(self) -> None:
|
|
74
82
|
await self.to_thread(self._produce)
|
|
@@ -179,6 +187,8 @@ class AsyncMapper(Generic[InputT, ResultT]):
|
|
|
179
187
|
self.shutdown_producer()
|
|
180
188
|
if not async_run.done():
|
|
181
189
|
async_run.cancel()
|
|
190
|
+
wait([async_run])
|
|
191
|
+
self._producer_is_shutdown.wait()
|
|
182
192
|
|
|
183
193
|
def __iter__(self):
|
|
184
194
|
return self.iterate()
|
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
import os
|
|
2
|
+
from collections.abc import Iterator
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from tempfile import mkdtemp
|
|
2
5
|
from typing import TYPE_CHECKING, Optional
|
|
3
6
|
|
|
4
7
|
from dvc_data.hashfile.db.local import LocalHashFileDB
|
|
5
8
|
from dvc_objects.fs.local import LocalFileSystem
|
|
9
|
+
from dvc_objects.fs.utils import remove
|
|
6
10
|
from fsspec.callbacks import Callback, TqdmCallback
|
|
7
11
|
|
|
8
12
|
from .progress import Tqdm
|
|
@@ -20,6 +24,23 @@ def try_scandir(path):
|
|
|
20
24
|
pass
|
|
21
25
|
|
|
22
26
|
|
|
27
|
+
def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "DataChainCache":
|
|
28
|
+
cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
|
|
29
|
+
return DataChainCache(cache_dir, tmp_dir=tmp_dir)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@contextmanager
|
|
33
|
+
def temporary_cache(
|
|
34
|
+
tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
|
|
35
|
+
) -> Iterator["DataChainCache"]:
|
|
36
|
+
cache = get_temp_cache(tmp_dir, prefix=prefix)
|
|
37
|
+
try:
|
|
38
|
+
yield cache
|
|
39
|
+
finally:
|
|
40
|
+
if delete:
|
|
41
|
+
cache.destroy()
|
|
42
|
+
|
|
43
|
+
|
|
23
44
|
class DataChainCache:
|
|
24
45
|
def __init__(self, cache_dir: str, tmp_dir: str):
|
|
25
46
|
self.odb = LocalHashFileDB(
|
|
@@ -28,6 +49,9 @@ class DataChainCache:
|
|
|
28
49
|
tmp_dir=tmp_dir,
|
|
29
50
|
)
|
|
30
51
|
|
|
52
|
+
def __eq__(self, other) -> bool:
|
|
53
|
+
return self.odb == other.odb
|
|
54
|
+
|
|
31
55
|
@property
|
|
32
56
|
def cache_dir(self):
|
|
33
57
|
return self.odb.path
|
|
@@ -63,7 +87,7 @@ class DataChainCache:
|
|
|
63
87
|
if size < 0:
|
|
64
88
|
size = await client.get_size(from_path, version_id=file.version)
|
|
65
89
|
cb = callback or TqdmCallback(
|
|
66
|
-
tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True},
|
|
90
|
+
tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True, "leave": False},
|
|
67
91
|
tqdm_cls=Tqdm,
|
|
68
92
|
size=size,
|
|
69
93
|
)
|
|
@@ -82,20 +106,18 @@ class DataChainCache:
|
|
|
82
106
|
os.unlink(tmp_info)
|
|
83
107
|
|
|
84
108
|
def store_data(self, file: "File", contents: bytes) -> None:
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
# Create the file only if it's not already in cache
|
|
89
|
-
os.makedirs(os.path.dirname(dst), exist_ok=True)
|
|
90
|
-
with open(dst, mode="wb") as f:
|
|
91
|
-
f.write(contents)
|
|
92
|
-
|
|
93
|
-
def clear(self):
|
|
109
|
+
self.odb.add_bytes(file.get_hash(), contents)
|
|
110
|
+
|
|
111
|
+
def clear(self) -> None:
    """
    Empty the cache by delegating to ``self.odb.clear()``.
    """
    object_db = self.odb
    object_db.clear()
|
|
98
116
|
|
|
117
|
+
def destroy(self) -> None:
    """
    Remove the cache directory itself.
    """
    # `clear` leaves the prefix directory structure intact, so the whole
    # directory is removed here instead.
    cache_root = self.cache_dir
    remove(cache_root)
|
|
120
|
+
|
|
99
121
|
def get_total_size(self) -> int:
|
|
100
122
|
total = 0
|
|
101
123
|
for subdir in try_scandir(self.odb.path):
|
|
@@ -405,6 +405,7 @@ def get_download_bar(bar_format: str, total_size: int):
|
|
|
405
405
|
unit_scale=True,
|
|
406
406
|
unit_divisor=1000,
|
|
407
407
|
total=total_size,
|
|
408
|
+
leave=False,
|
|
408
409
|
)
|
|
409
410
|
|
|
410
411
|
|
|
@@ -429,6 +430,7 @@ def instantiate_node_groups(
|
|
|
429
430
|
unit_scale=True,
|
|
430
431
|
unit_divisor=1000,
|
|
431
432
|
total=total_files,
|
|
433
|
+
leave=False,
|
|
432
434
|
)
|
|
433
435
|
)
|
|
434
436
|
|
|
@@ -534,6 +536,12 @@ def find_column_to_str( # noqa: PLR0911
|
|
|
534
536
|
return ""
|
|
535
537
|
|
|
536
538
|
|
|
539
|
+
def clone_catalog_with_cache(catalog: "Catalog", cache: "DataChainCache") -> "Catalog":
    """
    Return a copy of ``catalog`` whose ``cache`` attribute is ``cache``.

    The copy comes from ``catalog.copy()``; the ``cache`` attribute is
    assigned on the copy only, not on ``catalog``.
    """
    duplicate = catalog.copy()
    setattr(duplicate, "cache", cache)
    return duplicate
|
|
543
|
+
|
|
544
|
+
|
|
537
545
|
class Catalog:
|
|
538
546
|
def __init__(
|
|
539
547
|
self,
|
|
@@ -1242,10 +1250,17 @@ class Catalog:
|
|
|
1242
1250
|
path: str,
|
|
1243
1251
|
version_id: Optional[str] = None,
|
|
1244
1252
|
client_config=None,
|
|
1253
|
+
content_disposition: Optional[str] = None,
|
|
1254
|
+
**kwargs,
|
|
1245
1255
|
) -> str:
|
|
1246
1256
|
client_config = client_config or self.client_config
|
|
1247
1257
|
client = Client.get_client(source, self.cache, **client_config)
|
|
1248
|
-
return client.url(
|
|
1258
|
+
return client.url(
|
|
1259
|
+
path,
|
|
1260
|
+
version_id=version_id,
|
|
1261
|
+
content_disposition=content_disposition,
|
|
1262
|
+
**kwargs,
|
|
1263
|
+
)
|
|
1249
1264
|
|
|
1250
1265
|
def export_dataset_table(
|
|
1251
1266
|
self,
|
|
@@ -1437,6 +1452,7 @@ class Catalog:
|
|
|
1437
1452
|
unit_scale=True,
|
|
1438
1453
|
unit_divisor=1000,
|
|
1439
1454
|
total=ds_stats.num_objects, # type: ignore [union-attr]
|
|
1455
|
+
leave=False,
|
|
1440
1456
|
)
|
|
1441
1457
|
|
|
1442
1458
|
schema = DatasetRecord.parse_schema(remote_ds_version.schema)
|
|
@@ -31,8 +31,12 @@ class AzureClient(Client):
|
|
|
31
31
|
Generate a signed URL for the given path.
|
|
32
32
|
"""
|
|
33
33
|
version_id = kwargs.pop("version_id", None)
|
|
34
|
+
content_disposition = kwargs.pop("content_disposition", None)
|
|
34
35
|
result = self.fs.sign(
|
|
35
|
-
self.get_full_path(path, version_id),
|
|
36
|
+
self.get_full_path(path, version_id),
|
|
37
|
+
expiration=expires,
|
|
38
|
+
content_disposition=content_disposition,
|
|
39
|
+
**kwargs,
|
|
36
40
|
)
|
|
37
41
|
return result + (f"&versionid={version_id}" if version_id else "")
|
|
38
42
|
|
|
@@ -42,7 +46,7 @@ class AzureClient(Client):
|
|
|
42
46
|
prefix = prefix.lstrip(DELIMITER) + DELIMITER
|
|
43
47
|
found = False
|
|
44
48
|
try:
|
|
45
|
-
with tqdm(desc=f"Listing {self.uri}", unit=" objects") as pbar:
|
|
49
|
+
with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
|
|
46
50
|
async with self.fs.service_client.get_container_client(
|
|
47
51
|
container=self.name
|
|
48
52
|
) as container_client:
|
|
@@ -249,7 +249,7 @@ class Client(ABC):
|
|
|
249
249
|
await main_task
|
|
250
250
|
|
|
251
251
|
async def _fetch_nested(self, start_prefix: str, result_queue: ResultQueue) -> None:
|
|
252
|
-
progress_bar = tqdm(desc=f"Listing {self.uri}", unit=" objects")
|
|
252
|
+
progress_bar = tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False)
|
|
253
253
|
loop = get_loop()
|
|
254
254
|
|
|
255
255
|
queue: asyncio.Queue[str] = asyncio.Queue()
|
|
@@ -39,11 +39,15 @@ class GCSClient(Client):
|
|
|
39
39
|
(see https://cloud.google.com/storage/docs/access-public-data#api-link).
|
|
40
40
|
"""
|
|
41
41
|
version_id = kwargs.pop("version_id", None)
|
|
42
|
+
content_disposition = kwargs.pop("content_disposition", None)
|
|
42
43
|
if self.fs.storage_options.get("token") == "anon":
|
|
43
44
|
query = f"?generation={version_id}" if version_id else ""
|
|
44
45
|
return f"https://storage.googleapis.com/{self.name}/{path}{query}"
|
|
45
46
|
return self.fs.sign(
|
|
46
|
-
self.get_full_path(path, version_id),
|
|
47
|
+
self.get_full_path(path, version_id),
|
|
48
|
+
expiration=expires,
|
|
49
|
+
response_disposition=content_disposition,
|
|
50
|
+
**kwargs,
|
|
47
51
|
)
|
|
48
52
|
|
|
49
53
|
@staticmethod
|
|
@@ -83,7 +87,7 @@ class GCSClient(Client):
|
|
|
83
87
|
self, page_queue: PageQueue, result_queue: ResultQueue
|
|
84
88
|
) -> bool:
|
|
85
89
|
found = False
|
|
86
|
-
with tqdm(desc=f"Listing {self.uri}", unit=" objects") as pbar:
|
|
90
|
+
with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
|
|
87
91
|
while (page := await page_queue.get()) is not None:
|
|
88
92
|
if page:
|
|
89
93
|
found = True
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import os
|
|
2
3
|
from typing import Any, Optional, cast
|
|
3
4
|
from urllib.parse import parse_qs, urlsplit, urlunsplit
|
|
4
5
|
|
|
@@ -31,9 +32,11 @@ class ClientS3(Client):
|
|
|
31
32
|
if "aws_token" in kwargs:
|
|
32
33
|
kwargs.setdefault("token", kwargs.pop("aws_token"))
|
|
33
34
|
|
|
34
|
-
#
|
|
35
|
-
|
|
36
|
-
|
|
35
|
+
# remove this `if` when https://github.com/fsspec/s3fs/pull/929 lands
|
|
36
|
+
if not os.environ.get("AWS_REGION") and not os.environ.get("AWS_ENDPOINT_URL"):
|
|
37
|
+
# caching bucket regions to use the right one in signed urls, otherwise
|
|
38
|
+
# it tries to randomly guess and creates wrong signature
|
|
39
|
+
kwargs.setdefault("cache_regions", True)
|
|
37
40
|
|
|
38
41
|
# We want to use newer v4 signature version since regions added after
|
|
39
42
|
# 2014 are not going to support v2 which is the older one.
|
|
@@ -51,6 +54,21 @@ class ClientS3(Client):
|
|
|
51
54
|
|
|
52
55
|
return cast(S3FileSystem, super().create_fs(**kwargs))
|
|
53
56
|
|
|
57
|
+
def url(self, path: str, expires: int = 3600, **kwargs) -> str:
    """
    Generate a signed URL for the given path.

    ``version_id`` and ``content_disposition`` are taken out of ``kwargs``.
    A truthy ``content_disposition`` is forwarded to ``fs.sign`` under the
    ``ResponseContentDisposition`` key; everything left in ``kwargs`` is
    passed through unchanged, with ``expires`` sent as ``expiration``.
    """
    disposition = kwargs.pop("content_disposition", None)
    object_version = kwargs.pop("version_id", None)
    if disposition:
        kwargs["ResponseContentDisposition"] = disposition

    full_path = self.get_full_path(path, object_version)
    return self.fs.sign(full_path, expiration=expires, **kwargs)
|
|
71
|
+
|
|
54
72
|
async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
|
|
55
73
|
async def get_pages(it, page_queue):
|
|
56
74
|
try:
|
|
@@ -61,7 +79,7 @@ class ClientS3(Client):
|
|
|
61
79
|
|
|
62
80
|
async def process_pages(page_queue, result_queue):
|
|
63
81
|
found = False
|
|
64
|
-
with tqdm(desc=f"Listing {self.uri}", unit=" objects") as pbar:
|
|
82
|
+
with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
|
|
65
83
|
while (res := await page_queue.get()) is not None:
|
|
66
84
|
if res:
|
|
67
85
|
found = True
|
|
@@ -79,6 +79,15 @@ class DatabaseEngine(ABC, Serializable):
|
|
|
79
79
|
conn: Optional[Any] = None,
|
|
80
80
|
) -> Iterator[tuple[Any, ...]]: ...
|
|
81
81
|
|
|
82
|
+
def get_table(self, name: str) -> "Table":
    """
    Return the table called ``name`` from this engine's metadata.

    A table already registered in ``self.metadata`` is returned as is.
    Otherwise it is loaded via ``autoload_with=self.engine`` and then
    looked up in the metadata.
    """
    existing = self.metadata.tables.get(name)
    if existing is not None:
        return existing

    # The object returned by this constructor may not be correctly
    # initialised on some dialects, so it is discarded and the table is
    # grabbed from metadata instead.
    sa.Table(name, self.metadata, autoload_with=self.engine)
    return self.metadata.tables[name]
|
|
90
|
+
|
|
82
91
|
@abstractmethod
|
|
83
92
|
def executemany(
|
|
84
93
|
self, query, params, cursor: Optional[Any] = None
|
|
@@ -16,7 +16,6 @@ from datachain.sql.functions import path as pathfunc
|
|
|
16
16
|
from datachain.sql.types import Int, SQLType, UInt64
|
|
17
17
|
|
|
18
18
|
if TYPE_CHECKING:
|
|
19
|
-
from sqlalchemy import Engine
|
|
20
19
|
from sqlalchemy.engine.interfaces import Dialect
|
|
21
20
|
from sqlalchemy.sql.base import (
|
|
22
21
|
ColumnCollection,
|
|
@@ -25,6 +24,8 @@ if TYPE_CHECKING:
|
|
|
25
24
|
)
|
|
26
25
|
from sqlalchemy.sql.elements import ColumnElement
|
|
27
26
|
|
|
27
|
+
from datachain.data_storage.db_engine import DatabaseEngine
|
|
28
|
+
|
|
28
29
|
|
|
29
30
|
DEFAULT_DELIMITER = "__"
|
|
30
31
|
|
|
@@ -150,14 +151,12 @@ class DataTable:
|
|
|
150
151
|
def __init__(
|
|
151
152
|
self,
|
|
152
153
|
name: str,
|
|
153
|
-
engine: "
|
|
154
|
-
metadata: Optional["sa.MetaData"] = None,
|
|
154
|
+
engine: "DatabaseEngine",
|
|
155
155
|
column_types: Optional[dict[str, SQLType]] = None,
|
|
156
156
|
object_name: str = "file",
|
|
157
157
|
):
|
|
158
158
|
self.name: str = name
|
|
159
159
|
self.engine = engine
|
|
160
|
-
self.metadata: sa.MetaData = metadata if metadata is not None else sa.MetaData()
|
|
161
160
|
self.column_types: dict[str, SQLType] = column_types or {}
|
|
162
161
|
self.object_name = object_name
|
|
163
162
|
|
|
@@ -211,12 +210,7 @@ class DataTable:
|
|
|
211
210
|
return sa.Table(name, metadata, *columns)
|
|
212
211
|
|
|
213
212
|
def get_table(self) -> "sa.Table":
|
|
214
|
-
table = self.
|
|
215
|
-
if table is None:
|
|
216
|
-
sa.Table(self.name, self.metadata, autoload_with=self.engine)
|
|
217
|
-
# ^^^ This table may not be correctly initialised on some dialects
|
|
218
|
-
# Grab it from metadata instead.
|
|
219
|
-
table = self.metadata.tables[self.name]
|
|
213
|
+
table = self.engine.get_table(self.name)
|
|
220
214
|
|
|
221
215
|
column_types = self.column_types | {c.name: c.type for c in self.sys_columns()}
|
|
222
216
|
# adjusting types for custom columns to be instances of SQLType if possible
|
|
@@ -186,6 +186,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
186
186
|
self.db_file = db_file
|
|
187
187
|
self.is_closed = False
|
|
188
188
|
|
|
189
|
+
def get_table(self, name: str) -> Table:
    """
    Look up ``name`` via the base implementation, reconnecting first if
    this engine has been closed.
    """
    was_closed = self.is_closed
    if was_closed:
        # Reconnect in case of being closed previously.
        self._reconnect()
    return super().get_table(name)
|
|
194
|
+
|
|
189
195
|
@retry_sqlite_locks
|
|
190
196
|
def execute(
|
|
191
197
|
self,
|
|
@@ -670,7 +676,7 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
670
676
|
]
|
|
671
677
|
table = self.create_udf_table(columns)
|
|
672
678
|
|
|
673
|
-
with tqdm(desc="Preparing", unit=" rows") as pbar:
|
|
679
|
+
with tqdm(desc="Preparing", unit=" rows", leave=False) as pbar:
|
|
674
680
|
self.copy_table(table, query, progress_cb=pbar.update)
|
|
675
681
|
|
|
676
682
|
return table
|
|
@@ -191,8 +191,7 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
191
191
|
table_name = self.dataset_table_name(dataset.name, version)
|
|
192
192
|
return self.schema.dataset_row_cls(
|
|
193
193
|
table_name,
|
|
194
|
-
self.db
|
|
195
|
-
self.db.metadata,
|
|
194
|
+
self.db,
|
|
196
195
|
dataset.get_schema(version),
|
|
197
196
|
object_name=object_name,
|
|
198
197
|
)
|
|
@@ -904,8 +903,11 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
904
903
|
This should be implemented to ensure that the provided tables
|
|
905
904
|
are cleaned up as soon as they are no longer needed.
|
|
906
905
|
"""
|
|
907
|
-
|
|
908
|
-
|
|
906
|
+
to_drop = set(names)
|
|
907
|
+
with tqdm(
|
|
908
|
+
desc="Cleanup", unit=" tables", total=len(to_drop), leave=False
|
|
909
|
+
) as pbar:
|
|
910
|
+
for name in to_drop:
|
|
909
911
|
self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
|
|
910
912
|
pbar.update(1)
|
|
911
913
|
|