datachain 0.9.1__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.9.1 → datachain-0.10.0}/PKG-INFO +3 -3
- {datachain-0.9.1 → datachain-0.10.0}/README.rst +2 -2
- {datachain-0.9.1 → datachain-0.10.0}/docs/quick-start.md +2 -2
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/fsspec.py +1 -1
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/dc.py +60 -4
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/file.py +16 -5
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/dataset.py +2 -2
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/session.py +15 -3
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/PKG-INFO +3 -3
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/conftest.py +2 -2
- datachain-0.10.0/tests/func/test_cloud_transfer.py +68 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_datachain.py +16 -6
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_pull.py +1 -1
- {datachain-0.9.1 → datachain-0.10.0}/.cruft.json +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.gitattributes +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.github/codecov.yaml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.github/dependabot.yml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.github/workflows/release.yml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.gitignore +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/.pre-commit-config.yaml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/LICENSE +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/contributing.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/examples.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/index.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/overrides/main.html +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/datachain.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/func.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/index.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/toolkit.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/torch.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/references/udf.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/docs/tutorials.md +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/mkdocs.yml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/noxfile.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/pyproject.toml +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/setup.cfg +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/__main__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/asyn.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cache.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/local.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/config.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/dataset.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/error.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/array.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/base.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/func.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/path.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/random.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/string.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/window.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/job.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/listing.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/node.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/progress.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/py.typed +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/params.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/studio.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/data.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/examples/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_client.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_file.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_hf.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_listing.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_ls.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_query.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_session.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/test_atomicity.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/test_cli_studio.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/test_telemetry.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_video.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_client.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_config.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_func.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_query.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_session.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.9.1 → datachain-0.10.0}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.9.1
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -175,7 +175,7 @@ high confidence scores.
|
|
|
175
175
|
|
|
176
176
|
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
177
177
|
& (Column("meta.inference.class_") == "cat"))
|
|
178
|
-
likely_cats.export_files("high-confidence-cats/", signal="file")
|
|
178
|
+
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
179
179
|
|
|
180
180
|
|
|
181
181
|
Example: LLM based text-file evaluation
|
|
@@ -216,7 +216,7 @@ Python code:
|
|
|
216
216
|
)
|
|
217
217
|
|
|
218
218
|
successful_chain = chain.filter(Column("is_success") == True)
|
|
219
|
-
successful_chain.export_files("./output_mistral")
|
|
219
|
+
successful_chain.to_storage("./output_mistral")
|
|
220
220
|
|
|
221
221
|
print(f"{successful_chain.count()} files were exported")
|
|
222
222
|
|
|
@@ -68,7 +68,7 @@ high confidence scores.
|
|
|
68
68
|
|
|
69
69
|
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
70
70
|
& (Column("meta.inference.class_") == "cat"))
|
|
71
|
-
likely_cats.export_files("high-confidence-cats/", signal="file")
|
|
71
|
+
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
72
72
|
|
|
73
73
|
|
|
74
74
|
Example: LLM based text-file evaluation
|
|
@@ -109,7 +109,7 @@ Python code:
|
|
|
109
109
|
)
|
|
110
110
|
|
|
111
111
|
successful_chain = chain.filter(Column("is_success") == True)
|
|
112
|
-
successful_chain.export_files("./output_mistral")
|
|
112
|
+
successful_chain.to_storage("./output_mistral")
|
|
113
113
|
|
|
114
114
|
print(f"{successful_chain.count()} files were exported")
|
|
115
115
|
|
|
@@ -47,7 +47,7 @@ annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
|
47
47
|
|
|
48
48
|
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
49
49
|
& (Column("meta.inference.class_") == "cat"))
|
|
50
|
-
likely_cats.export_files("high-confidence-cats/", signal="file")
|
|
50
|
+
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
51
51
|
```
|
|
52
52
|
|
|
53
53
|
## Data curation with a local AI model
|
|
@@ -85,7 +85,7 @@ chain = (
|
|
|
85
85
|
)
|
|
86
86
|
|
|
87
87
|
positive_chain = chain.filter(Column("is_positive") == True)
|
|
88
|
-
positive_chain.export_files("./output")
|
|
88
|
+
positive_chain.to_storage("./output")
|
|
89
89
|
|
|
90
90
|
print(f"{positive_chain.count()} files were exported")
|
|
91
91
|
```
|
|
@@ -390,7 +390,7 @@ class Client(ABC):
|
|
|
390
390
|
) # type: ignore[return-value]
|
|
391
391
|
|
|
392
392
|
def upload(self, data: bytes, path: str) -> "File":
|
|
393
|
-
full_path = self.get_full_path(path)
|
|
393
|
+
full_path = path if path.startswith(self.PREFIX) else self.get_full_path(path)
|
|
394
394
|
|
|
395
395
|
parent = posixpath.dirname(full_path)
|
|
396
396
|
self.fs.makedirs(parent, exist_ok=True)
|
|
@@ -411,6 +411,7 @@ class DataChain:
|
|
|
411
411
|
object_name: str = "file",
|
|
412
412
|
update: bool = False,
|
|
413
413
|
anon: bool = False,
|
|
414
|
+
client_config: Optional[dict] = None,
|
|
414
415
|
) -> "Self":
|
|
415
416
|
"""Get data from a storage as a list of file with all file attributes.
|
|
416
417
|
It returns the chain itself as usual.
|
|
@@ -423,15 +424,32 @@ class DataChain:
|
|
|
423
424
|
object_name : Created object column name.
|
|
424
425
|
update : force storage reindexing. Default is False.
|
|
425
426
|
anon : If True, we will treat cloud bucket as public one
|
|
427
|
+
client_config : Optional client configuration for the storage client.
|
|
426
428
|
|
|
427
429
|
Example:
|
|
430
|
+
Simple call from s3
|
|
428
431
|
```py
|
|
429
432
|
chain = DataChain.from_storage("s3://my-bucket/my-dir")
|
|
430
433
|
```
|
|
434
|
+
|
|
435
|
+
With AWS S3-compatible storage
|
|
436
|
+
```py
|
|
437
|
+
chain = DataChain.from_storage(
|
|
438
|
+
"s3://my-bucket/my-dir",
|
|
439
|
+
client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
|
|
440
|
+
)
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
Pass existing session
|
|
444
|
+
```py
|
|
445
|
+
session = Session.get()
|
|
446
|
+
chain = DataChain.from_storage("s3://my-bucket/my-dir", session=session)
|
|
447
|
+
```
|
|
431
448
|
"""
|
|
432
449
|
file_type = get_file_type(type)
|
|
433
450
|
|
|
434
|
-
|
|
451
|
+
if anon:
|
|
452
|
+
client_config = (client_config or {}) | {"anon": True}
|
|
435
453
|
session = Session.get(session, client_config=client_config, in_memory=in_memory)
|
|
436
454
|
cache = session.catalog.cache
|
|
437
455
|
client_config = session.catalog.client_config
|
|
@@ -481,25 +499,56 @@ class DataChain:
|
|
|
481
499
|
version: Optional[int] = None,
|
|
482
500
|
session: Optional[Session] = None,
|
|
483
501
|
settings: Optional[dict] = None,
|
|
484
|
-
|
|
502
|
+
fallback_to_studio: bool = True,
|
|
485
503
|
) -> "Self":
|
|
486
504
|
"""Get data from a saved Dataset. It returns the chain itself.
|
|
505
|
+
If dataset or version is not found locally, it will try to pull it from Studio.
|
|
487
506
|
|
|
488
507
|
Parameters:
|
|
489
508
|
name : dataset name
|
|
490
509
|
version : dataset version
|
|
510
|
+
session : Session to use for the chain.
|
|
511
|
+
settings : Settings to use for the chain.
|
|
512
|
+
fallback_to_studio : Try to pull dataset from Studio if not found locally.
|
|
513
|
+
Default is True.
|
|
491
514
|
|
|
492
515
|
Example:
|
|
493
516
|
```py
|
|
494
517
|
chain = DataChain.from_dataset("my_cats")
|
|
495
518
|
```
|
|
519
|
+
|
|
520
|
+
```py
|
|
521
|
+
chain = DataChain.from_dataset("my_cats", fallback_to_studio=False)
|
|
522
|
+
```
|
|
523
|
+
|
|
524
|
+
```py
|
|
525
|
+
chain = DataChain.from_dataset("my_cats", version=1)
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
```py
|
|
529
|
+
session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
|
|
530
|
+
settings = {
|
|
531
|
+
"cache": True,
|
|
532
|
+
"parallel": 4,
|
|
533
|
+
"workers": 4,
|
|
534
|
+
"min_task_size": 1000,
|
|
535
|
+
"prefetch": 10,
|
|
536
|
+
}
|
|
537
|
+
chain = DataChain.from_dataset(
|
|
538
|
+
name="my_cats",
|
|
539
|
+
version=1,
|
|
540
|
+
session=session,
|
|
541
|
+
settings=settings,
|
|
542
|
+
fallback_to_studio=True,
|
|
543
|
+
)
|
|
544
|
+
```
|
|
496
545
|
"""
|
|
497
546
|
query = DatasetQuery(
|
|
498
547
|
name=name,
|
|
499
548
|
version=version,
|
|
500
549
|
session=session,
|
|
501
550
|
indexing_column_types=File._datachain_column_types,
|
|
502
|
-
|
|
551
|
+
fallback_to_studio=fallback_to_studio,
|
|
503
552
|
)
|
|
504
553
|
telemetry.send_event_once("class", "datachain_init", name=name, version=version)
|
|
505
554
|
if settings:
|
|
@@ -2444,7 +2493,7 @@ class DataChain:
|
|
|
2444
2493
|
self._setup = self._setup | kwargs
|
|
2445
2494
|
return self
|
|
2446
2495
|
|
|
2447
|
-
def export_files(
|
|
2496
|
+
def to_storage(
|
|
2448
2497
|
self,
|
|
2449
2498
|
output: str,
|
|
2450
2499
|
signal: str = "file",
|
|
@@ -2462,6 +2511,13 @@ class DataChain:
|
|
|
2462
2511
|
use_cache: If `True`, cache the files before exporting.
|
|
2463
2512
|
link_type: Method to use for exporting files.
|
|
2464
2513
|
Falls back to `'copy'` if symlinking fails.
|
|
2514
|
+
|
|
2515
|
+
Example:
|
|
2516
|
+
Cross cloud transfer
|
|
2517
|
+
```py
|
|
2518
|
+
ds = DataChain.from_storage("s3://mybucket")
|
|
2519
|
+
ds.to_storage("gs://mybucket", placement="filename")
|
|
2520
|
+
```
|
|
2465
2521
|
"""
|
|
2466
2522
|
if placement == "filename" and (
|
|
2467
2523
|
self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
|
|
@@ -17,6 +17,7 @@ from urllib.parse import unquote, urlparse
|
|
|
17
17
|
from urllib.request import url2pathname
|
|
18
18
|
|
|
19
19
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
20
|
+
from fsspec.utils import stringify_path
|
|
20
21
|
from PIL import Image as PilImage
|
|
21
22
|
from pydantic import Field, field_validator
|
|
22
23
|
|
|
@@ -270,8 +271,9 @@ class File(DataModel):
|
|
|
270
271
|
|
|
271
272
|
def save(self, destination: str):
|
|
272
273
|
"""Writes it's content to destination"""
|
|
273
|
-
|
|
274
|
-
|
|
274
|
+
destination = stringify_path(destination)
|
|
275
|
+
client: Client = self._catalog.get_client(str(destination))
|
|
276
|
+
client.upload(self.read(), str(destination))
|
|
275
277
|
|
|
276
278
|
def _symlink_to(self, destination: str):
|
|
277
279
|
if self.location:
|
|
@@ -285,6 +287,7 @@ class File(DataModel):
|
|
|
285
287
|
source = self.get_path()
|
|
286
288
|
else:
|
|
287
289
|
raise OSError(errno.EXDEV, "can't link across filesystems")
|
|
290
|
+
|
|
288
291
|
return os.symlink(source, destination)
|
|
289
292
|
|
|
290
293
|
def export(
|
|
@@ -299,7 +302,8 @@ class File(DataModel):
|
|
|
299
302
|
self._caching_enabled = use_cache
|
|
300
303
|
dst = self.get_destination_path(output, placement)
|
|
301
304
|
dst_dir = os.path.dirname(dst)
|
|
302
|
-
|
|
305
|
+
client: Client = self._catalog.get_client(dst_dir)
|
|
306
|
+
client.fs.makedirs(dst_dir, exist_ok=True)
|
|
303
307
|
|
|
304
308
|
if link_type == "symlink":
|
|
305
309
|
try:
|
|
@@ -496,7 +500,10 @@ class TextFile(File):
|
|
|
496
500
|
|
|
497
501
|
def save(self, destination: str):
|
|
498
502
|
"""Writes it's content to destination"""
|
|
499
|
-
|
|
503
|
+
destination = stringify_path(destination)
|
|
504
|
+
|
|
505
|
+
client: Client = self._catalog.get_client(destination)
|
|
506
|
+
with client.fs.open(destination, mode="w") as f:
|
|
500
507
|
f.write(self.read_text())
|
|
501
508
|
|
|
502
509
|
|
|
@@ -510,7 +517,11 @@ class ImageFile(File):
|
|
|
510
517
|
|
|
511
518
|
def save(self, destination: str):
|
|
512
519
|
"""Writes it's content to destination"""
|
|
513
|
-
|
|
520
|
+
destination = stringify_path(destination)
|
|
521
|
+
|
|
522
|
+
client: Client = self._catalog.get_client(destination)
|
|
523
|
+
with client.fs.open(destination, mode="wb") as f:
|
|
524
|
+
self.read().save(f)
|
|
514
525
|
|
|
515
526
|
|
|
516
527
|
class Image(DataModel):
|
|
@@ -1085,7 +1085,7 @@ class DatasetQuery:
|
|
|
1085
1085
|
session: Optional[Session] = None,
|
|
1086
1086
|
indexing_column_types: Optional[dict[str, Any]] = None,
|
|
1087
1087
|
in_memory: bool = False,
|
|
1088
|
-
|
|
1088
|
+
fallback_to_studio: bool = True,
|
|
1089
1089
|
) -> None:
|
|
1090
1090
|
self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
|
|
1091
1091
|
self.catalog = catalog or self.session.catalog
|
|
@@ -1103,7 +1103,7 @@ class DatasetQuery:
|
|
|
1103
1103
|
|
|
1104
1104
|
self.name = name
|
|
1105
1105
|
|
|
1106
|
-
if
|
|
1106
|
+
if fallback_to_studio and is_token_set():
|
|
1107
1107
|
ds = self.catalog.get_dataset_with_remote_fallback(name, version)
|
|
1108
1108
|
else:
|
|
1109
1109
|
ds = self.catalog.get_dataset(name)
|
|
@@ -139,21 +139,33 @@ class Session:
|
|
|
139
139
|
|
|
140
140
|
# Access the active (most recent) context from the stack
|
|
141
141
|
if cls.SESSION_CONTEXTS:
|
|
142
|
-
|
|
142
|
+
session = cls.SESSION_CONTEXTS[-1]
|
|
143
143
|
|
|
144
|
-
|
|
144
|
+
elif cls.GLOBAL_SESSION_CTX is None:
|
|
145
145
|
cls.GLOBAL_SESSION_CTX = Session(
|
|
146
146
|
cls.GLOBAL_SESSION_NAME,
|
|
147
147
|
catalog,
|
|
148
148
|
client_config=client_config,
|
|
149
149
|
in_memory=in_memory,
|
|
150
150
|
)
|
|
151
|
+
session = cls.GLOBAL_SESSION_CTX
|
|
151
152
|
|
|
152
153
|
atexit.register(cls._global_cleanup)
|
|
153
154
|
cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
|
|
154
155
|
sys.excepthook = cls.except_hook
|
|
156
|
+
else:
|
|
157
|
+
session = cls.GLOBAL_SESSION_CTX
|
|
155
158
|
|
|
156
|
-
|
|
159
|
+
if client_config and session.catalog.client_config != client_config:
|
|
160
|
+
session = Session(
|
|
161
|
+
"session" + uuid4().hex[:4],
|
|
162
|
+
catalog,
|
|
163
|
+
client_config=client_config,
|
|
164
|
+
in_memory=in_memory,
|
|
165
|
+
)
|
|
166
|
+
session.__enter__()
|
|
167
|
+
|
|
168
|
+
return session
|
|
157
169
|
|
|
158
170
|
@staticmethod
|
|
159
171
|
def except_hook(exc_type, exc_value, exc_traceback):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.9.1
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -175,7 +175,7 @@ high confidence scores.
|
|
|
175
175
|
|
|
176
176
|
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
177
177
|
& (Column("meta.inference.class_") == "cat"))
|
|
178
|
-
likely_cats.export_files("high-confidence-cats/", signal="file")
|
|
178
|
+
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
179
179
|
|
|
180
180
|
|
|
181
181
|
Example: LLM based text-file evaluation
|
|
@@ -216,7 +216,7 @@ Python code:
|
|
|
216
216
|
)
|
|
217
217
|
|
|
218
218
|
successful_chain = chain.filter(Column("is_success") == True)
|
|
219
|
-
successful_chain.export_files("./output_mistral")
|
|
219
|
+
successful_chain.to_storage("./output_mistral")
|
|
220
220
|
|
|
221
221
|
print(f"{successful_chain.count()} files were exported")
|
|
222
222
|
|
|
@@ -232,6 +232,7 @@ tests/func/__init__.py
|
|
|
232
232
|
tests/func/fake-service-account-credentials.json
|
|
233
233
|
tests/func/test_catalog.py
|
|
234
234
|
tests/func/test_client.py
|
|
235
|
+
tests/func/test_cloud_transfer.py
|
|
235
236
|
tests/func/test_data_storage.py
|
|
236
237
|
tests/func/test_datachain.py
|
|
237
238
|
tests/func/test_datachain_merge.py
|
|
@@ -472,9 +472,9 @@ def cloud_server_credentials(cloud_server, monkeypatch):
|
|
|
472
472
|
|
|
473
473
|
def get_cloud_test_catalog(cloud_server, tmp_path, metastore, warehouse):
|
|
474
474
|
cache_dir = tmp_path / ".datachain" / "cache"
|
|
475
|
-
cache_dir.mkdir(parents=True)
|
|
475
|
+
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
476
476
|
tmpfile_dir = tmp_path / ".datachain" / "tmp"
|
|
477
|
-
tmpfile_dir.mkdir()
|
|
477
|
+
tmpfile_dir.mkdir(exist_ok=True)
|
|
478
478
|
|
|
479
479
|
catalog = Catalog(
|
|
480
480
|
metastore=metastore,
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from datachain import Session
|
|
4
|
+
from datachain.lib.dc import DataChain
|
|
5
|
+
from tests.conftest import get_cloud_test_catalog, make_cloud_server
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_cross_cloud_transfer(
|
|
9
|
+
request,
|
|
10
|
+
tmp_upath_factory,
|
|
11
|
+
tree,
|
|
12
|
+
tmp_path,
|
|
13
|
+
metastore,
|
|
14
|
+
warehouse,
|
|
15
|
+
):
|
|
16
|
+
disabled_remotes = request.config.getoption("--disable-remotes") or []
|
|
17
|
+
|
|
18
|
+
if any(remote in disabled_remotes for remote in ["azure", "gs", "all"]):
|
|
19
|
+
pytest.skip("Skipping all tests for azure, gs or all remotes")
|
|
20
|
+
|
|
21
|
+
azure_path = tmp_upath_factory.mktemp("azure", version_aware=False)
|
|
22
|
+
azure_server = make_cloud_server(azure_path, "azure", tree)
|
|
23
|
+
|
|
24
|
+
gcloud_path = tmp_upath_factory.mktemp("gs", version_aware=False)
|
|
25
|
+
gcloud_server = make_cloud_server(gcloud_path, "gs", tree)
|
|
26
|
+
|
|
27
|
+
# Initialize cloud catalogs
|
|
28
|
+
azure_catalog = get_cloud_test_catalog(azure_server, tmp_path, metastore, warehouse)
|
|
29
|
+
gcloud_catalog = get_cloud_test_catalog(
|
|
30
|
+
gcloud_server, tmp_path, metastore, warehouse
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Define test file paths
|
|
34
|
+
test_filename = "image_1.jpg"
|
|
35
|
+
test_content = b"bytes"
|
|
36
|
+
|
|
37
|
+
source_dir = f"{azure_catalog.src_uri}/source-test-images"
|
|
38
|
+
source_file = f"{source_dir}/{test_filename}"
|
|
39
|
+
|
|
40
|
+
dest_dir = f"{gcloud_catalog.src_uri}/destination-test-images"
|
|
41
|
+
dest_file = f"{dest_dir}/{test_filename}"
|
|
42
|
+
|
|
43
|
+
# Get cloud clients
|
|
44
|
+
azure_client = azure_catalog.catalog.get_client(source_file)
|
|
45
|
+
gcloud_client = gcloud_catalog.catalog.get_client(dest_file)
|
|
46
|
+
|
|
47
|
+
try:
|
|
48
|
+
# Create test file in Azure
|
|
49
|
+
with azure_client.fs.open(source_file, "wb") as f:
|
|
50
|
+
f.write(test_content)
|
|
51
|
+
|
|
52
|
+
# Perform cross-cloud transfer
|
|
53
|
+
combined_config = azure_server.client_config | gcloud_server.client_config
|
|
54
|
+
with Session("testSession", client_config=combined_config):
|
|
55
|
+
datachain = DataChain.from_storage(source_dir)
|
|
56
|
+
datachain.to_storage(dest_dir, placement="filename")
|
|
57
|
+
|
|
58
|
+
# Verify transfer
|
|
59
|
+
with gcloud_client.fs.open(dest_file, "rb") as f:
|
|
60
|
+
assert f.read() == test_content
|
|
61
|
+
|
|
62
|
+
finally:
|
|
63
|
+
# Cleanup
|
|
64
|
+
try:
|
|
65
|
+
azure_client.fs.rm(source_dir, recursive=True)
|
|
66
|
+
gcloud_client.fs.rm(dest_dir, recursive=True)
|
|
67
|
+
except FileNotFoundError:
|
|
68
|
+
pass
|
|
@@ -64,6 +64,16 @@ def test_catalog_anon(tmp_dir, catalog, anon):
|
|
|
64
64
|
assert chain.session.catalog.client_config.get("anon", False) is anon
|
|
65
65
|
|
|
66
66
|
|
|
67
|
+
def test_from_storage_client_config(tmp_dir, catalog):
|
|
68
|
+
dc = DataChain.from_storage(tmp_dir.as_uri())
|
|
69
|
+
assert dc.session.catalog.client_config == {} # Default client config is set.
|
|
70
|
+
|
|
71
|
+
dc = DataChain.from_storage(tmp_dir.as_uri(), client_config={"anon": True})
|
|
72
|
+
assert dc.session.catalog.client_config == {
|
|
73
|
+
"anon": True
|
|
74
|
+
} # New client config is set.
|
|
75
|
+
|
|
76
|
+
|
|
67
77
|
def test_from_storage(cloud_test_catalog):
|
|
68
78
|
ctc = cloud_test_catalog
|
|
69
79
|
dc = DataChain.from_storage(ctc.src_uri, session=ctc.session)
|
|
@@ -292,20 +302,20 @@ def test_read_file(cloud_test_catalog, use_cache):
|
|
|
292
302
|
@pytest.mark.parametrize("use_cache", [True, False])
|
|
293
303
|
@pytest.mark.parametrize("file_type", ["", "binary", "text"])
|
|
294
304
|
@pytest.mark.parametrize("cloud_type", ["file"], indirect=True)
|
|
295
|
-
def test_export_files(
|
|
305
|
+
def test_to_storage(
|
|
296
306
|
tmp_dir, cloud_test_catalog, test_session, placement, use_map, use_cache, file_type
|
|
297
307
|
):
|
|
298
308
|
ctc = cloud_test_catalog
|
|
299
309
|
df = DataChain.from_storage(ctc.src_uri, type=file_type, session=test_session)
|
|
300
310
|
if use_map:
|
|
301
|
-
df.export_files(tmp_dir / "output", placement=placement, use_cache=use_cache)
|
|
311
|
+
df.to_storage(tmp_dir / "output", placement=placement, use_cache=use_cache)
|
|
302
312
|
df.map(
|
|
303
313
|
res=lambda file: file.export(
|
|
304
314
|
tmp_dir / "output", placement=placement, use_cache=use_cache
|
|
305
315
|
)
|
|
306
316
|
).exec()
|
|
307
317
|
else:
|
|
308
|
-
df.export_files(tmp_dir / "output", placement=placement)
|
|
318
|
+
df.to_storage(tmp_dir / "output", placement=placement)
|
|
309
319
|
|
|
310
320
|
expected = {
|
|
311
321
|
"description": "Cats and Dogs",
|
|
@@ -341,14 +351,14 @@ def test_export_images_files(test_session, tmp_dir, tmp_path, use_cache):
|
|
|
341
351
|
ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images
|
|
342
352
|
],
|
|
343
353
|
session=test_session,
|
|
344
|
-
).export_files(tmp_dir / "output", placement="filename", use_cache=use_cache)
|
|
354
|
+
).to_storage(tmp_dir / "output", placement="filename", use_cache=use_cache)
|
|
345
355
|
|
|
346
356
|
for img in images:
|
|
347
357
|
exported_img = Image.open(tmp_dir / "output" / img["name"])
|
|
348
358
|
assert images_equal(img["data"], exported_img)
|
|
349
359
|
|
|
350
360
|
|
|
351
|
-
def test_export_files_filename_placement_not_unique_files(tmp_dir, test_session):
|
|
361
|
+
def test_to_storage_files_filename_placement_not_unique_files(tmp_dir, test_session):
|
|
352
362
|
data = b"some\x00data\x00is\x48\x65\x6c\x57\x6f\x72\x6c\x64\xff\xffheRe"
|
|
353
363
|
bucket_name = "mybucket"
|
|
354
364
|
files = ["dir1/a.json", "dir1/dir2/a.json"]
|
|
@@ -364,7 +374,7 @@ def test_export_files_filename_placement_not_unique_files(tmp_dir, test_session)
|
|
|
364
374
|
|
|
365
375
|
df = DataChain.from_storage((tmp_dir / bucket_name).as_uri(), session=test_session)
|
|
366
376
|
with pytest.raises(ValueError):
|
|
367
|
-
df.export_files(tmp_dir / "output", placement="filename")
|
|
377
|
+
df.to_storage(tmp_dir / "output", placement="filename")
|
|
368
378
|
|
|
369
379
|
|
|
370
380
|
def test_show(capsys, test_session):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|