datachain 0.9.1__tar.gz → 0.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.9.1 → datachain-0.11.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.9.1 → datachain-0.11.0}/PKG-INFO +5 -4
- {datachain-0.9.1 → datachain-0.11.0}/README.rst +2 -2
- {datachain-0.9.1 → datachain-0.11.0}/docs/quick-start.md +2 -2
- {datachain-0.9.1 → datachain-0.11.0}/pyproject.toml +3 -2
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/fsspec.py +1 -1
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/dc.py +60 -4
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/file.py +20 -5
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/dataset.py +2 -2
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/session.py +15 -3
- datachain-0.11.0/src/datachain/script_meta.py +147 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/PKG-INFO +5 -4
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/SOURCES.txt +3 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/requires.txt +4 -1
- {datachain-0.9.1 → datachain-0.11.0}/tests/conftest.py +2 -2
- datachain-0.11.0/tests/func/test_cloud_transfer.py +68 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_datachain.py +37 -6
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_pull.py +1 -1
- datachain-0.11.0/tests/unit/test_script_meta.py +119 -0
- {datachain-0.9.1 → datachain-0.11.0}/.cruft.json +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.gitattributes +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.github/codecov.yaml +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.github/dependabot.yml +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.github/workflows/release.yml +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/.gitignore +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/LICENSE +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/contributing.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/examples.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/index.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/overrides/main.html +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/datachain.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/func.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/index.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/toolkit.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/torch.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/references/udf.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/docs/tutorials.md +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/mkdocs.yml +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/noxfile.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/setup.cfg +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/__main__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/asyn.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cache.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/local.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/config.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/dataset.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/error.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/array.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/base.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/func.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/path.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/random.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/string.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/window.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/job.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/listing.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/node.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/progress.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/py.typed +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/params.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/studio.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain/utils.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/data.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/examples/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_client.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_file.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_hf.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_listing.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_ls.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_query.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_session.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/test_atomicity.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/test_cli_studio.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/test_telemetry.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_video.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_client.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_config.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_func.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_query.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_session.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.9.1 → datachain-0.11.0}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.9.1
|
|
3
|
+
Version: 0.11.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -49,6 +49,7 @@ Requires-Dist: platformdirs
|
|
|
49
49
|
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
50
50
|
Requires-Dist: tabulate
|
|
51
51
|
Requires-Dist: websockets
|
|
52
|
+
Requires-Dist: tomli; python_version < "3.11"
|
|
52
53
|
Provides-Extra: docs
|
|
53
54
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
54
55
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
@@ -102,7 +103,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
|
|
|
102
103
|
Requires-Dist: defusedxml; extra == "examples"
|
|
103
104
|
Requires-Dist: accelerate; extra == "examples"
|
|
104
105
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
105
|
-
Requires-Dist: ultralytics==8.3.
|
|
106
|
+
Requires-Dist: ultralytics==8.3.78; extra == "examples"
|
|
106
107
|
Requires-Dist: open_clip_torch; extra == "examples"
|
|
107
108
|
|
|
108
109
|
================
|
|
@@ -175,7 +176,7 @@ high confidence scores.
|
|
|
175
176
|
|
|
176
177
|
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
177
178
|
& (Column("meta.inference.class_") == "cat"))
|
|
178
|
-
likely_cats.
|
|
179
|
+
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
179
180
|
|
|
180
181
|
|
|
181
182
|
Example: LLM based text-file evaluation
|
|
@@ -216,7 +217,7 @@ Python code:
|
|
|
216
217
|
)
|
|
217
218
|
|
|
218
219
|
successful_chain = chain.filter(Column("is_success") == True)
|
|
219
|
-
successful_chain.
|
|
220
|
+
successful_chain.to_storage("./output_mistral")
|
|
220
221
|
|
|
221
222
|
print(f"{successful_chain.count()} files were exported")
|
|
222
223
|
|
|
@@ -68,7 +68,7 @@ high confidence scores.
|
|
|
68
68
|
|
|
69
69
|
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
70
70
|
& (Column("meta.inference.class_") == "cat"))
|
|
71
|
-
likely_cats.
|
|
71
|
+
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
72
72
|
|
|
73
73
|
|
|
74
74
|
Example: LLM based text-file evaluation
|
|
@@ -109,7 +109,7 @@ Python code:
|
|
|
109
109
|
)
|
|
110
110
|
|
|
111
111
|
successful_chain = chain.filter(Column("is_success") == True)
|
|
112
|
-
successful_chain.
|
|
112
|
+
successful_chain.to_storage("./output_mistral")
|
|
113
113
|
|
|
114
114
|
print(f"{successful_chain.count()} files were exported")
|
|
115
115
|
|
|
@@ -47,7 +47,7 @@ annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
|
47
47
|
|
|
48
48
|
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
49
49
|
& (Column("meta.inference.class_") == "cat"))
|
|
50
|
-
likely_cats.
|
|
50
|
+
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
51
51
|
```
|
|
52
52
|
|
|
53
53
|
## Data curation with a local AI model
|
|
@@ -85,7 +85,7 @@ chain = (
|
|
|
85
85
|
)
|
|
86
86
|
|
|
87
87
|
positive_chain = chain.filter(Column("is_positive") == True)
|
|
88
|
-
positive_chain.
|
|
88
|
+
positive_chain.to_storage("./output")
|
|
89
89
|
|
|
90
90
|
print(f"{positive_chain.count()} files were exported")
|
|
91
91
|
```
|
|
@@ -51,7 +51,8 @@ dependencies = [
|
|
|
51
51
|
"platformdirs",
|
|
52
52
|
"dvc-studio-client>=0.21,<1",
|
|
53
53
|
"tabulate",
|
|
54
|
-
"websockets"
|
|
54
|
+
"websockets",
|
|
55
|
+
"tomli;python_version<'3.11'"
|
|
55
56
|
]
|
|
56
57
|
|
|
57
58
|
[project.optional-dependencies]
|
|
@@ -118,7 +119,7 @@ examples = [
|
|
|
118
119
|
"defusedxml",
|
|
119
120
|
"accelerate",
|
|
120
121
|
"huggingface_hub[hf_transfer]",
|
|
121
|
-
"ultralytics==8.3.
|
|
122
|
+
"ultralytics==8.3.78",
|
|
122
123
|
"open_clip_torch"
|
|
123
124
|
]
|
|
124
125
|
|
|
@@ -390,7 +390,7 @@ class Client(ABC):
|
|
|
390
390
|
) # type: ignore[return-value]
|
|
391
391
|
|
|
392
392
|
def upload(self, data: bytes, path: str) -> "File":
|
|
393
|
-
full_path = self.get_full_path(path)
|
|
393
|
+
full_path = path if path.startswith(self.PREFIX) else self.get_full_path(path)
|
|
394
394
|
|
|
395
395
|
parent = posixpath.dirname(full_path)
|
|
396
396
|
self.fs.makedirs(parent, exist_ok=True)
|
|
@@ -411,6 +411,7 @@ class DataChain:
|
|
|
411
411
|
object_name: str = "file",
|
|
412
412
|
update: bool = False,
|
|
413
413
|
anon: bool = False,
|
|
414
|
+
client_config: Optional[dict] = None,
|
|
414
415
|
) -> "Self":
|
|
415
416
|
"""Get data from a storage as a list of file with all file attributes.
|
|
416
417
|
It returns the chain itself as usual.
|
|
@@ -423,15 +424,32 @@ class DataChain:
|
|
|
423
424
|
object_name : Created object column name.
|
|
424
425
|
update : force storage reindexing. Default is False.
|
|
425
426
|
anon : If True, we will treat cloud bucket as public one
|
|
427
|
+
client_config : Optional client configuration for the storage client.
|
|
426
428
|
|
|
427
429
|
Example:
|
|
430
|
+
Simple call from s3
|
|
428
431
|
```py
|
|
429
432
|
chain = DataChain.from_storage("s3://my-bucket/my-dir")
|
|
430
433
|
```
|
|
434
|
+
|
|
435
|
+
With AWS S3-compatible storage
|
|
436
|
+
```py
|
|
437
|
+
chain = DataChain.from_storage(
|
|
438
|
+
"s3://my-bucket/my-dir",
|
|
439
|
+
client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
|
|
440
|
+
)
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
Pass existing session
|
|
444
|
+
```py
|
|
445
|
+
session = Session.get()
|
|
446
|
+
chain = DataChain.from_storage("s3://my-bucket/my-dir", session=session)
|
|
447
|
+
```
|
|
431
448
|
"""
|
|
432
449
|
file_type = get_file_type(type)
|
|
433
450
|
|
|
434
|
-
|
|
451
|
+
if anon:
|
|
452
|
+
client_config = (client_config or {}) | {"anon": True}
|
|
435
453
|
session = Session.get(session, client_config=client_config, in_memory=in_memory)
|
|
436
454
|
cache = session.catalog.cache
|
|
437
455
|
client_config = session.catalog.client_config
|
|
@@ -481,25 +499,56 @@ class DataChain:
|
|
|
481
499
|
version: Optional[int] = None,
|
|
482
500
|
session: Optional[Session] = None,
|
|
483
501
|
settings: Optional[dict] = None,
|
|
484
|
-
|
|
502
|
+
fallback_to_studio: bool = True,
|
|
485
503
|
) -> "Self":
|
|
486
504
|
"""Get data from a saved Dataset. It returns the chain itself.
|
|
505
|
+
If dataset or version is not found locally, it will try to pull it from Studio.
|
|
487
506
|
|
|
488
507
|
Parameters:
|
|
489
508
|
name : dataset name
|
|
490
509
|
version : dataset version
|
|
510
|
+
session : Session to use for the chain.
|
|
511
|
+
settings : Settings to use for the chain.
|
|
512
|
+
fallback_to_studio : Try to pull dataset from Studio if not found locally.
|
|
513
|
+
Default is True.
|
|
491
514
|
|
|
492
515
|
Example:
|
|
493
516
|
```py
|
|
494
517
|
chain = DataChain.from_dataset("my_cats")
|
|
495
518
|
```
|
|
519
|
+
|
|
520
|
+
```py
|
|
521
|
+
chain = DataChain.from_dataset("my_cats", fallback_to_studio=False)
|
|
522
|
+
```
|
|
523
|
+
|
|
524
|
+
```py
|
|
525
|
+
chain = DataChain.from_dataset("my_cats", version=1)
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
```py
|
|
529
|
+
session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
|
|
530
|
+
settings = {
|
|
531
|
+
"cache": True,
|
|
532
|
+
"parallel": 4,
|
|
533
|
+
"workers": 4,
|
|
534
|
+
"min_task_size": 1000,
|
|
535
|
+
"prefetch": 10,
|
|
536
|
+
}
|
|
537
|
+
chain = DataChain.from_dataset(
|
|
538
|
+
name="my_cats",
|
|
539
|
+
version=1,
|
|
540
|
+
session=session,
|
|
541
|
+
settings=settings,
|
|
542
|
+
fallback_to_studio=True,
|
|
543
|
+
)
|
|
544
|
+
```
|
|
496
545
|
"""
|
|
497
546
|
query = DatasetQuery(
|
|
498
547
|
name=name,
|
|
499
548
|
version=version,
|
|
500
549
|
session=session,
|
|
501
550
|
indexing_column_types=File._datachain_column_types,
|
|
502
|
-
|
|
551
|
+
fallback_to_studio=fallback_to_studio,
|
|
503
552
|
)
|
|
504
553
|
telemetry.send_event_once("class", "datachain_init", name=name, version=version)
|
|
505
554
|
if settings:
|
|
@@ -2444,7 +2493,7 @@ class DataChain:
|
|
|
2444
2493
|
self._setup = self._setup | kwargs
|
|
2445
2494
|
return self
|
|
2446
2495
|
|
|
2447
|
-
def
|
|
2496
|
+
def to_storage(
|
|
2448
2497
|
self,
|
|
2449
2498
|
output: str,
|
|
2450
2499
|
signal: str = "file",
|
|
@@ -2462,6 +2511,13 @@ class DataChain:
|
|
|
2462
2511
|
use_cache: If `True`, cache the files before exporting.
|
|
2463
2512
|
link_type: Method to use for exporting files.
|
|
2464
2513
|
Falls back to `'copy'` if symlinking fails.
|
|
2514
|
+
|
|
2515
|
+
Example:
|
|
2516
|
+
Cross cloud transfer
|
|
2517
|
+
```py
|
|
2518
|
+
ds = DataChain.from_storage("s3://mybucket")
|
|
2519
|
+
ds.to_storage("gs://mybucket", placement="filename")
|
|
2520
|
+
```
|
|
2465
2521
|
"""
|
|
2466
2522
|
if placement == "filename" and (
|
|
2467
2523
|
self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
|
|
@@ -17,6 +17,7 @@ from urllib.parse import unquote, urlparse
|
|
|
17
17
|
from urllib.request import url2pathname
|
|
18
18
|
|
|
19
19
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
20
|
+
from fsspec.utils import stringify_path
|
|
20
21
|
from PIL import Image as PilImage
|
|
21
22
|
from pydantic import Field, field_validator
|
|
22
23
|
|
|
@@ -270,8 +271,13 @@ class File(DataModel):
|
|
|
270
271
|
|
|
271
272
|
def save(self, destination: str):
|
|
272
273
|
"""Writes it's content to destination"""
|
|
273
|
-
|
|
274
|
-
|
|
274
|
+
destination = stringify_path(destination)
|
|
275
|
+
client: Client = self._catalog.get_client(destination)
|
|
276
|
+
|
|
277
|
+
if client.PREFIX == "file://" and not destination.startswith(client.PREFIX):
|
|
278
|
+
destination = Path(destination).absolute().as_uri()
|
|
279
|
+
|
|
280
|
+
client.upload(self.read(), destination)
|
|
275
281
|
|
|
276
282
|
def _symlink_to(self, destination: str):
|
|
277
283
|
if self.location:
|
|
@@ -285,6 +291,7 @@ class File(DataModel):
|
|
|
285
291
|
source = self.get_path()
|
|
286
292
|
else:
|
|
287
293
|
raise OSError(errno.EXDEV, "can't link across filesystems")
|
|
294
|
+
|
|
288
295
|
return os.symlink(source, destination)
|
|
289
296
|
|
|
290
297
|
def export(
|
|
@@ -299,7 +306,8 @@ class File(DataModel):
|
|
|
299
306
|
self._caching_enabled = use_cache
|
|
300
307
|
dst = self.get_destination_path(output, placement)
|
|
301
308
|
dst_dir = os.path.dirname(dst)
|
|
302
|
-
|
|
309
|
+
client: Client = self._catalog.get_client(dst_dir)
|
|
310
|
+
client.fs.makedirs(dst_dir, exist_ok=True)
|
|
303
311
|
|
|
304
312
|
if link_type == "symlink":
|
|
305
313
|
try:
|
|
@@ -496,7 +504,10 @@ class TextFile(File):
|
|
|
496
504
|
|
|
497
505
|
def save(self, destination: str):
|
|
498
506
|
"""Writes it's content to destination"""
|
|
499
|
-
|
|
507
|
+
destination = stringify_path(destination)
|
|
508
|
+
|
|
509
|
+
client: Client = self._catalog.get_client(destination)
|
|
510
|
+
with client.fs.open(destination, mode="w") as f:
|
|
500
511
|
f.write(self.read_text())
|
|
501
512
|
|
|
502
513
|
|
|
@@ -510,7 +521,11 @@ class ImageFile(File):
|
|
|
510
521
|
|
|
511
522
|
def save(self, destination: str):
|
|
512
523
|
"""Writes it's content to destination"""
|
|
513
|
-
|
|
524
|
+
destination = stringify_path(destination)
|
|
525
|
+
|
|
526
|
+
client: Client = self._catalog.get_client(destination)
|
|
527
|
+
with client.fs.open(destination, mode="wb") as f:
|
|
528
|
+
self.read().save(f)
|
|
514
529
|
|
|
515
530
|
|
|
516
531
|
class Image(DataModel):
|
|
@@ -1085,7 +1085,7 @@ class DatasetQuery:
|
|
|
1085
1085
|
session: Optional[Session] = None,
|
|
1086
1086
|
indexing_column_types: Optional[dict[str, Any]] = None,
|
|
1087
1087
|
in_memory: bool = False,
|
|
1088
|
-
|
|
1088
|
+
fallback_to_studio: bool = True,
|
|
1089
1089
|
) -> None:
|
|
1090
1090
|
self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
|
|
1091
1091
|
self.catalog = catalog or self.session.catalog
|
|
@@ -1103,7 +1103,7 @@ class DatasetQuery:
|
|
|
1103
1103
|
|
|
1104
1104
|
self.name = name
|
|
1105
1105
|
|
|
1106
|
-
if
|
|
1106
|
+
if fallback_to_studio and is_token_set():
|
|
1107
1107
|
ds = self.catalog.get_dataset_with_remote_fallback(name, version)
|
|
1108
1108
|
else:
|
|
1109
1109
|
ds = self.catalog.get_dataset(name)
|
|
@@ -139,21 +139,33 @@ class Session:
|
|
|
139
139
|
|
|
140
140
|
# Access the active (most recent) context from the stack
|
|
141
141
|
if cls.SESSION_CONTEXTS:
|
|
142
|
-
|
|
142
|
+
session = cls.SESSION_CONTEXTS[-1]
|
|
143
143
|
|
|
144
|
-
|
|
144
|
+
elif cls.GLOBAL_SESSION_CTX is None:
|
|
145
145
|
cls.GLOBAL_SESSION_CTX = Session(
|
|
146
146
|
cls.GLOBAL_SESSION_NAME,
|
|
147
147
|
catalog,
|
|
148
148
|
client_config=client_config,
|
|
149
149
|
in_memory=in_memory,
|
|
150
150
|
)
|
|
151
|
+
session = cls.GLOBAL_SESSION_CTX
|
|
151
152
|
|
|
152
153
|
atexit.register(cls._global_cleanup)
|
|
153
154
|
cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
|
|
154
155
|
sys.excepthook = cls.except_hook
|
|
156
|
+
else:
|
|
157
|
+
session = cls.GLOBAL_SESSION_CTX
|
|
155
158
|
|
|
156
|
-
|
|
159
|
+
if client_config and session.catalog.client_config != client_config:
|
|
160
|
+
session = Session(
|
|
161
|
+
"session" + uuid4().hex[:4],
|
|
162
|
+
catalog,
|
|
163
|
+
client_config=client_config,
|
|
164
|
+
in_memory=in_memory,
|
|
165
|
+
)
|
|
166
|
+
session.__enter__()
|
|
167
|
+
|
|
168
|
+
return session
|
|
157
169
|
|
|
158
170
|
@staticmethod
|
|
159
171
|
def except_hook(exc_type, exc_value, exc_traceback):
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any, Optional
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
import tomllib
|
|
7
|
+
except ModuleNotFoundError:
|
|
8
|
+
# tomllib is in standard library from python 3.11 so for earlier versions
|
|
9
|
+
# we need tomli
|
|
10
|
+
import tomli as tomllib # type: ignore[no-redef]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ScriptConfigParsingError(Exception):
|
|
14
|
+
def __init__(self, message):
|
|
15
|
+
super().__init__(message)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
|
|
19
|
+
class ScriptConfig:
|
|
20
|
+
"""
|
|
21
|
+
Class that is parsing inline script metadata to get some basic information for
|
|
22
|
+
running datachain script like python version, dependencies, attachments etc.
|
|
23
|
+
Inline script metadata must follow the format described in https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata.
|
|
24
|
+
Example of script with inline metadata:
|
|
25
|
+
# /// script
|
|
26
|
+
# requires-python = ">=3.12"
|
|
27
|
+
#
|
|
28
|
+
# dependencies = [
|
|
29
|
+
# "pandas < 2.1.0",
|
|
30
|
+
# "numpy == 1.26.4"
|
|
31
|
+
# ]
|
|
32
|
+
#
|
|
33
|
+
# [tools.datachain.workers]
|
|
34
|
+
# num_workers = 3
|
|
35
|
+
#
|
|
36
|
+
# [tools.datachain.attachments]
|
|
37
|
+
# image1 = "s3://ldb-public/image1.jpg"
|
|
38
|
+
# file1 = "s3://ldb-public/file.pdf"
|
|
39
|
+
#
|
|
40
|
+
# [tools.datachain.params]
|
|
41
|
+
# min_length_sec = 1
|
|
42
|
+
# cache = false
|
|
43
|
+
#
|
|
44
|
+
# [tools.datachain.inputs]
|
|
45
|
+
# threshold = 0.5
|
|
46
|
+
# start_ds_name = "ds://start"
|
|
47
|
+
#
|
|
48
|
+
# [tools.datachain.outputs]
|
|
49
|
+
# result_dataset = "ds://res"
|
|
50
|
+
# result_dir = "/temp"
|
|
51
|
+
#
|
|
52
|
+
# ///
|
|
53
|
+
|
|
54
|
+
import sys
|
|
55
|
+
import pandas as pd
|
|
56
|
+
|
|
57
|
+
print(f"Python version: {sys.version_info}")
|
|
58
|
+
print(f"Pandas version: {pd.__version__}")
|
|
59
|
+
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
python_version: Optional[str]
|
|
63
|
+
dependencies: list[str]
|
|
64
|
+
attachments: dict[str, str]
|
|
65
|
+
params: dict[str, Any]
|
|
66
|
+
inputs: dict[str, Any]
|
|
67
|
+
outputs: dict[str, Any]
|
|
68
|
+
num_workers: Optional[int] = None
|
|
69
|
+
|
|
70
|
+
def __init__(
|
|
71
|
+
self,
|
|
72
|
+
python_version: Optional[str] = None,
|
|
73
|
+
dependencies: Optional[list[str]] = None,
|
|
74
|
+
attachments: Optional[dict[str, str]] = None,
|
|
75
|
+
params: Optional[dict[str, Any]] = None,
|
|
76
|
+
inputs: Optional[dict[str, Any]] = None,
|
|
77
|
+
outputs: Optional[dict[str, Any]] = None,
|
|
78
|
+
num_workers: Optional[int] = None,
|
|
79
|
+
):
|
|
80
|
+
self.python_version = python_version
|
|
81
|
+
self.dependencies = dependencies or []
|
|
82
|
+
self.attachments = attachments or {}
|
|
83
|
+
self.params = params or {}
|
|
84
|
+
self.inputs = inputs or {}
|
|
85
|
+
self.outputs = outputs or {}
|
|
86
|
+
self.num_workers = num_workers
|
|
87
|
+
|
|
88
|
+
def get_param(self, name: str, default: Any) -> Any:
|
|
89
|
+
return self.params.get(name, default)
|
|
90
|
+
|
|
91
|
+
def get_input(self, name: str, default: Any) -> Any:
|
|
92
|
+
return self.inputs.get(name, default)
|
|
93
|
+
|
|
94
|
+
def get_output(self, name: str, default: Any) -> Any:
|
|
95
|
+
return self.outputs.get(name, default)
|
|
96
|
+
|
|
97
|
+
def get_attachment(self, name: str, default: Any) -> Any:
|
|
98
|
+
return self.attachments.get(name, default)
|
|
99
|
+
|
|
100
|
+
@staticmethod
|
|
101
|
+
def read(script: str) -> Optional[dict]:
|
|
102
|
+
"""Converts inline script metadata to dict with all found data"""
|
|
103
|
+
regex = (
|
|
104
|
+
r"(?m)^# \/\/\/ (?P<type>[a-zA-Z0-9-]+)[ \t]*$[\r\n|\r|\n]"
|
|
105
|
+
"(?P<content>(?:^#(?:| .*)$[\r\n|\r|\n])+)^# \\/\\/\\/[ \t]*$"
|
|
106
|
+
)
|
|
107
|
+
name = "script"
|
|
108
|
+
matches = list(
|
|
109
|
+
filter(lambda m: m.group("type") == name, re.finditer(regex, script))
|
|
110
|
+
)
|
|
111
|
+
if len(matches) > 1:
|
|
112
|
+
raise ValueError(f"Multiple {name} blocks found")
|
|
113
|
+
if len(matches) == 1:
|
|
114
|
+
content = "".join(
|
|
115
|
+
line[2:] if line.startswith("# ") else line[1:]
|
|
116
|
+
for line in matches[0].group("content").splitlines(keepends=True)
|
|
117
|
+
)
|
|
118
|
+
return tomllib.loads(content)
|
|
119
|
+
return None
|
|
120
|
+
|
|
121
|
+
@staticmethod
|
|
122
|
+
def parse(script: str) -> Optional["ScriptConfig"]:
|
|
123
|
+
"""
|
|
124
|
+
Method that is parsing inline script metadata from datachain script and
|
|
125
|
+
instantiating ScriptConfig class with found data. If no inline metadata is
|
|
126
|
+
found, it returns None
|
|
127
|
+
"""
|
|
128
|
+
try:
|
|
129
|
+
meta = ScriptConfig.read(script)
|
|
130
|
+
if not meta:
|
|
131
|
+
return None
|
|
132
|
+
custom = meta.get("tools", {}).get("datachain", {})
|
|
133
|
+
return ScriptConfig(
|
|
134
|
+
python_version=meta.get("requires-python"),
|
|
135
|
+
dependencies=meta.get("dependencies"),
|
|
136
|
+
num_workers=custom.get("workers", {}).get("num_workers"),
|
|
137
|
+
attachments=custom.get("attachments"),
|
|
138
|
+
params={k: str(v) for k, v in custom.get("params").items()}
|
|
139
|
+
if custom.get("params")
|
|
140
|
+
else None,
|
|
141
|
+
inputs=custom.get("inputs"),
|
|
142
|
+
outputs=custom.get("outputs"),
|
|
143
|
+
)
|
|
144
|
+
except Exception as e:
|
|
145
|
+
raise ScriptConfigParsingError(
|
|
146
|
+
f"Error when parsing script meta: {e}"
|
|
147
|
+
) from e
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.11.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -49,6 +49,7 @@ Requires-Dist: platformdirs
|
|
|
49
49
|
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
50
50
|
Requires-Dist: tabulate
|
|
51
51
|
Requires-Dist: websockets
|
|
52
|
+
Requires-Dist: tomli; python_version < "3.11"
|
|
52
53
|
Provides-Extra: docs
|
|
53
54
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
54
55
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
@@ -102,7 +103,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
|
|
|
102
103
|
Requires-Dist: defusedxml; extra == "examples"
|
|
103
104
|
Requires-Dist: accelerate; extra == "examples"
|
|
104
105
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
105
|
-
Requires-Dist: ultralytics==8.3.
|
|
106
|
+
Requires-Dist: ultralytics==8.3.78; extra == "examples"
|
|
106
107
|
Requires-Dist: open_clip_torch; extra == "examples"
|
|
107
108
|
|
|
108
109
|
================
|
|
@@ -175,7 +176,7 @@ high confidence scores.
|
|
|
175
176
|
|
|
176
177
|
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
177
178
|
& (Column("meta.inference.class_") == "cat"))
|
|
178
|
-
likely_cats.
|
|
179
|
+
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
179
180
|
|
|
180
181
|
|
|
181
182
|
Example: LLM based text-file evaluation
|
|
@@ -216,7 +217,7 @@ Python code:
|
|
|
216
217
|
)
|
|
217
218
|
|
|
218
219
|
successful_chain = chain.filter(Column("is_success") == True)
|
|
219
|
-
successful_chain.
|
|
220
|
+
successful_chain.to_storage("./output_mistral")
|
|
220
221
|
|
|
221
222
|
print(f"{successful_chain.count()} files were exported")
|
|
222
223
|
|
|
@@ -77,6 +77,7 @@ src/datachain/nodes_fetcher.py
|
|
|
77
77
|
src/datachain/nodes_thread_pool.py
|
|
78
78
|
src/datachain/progress.py
|
|
79
79
|
src/datachain/py.typed
|
|
80
|
+
src/datachain/script_meta.py
|
|
80
81
|
src/datachain/studio.py
|
|
81
82
|
src/datachain/telemetry.py
|
|
82
83
|
src/datachain/utils.py
|
|
@@ -232,6 +233,7 @@ tests/func/__init__.py
|
|
|
232
233
|
tests/func/fake-service-account-credentials.json
|
|
233
234
|
tests/func/test_catalog.py
|
|
234
235
|
tests/func/test_client.py
|
|
236
|
+
tests/func/test_cloud_transfer.py
|
|
235
237
|
tests/func/test_data_storage.py
|
|
236
238
|
tests/func/test_datachain.py
|
|
237
239
|
tests/func/test_datachain_merge.py
|
|
@@ -278,6 +280,7 @@ tests/unit/test_pytorch.py
|
|
|
278
280
|
tests/unit/test_query.py
|
|
279
281
|
tests/unit/test_query_metrics.py
|
|
280
282
|
tests/unit/test_query_params.py
|
|
283
|
+
tests/unit/test_script_meta.py
|
|
281
284
|
tests/unit/test_serializer.py
|
|
282
285
|
tests/unit/test_session.py
|
|
283
286
|
tests/unit/test_utils.py
|
|
@@ -32,6 +32,9 @@ dvc-studio-client<1,>=0.21
|
|
|
32
32
|
tabulate
|
|
33
33
|
websockets
|
|
34
34
|
|
|
35
|
+
[:python_version < "3.11"]
|
|
36
|
+
tomli
|
|
37
|
+
|
|
35
38
|
[dev]
|
|
36
39
|
datachain[docs,tests]
|
|
37
40
|
mypy==1.15.0
|
|
@@ -55,7 +58,7 @@ datachain[tests]
|
|
|
55
58
|
defusedxml
|
|
56
59
|
accelerate
|
|
57
60
|
huggingface_hub[hf_transfer]
|
|
58
|
-
ultralytics==8.3.
|
|
61
|
+
ultralytics==8.3.78
|
|
59
62
|
open_clip_torch
|
|
60
63
|
|
|
61
64
|
[hf]
|
|
@@ -472,9 +472,9 @@ def cloud_server_credentials(cloud_server, monkeypatch):
|
|
|
472
472
|
|
|
473
473
|
def get_cloud_test_catalog(cloud_server, tmp_path, metastore, warehouse):
|
|
474
474
|
cache_dir = tmp_path / ".datachain" / "cache"
|
|
475
|
-
cache_dir.mkdir(parents=True)
|
|
475
|
+
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
476
476
|
tmpfile_dir = tmp_path / ".datachain" / "tmp"
|
|
477
|
-
tmpfile_dir.mkdir()
|
|
477
|
+
tmpfile_dir.mkdir(exist_ok=True)
|
|
478
478
|
|
|
479
479
|
catalog = Catalog(
|
|
480
480
|
metastore=metastore,
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from datachain import Session
|
|
4
|
+
from datachain.lib.dc import DataChain
|
|
5
|
+
from tests.conftest import get_cloud_test_catalog, make_cloud_server
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_cross_cloud_transfer(
|
|
9
|
+
request,
|
|
10
|
+
tmp_upath_factory,
|
|
11
|
+
tree,
|
|
12
|
+
tmp_path,
|
|
13
|
+
metastore,
|
|
14
|
+
warehouse,
|
|
15
|
+
):
|
|
16
|
+
disabled_remotes = request.config.getoption("--disable-remotes") or []
|
|
17
|
+
|
|
18
|
+
if any(remote in disabled_remotes for remote in ["azure", "gs", "all"]):
|
|
19
|
+
pytest.skip("Skipping all tests for azure, gs or all remotes")
|
|
20
|
+
|
|
21
|
+
azure_path = tmp_upath_factory.mktemp("azure", version_aware=False)
|
|
22
|
+
azure_server = make_cloud_server(azure_path, "azure", tree)
|
|
23
|
+
|
|
24
|
+
gcloud_path = tmp_upath_factory.mktemp("gs", version_aware=False)
|
|
25
|
+
gcloud_server = make_cloud_server(gcloud_path, "gs", tree)
|
|
26
|
+
|
|
27
|
+
# Initialize cloud catalogs
|
|
28
|
+
azure_catalog = get_cloud_test_catalog(azure_server, tmp_path, metastore, warehouse)
|
|
29
|
+
gcloud_catalog = get_cloud_test_catalog(
|
|
30
|
+
gcloud_server, tmp_path, metastore, warehouse
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Define test file paths
|
|
34
|
+
test_filename = "image_1.jpg"
|
|
35
|
+
test_content = b"bytes"
|
|
36
|
+
|
|
37
|
+
source_dir = f"{azure_catalog.src_uri}/source-test-images"
|
|
38
|
+
source_file = f"{source_dir}/{test_filename}"
|
|
39
|
+
|
|
40
|
+
dest_dir = f"{gcloud_catalog.src_uri}/destination-test-images"
|
|
41
|
+
dest_file = f"{dest_dir}/{test_filename}"
|
|
42
|
+
|
|
43
|
+
# Get cloud clients
|
|
44
|
+
azure_client = azure_catalog.catalog.get_client(source_file)
|
|
45
|
+
gcloud_client = gcloud_catalog.catalog.get_client(dest_file)
|
|
46
|
+
|
|
47
|
+
try:
|
|
48
|
+
# Create test file in Azure
|
|
49
|
+
with azure_client.fs.open(source_file, "wb") as f:
|
|
50
|
+
f.write(test_content)
|
|
51
|
+
|
|
52
|
+
# Perform cross-cloud transfer
|
|
53
|
+
combined_config = azure_server.client_config | gcloud_server.client_config
|
|
54
|
+
with Session("testSession", client_config=combined_config):
|
|
55
|
+
datachain = DataChain.from_storage(source_dir)
|
|
56
|
+
datachain.to_storage(dest_dir, placement="filename")
|
|
57
|
+
|
|
58
|
+
# Verify transfer
|
|
59
|
+
with gcloud_client.fs.open(dest_file, "rb") as f:
|
|
60
|
+
assert f.read() == test_content
|
|
61
|
+
|
|
62
|
+
finally:
|
|
63
|
+
# Cleanup
|
|
64
|
+
try:
|
|
65
|
+
azure_client.fs.rm(source_dir, recursive=True)
|
|
66
|
+
gcloud_client.fs.rm(dest_dir, recursive=True)
|
|
67
|
+
except FileNotFoundError:
|
|
68
|
+
pass
|