datachain 0.14.0__tar.gz → 0.14.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.14.0/src/datachain.egg-info → datachain-0.14.2}/PKG-INFO +6 -6
- {datachain-0.14.0 → datachain-0.14.2}/README.rst +3 -3
- {datachain-0.14.0 → datachain-0.14.2}/docs/examples.md +20 -20
- {datachain-0.14.0 → datachain-0.14.2}/docs/quick-start.md +9 -9
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/data-types/file.md +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/data-types/imagefile.md +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/data-types/textfile.md +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/data-types/videofile.md +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/datachain.md +9 -9
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/remotes.md +5 -4
- {datachain-0.14.0 → datachain-0.14.2}/examples/computer_vision/iptc_exif_xmp_lib.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/computer_vision/llava2_image_desc_lib.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/computer_vision/openimage-detect.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/computer_vision/ultralytics-bbox.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/computer_vision/ultralytics-pose.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/computer_vision/ultralytics-segment.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/get_started/common_sql_functions.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/get_started/json-csv-reader.py +7 -7
- {datachain-0.14.0 → datachain-0.14.2}/examples/get_started/torch-loader.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/get_started/udfs/parallel.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/get_started/udfs/simple.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/get_started/udfs/stateful.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/llm_and_nlp/claude-query.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/examples/multimodal/clip_inference.py +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/examples/multimodal/hf_pipeline.py +4 -4
- {datachain-0.14.0 → datachain-0.14.2}/examples/multimodal/openai_image_desc_lib.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/examples/multimodal/wds.py +3 -3
- {datachain-0.14.0 → datachain-0.14.2}/examples/multimodal/wds_filtered.py +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/pyproject.toml +4 -3
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/__init__.py +18 -18
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/catalog/catalog.py +6 -6
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/commands/show.py +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/client/fsspec.py +3 -3
- datachain-0.14.2/src/datachain/lib/dc/__init__.py +32 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dc/csv.py +5 -5
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dc/datachain.py +42 -42
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dc/datasets.py +7 -7
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dc/hf.py +5 -5
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dc/json.py +6 -6
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dc/listings.py +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dc/pandas.py +4 -4
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dc/parquet.py +5 -5
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dc/records.py +4 -4
- datachain-0.14.2/src/datachain/lib/dc/storage.py +171 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dc/values.py +4 -4
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/listing.py +11 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/meta_formats.py +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/pytorch.py +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/udf.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/query/dataset.py +52 -16
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/toolkit/split.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2/src/datachain.egg-info}/PKG-INFO +6 -6
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/benchmarks/test_datachain.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/benchmarks/test_ls.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/conftest.py +3 -3
- {datachain-0.14.0 → datachain-0.14.2}/tests/examples/test_wds_e2e.py +3 -3
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_catalog.py +4 -4
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_cloud_transfer.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_datachain.py +202 -108
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_datachain_merge.py +4 -4
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_datasets.py +6 -6
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_feature_pickling.py +5 -5
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_file.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_hidden_field.py +3 -3
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_listing.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_ls.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_pull.py +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_pytorch.py +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_query.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_session.py +2 -2
- {datachain-0.14.0 → datachain-0.14.2}/tests/scripts/feature_class.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/scripts/feature_class_exception.py +5 -5
- {datachain-0.14.0 → datachain-0.14.2}/tests/scripts/feature_class_parallel.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/scripts/feature_class_parallel_data_model.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/scripts/name_len_slow.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/test_import_time.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/test_telemetry.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_arrow.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_datachain.py +225 -225
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_datachain_bootstrap.py +4 -4
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_datachain_merge.py +17 -17
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_diff.py +36 -36
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_feature_utils.py +9 -9
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_schema.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/sql/test_selectable.py +1 -1
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_func.py +4 -4
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_listing.py +3 -3
- {datachain-0.14.0 → datachain-0.14.2}/tests/utils.py +1 -1
- datachain-0.14.0/src/datachain/lib/dc/__init__.py +0 -32
- datachain-0.14.0/src/datachain/lib/dc/storage.py +0 -118
- {datachain-0.14.0 → datachain-0.14.2}/.cruft.json +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.gitattributes +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.github/codecov.yaml +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.github/dependabot.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.github/workflows/release.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.github/workflows/tests.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.gitignore +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/.pre-commit-config.yaml +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/LICENSE +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/assets/datachain.svg +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/contributing.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/index.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/overrides/main.html +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/data-types/index.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/data-types/pose.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/data-types/segment.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/func.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/index.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/toolkit.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/torch.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/references/udf.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/docs/tutorials.md +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/mkdocs.yml +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/noxfile.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/setup.cfg +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/__main__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/asyn.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cache.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/cli/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/client/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/client/azure.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/client/gcs.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/client/hf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/client/local.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/client/s3.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/config.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/dataset.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/error.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/fs/reference.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/fs/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/func/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/func/array.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/func/base.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/func/conditional.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/func/func.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/func/numeric.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/func/path.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/func/random.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/func/string.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/func/window.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/job.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/clip.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/file.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/hf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/image.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/settings.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/tar.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/text.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/video.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/listing.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/model/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/model/bbox.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/model/pose.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/model/segment.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/model/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/node.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/progress.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/py.typed +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/query/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/query/batch.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/query/metrics.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/query/params.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/query/queue.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/query/schema.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/query/session.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/query/udf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/query/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/remote/studio.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/script_meta.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/types.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/sql/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/studio.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/telemetry.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain/utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/data.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/examples/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/examples/test_examples.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/examples/wds_data.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/data/lena.jpg +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/model/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_client.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_data_storage.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_hf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_image.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_metrics.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_toolkit.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_video.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/func/test_warehouse.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/test_atomicity.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/test_cli_e2e.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/test_cli_studio.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/test_query_e2e.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/model/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_asyn.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_cache.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_catalog.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_client.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_config.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_dataset.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_metastore.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_query.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_query_params.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_serializer.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_session.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_utils.py +0 -0
- {datachain-0.14.0 → datachain-0.14.2}/tests/unit/test_warehouse.py +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.14.0
|
|
3
|
+
Version: 0.14.2
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
|
-
License: Apache-2.0
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
7
|
Project-URL: Documentation, https://datachain.dvc.ai
|
|
8
8
|
Project-URL: Issues, https://github.com/iterative/datachain/issues
|
|
9
9
|
Project-URL: Source, https://github.com/iterative/datachain
|
|
@@ -38,7 +38,7 @@ Requires-Dist: sqlalchemy>=2
|
|
|
38
38
|
Requires-Dist: multiprocess==0.70.16
|
|
39
39
|
Requires-Dist: cloudpickle
|
|
40
40
|
Requires-Dist: orjson>=3.10.5
|
|
41
|
-
Requires-Dist: pydantic<
|
|
41
|
+
Requires-Dist: pydantic<2.11,>=2
|
|
42
42
|
Requires-Dist: jmespath>=1.0
|
|
43
43
|
Requires-Dist: datamodel-code-generator>=0.25
|
|
44
44
|
Requires-Dist: Pillow<12,>=10.0.0
|
|
@@ -171,8 +171,8 @@ high confidence scores.
|
|
|
171
171
|
|
|
172
172
|
import datachain as dc
|
|
173
173
|
|
|
174
|
-
meta = dc.
|
|
175
|
-
images = dc.
|
|
174
|
+
meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
|
|
175
|
+
images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
|
|
176
176
|
|
|
177
177
|
images_id = images.map(id=lambda file: file.path.split('.')[-2])
|
|
178
178
|
annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
@@ -213,7 +213,7 @@ Python code:
|
|
|
213
213
|
return result.lower().startswith("success")
|
|
214
214
|
|
|
215
215
|
chain = (
|
|
216
|
-
dc.
|
|
216
|
+
dc.read_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
217
217
|
.settings(parallel=4, cache=True)
|
|
218
218
|
.map(is_success=eval_dialogue)
|
|
219
219
|
.save("mistral_files")
|
|
@@ -60,8 +60,8 @@ high confidence scores.
|
|
|
60
60
|
|
|
61
61
|
import datachain as dc
|
|
62
62
|
|
|
63
|
-
meta = dc.
|
|
64
|
-
images = dc.
|
|
63
|
+
meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
|
|
64
|
+
images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
|
|
65
65
|
|
|
66
66
|
images_id = images.map(id=lambda file: file.path.split('.')[-2])
|
|
67
67
|
annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
@@ -102,7 +102,7 @@ Python code:
|
|
|
102
102
|
return result.lower().startswith("success")
|
|
103
103
|
|
|
104
104
|
chain = (
|
|
105
|
-
dc.
|
|
105
|
+
dc.read_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
106
106
|
.settings(parallel=4, cache=True)
|
|
107
107
|
.map(is_success=eval_dialogue)
|
|
108
108
|
.save("mistral_files")
|
|
@@ -16,7 +16,7 @@ title: Examples
|
|
|
16
16
|
import datachain as dc # (1)!
|
|
17
17
|
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)!
|
|
18
18
|
|
|
19
|
-
images = dc.
|
|
19
|
+
images = dc.read_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
|
|
20
20
|
|
|
21
21
|
model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-mix-224")
|
|
22
22
|
processor = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
|
|
@@ -93,20 +93,20 @@ dc.DataModel.register(MistralModel)
|
|
|
93
93
|
|
|
94
94
|
chain = (
|
|
95
95
|
dc
|
|
96
|
-
.
|
|
96
|
+
.read_storage("gs://datachain-demo/chatbot-KiT/", type="text")
|
|
97
97
|
.filter(dc.Column("file.name").glob("*.txt"))
|
|
98
98
|
.limit(5)
|
|
99
99
|
.settings(parallel=4, cache=True)
|
|
100
100
|
.map(
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
101
|
+
mistral=lambda file: MistralClient(api_key=api_key).chat(
|
|
102
|
+
model="open-mixtral-8x22b",
|
|
103
|
+
response_format={"type": "json_object"},
|
|
104
|
+
messages=[
|
|
105
|
+
ChatMessage(role="system", content=f"{prompt}"),
|
|
106
|
+
ChatMessage(role="user", content=f"{file.read()}")
|
|
107
|
+
]
|
|
108
|
+
),
|
|
109
|
+
output=MistralModel
|
|
110
110
|
)
|
|
111
111
|
.save("dialog-rating")
|
|
112
112
|
)
|
|
@@ -145,7 +145,7 @@ The cost of 5 calls to Mixtral 8x22b : $0.0142
|
|
|
145
145
|
The “save” operation makes the chain's dataset persistent in the current (working) directory of the query. A hidden folder `.datachain/` holds the records. A persistent dataset can be accessed later to start a derivative chain:
|
|
146
146
|
|
|
147
147
|
```python
|
|
148
|
-
dc.
|
|
148
|
+
dc.read_dataset("rating").limit(2).save("dialog-rating")
|
|
149
149
|
```
|
|
150
150
|
|
|
151
151
|
Persistent datasets are immutable and automatically versioned. Here is how to access the dataset registry:
|
|
@@ -167,7 +167,7 @@ dialog-rating@v2
|
|
|
167
167
|
By default, when a saved dataset is loaded, the latest version is fetched but another version can be requested:
|
|
168
168
|
|
|
169
169
|
```python
|
|
170
|
-
ds = dc.
|
|
170
|
+
ds = dc.read_dataset("dialog-rating", version=1)
|
|
171
171
|
```
|
|
172
172
|
|
|
173
173
|
### Chain execution, optimization and parallelism
|
|
@@ -189,8 +189,8 @@ Here is an example of reading a simple CSV file where schema is heuristically de
|
|
|
189
189
|
```python
|
|
190
190
|
from datachain import DataChain
|
|
191
191
|
|
|
192
|
-
uri="gs://datachain-demo/chatbot-csv/"
|
|
193
|
-
csv_dataset = dc.
|
|
192
|
+
uri = "gs://datachain-demo/chatbot-csv/"
|
|
193
|
+
csv_dataset = dc.read_csv(uri)
|
|
194
194
|
|
|
195
195
|
print(csv_dataset.to_pandas())
|
|
196
196
|
```
|
|
@@ -233,12 +233,12 @@ However, Datachain can easily parse the entire COCO structure via several readin
|
|
|
233
233
|
```python
|
|
234
234
|
import datachain as dc
|
|
235
235
|
|
|
236
|
-
images_uri="gs://datachain-demo/coco2017/images/val/"
|
|
237
|
-
captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json"
|
|
236
|
+
images_uri = "gs://datachain-demo/coco2017/images/val/"
|
|
237
|
+
captions_uri = "gs://datachain-demo/coco2017/annotations/captions_val2017.json"
|
|
238
238
|
|
|
239
|
-
images = dc.
|
|
240
|
-
meta = dc.
|
|
241
|
-
captions = dc.
|
|
239
|
+
images = dc.read_storage(images_uri)
|
|
240
|
+
meta = dc.read_json(captions_uri, jmespath="images")
|
|
241
|
+
captions = dc.read_json(captions_uri, jmespath="annotations")
|
|
242
242
|
|
|
243
243
|
images_meta = images.merge(meta, on="file.name", right_on="images.file_name")
|
|
244
244
|
captioned_images = images_meta.merge(captions, on="images.id", right_on="annotations.image_id")
|
|
@@ -39,8 +39,8 @@ using JSON metadata:
|
|
|
39
39
|
``` py
|
|
40
40
|
import datachain as dc
|
|
41
41
|
|
|
42
|
-
meta = dc.
|
|
43
|
-
images = dc.
|
|
42
|
+
meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
|
|
43
|
+
images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
|
|
44
44
|
|
|
45
45
|
images_id = images.map(id=lambda file: file.path.split('.')[-2])
|
|
46
46
|
annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
@@ -77,7 +77,7 @@ def is_positive_dialogue_ending(file) -> bool:
|
|
|
77
77
|
return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
|
|
78
78
|
|
|
79
79
|
chain = (
|
|
80
|
-
dc.
|
|
80
|
+
dc.read_storage("gs://datachain-demo/chatbot-KiT/",
|
|
81
81
|
object_name="file", type="text", anon=True)
|
|
82
82
|
.settings(parallel=8, cache=True)
|
|
83
83
|
.map(is_positive=is_positive_dialogue_ending)
|
|
@@ -132,7 +132,7 @@ def eval_dialogue(file: dc.File) -> bool:
|
|
|
132
132
|
return result.lower().startswith("success")
|
|
133
133
|
|
|
134
134
|
chain = (
|
|
135
|
-
dc.
|
|
135
|
+
dc.read_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
136
136
|
.map(is_success=eval_dialogue)
|
|
137
137
|
.save("mistral_files")
|
|
138
138
|
)
|
|
@@ -177,7 +177,7 @@ def eval_dialog(file: dc.File) -> ChatCompletionResponse:
|
|
|
177
177
|
{"role": "user", "content": file.read()}])
|
|
178
178
|
|
|
179
179
|
chain = (
|
|
180
|
-
dc.
|
|
180
|
+
dc.read_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
181
181
|
.settings(parallel=4, cache=True)
|
|
182
182
|
.map(response=eval_dialog)
|
|
183
183
|
.map(status=lambda response: response.choices[0].message.content.lower()[:7])
|
|
@@ -210,14 +210,14 @@ name usage usage usage
|
|
|
210
210
|
|
|
211
211
|
In the previous examples, datasets were saved in the embedded database
|
|
212
212
|
(`SQLite` in folder `.datachain` of the working directory). These datasets were automatically versioned, and
|
|
213
|
-
can be accessed using `dc.
|
|
213
|
+
can be accessed using `dc.read_dataset("dataset_name")`.
|
|
214
214
|
|
|
215
215
|
Here is how to retrieve a saved dataset and iterate over the objects:
|
|
216
216
|
|
|
217
217
|
``` py
|
|
218
218
|
import datachain as dc
|
|
219
219
|
|
|
220
|
-
chain = dc.
|
|
220
|
+
chain = dc.read_dataset("response")
|
|
221
221
|
|
|
222
222
|
# Iterating one-by-one: support out-of-memory workflow
|
|
223
223
|
for file, response in chain.limit(5).collect("file", "response"):
|
|
@@ -248,7 +248,7 @@ output tokens:
|
|
|
248
248
|
|
|
249
249
|
``` py
|
|
250
250
|
import datachain as dc
|
|
251
|
-
chain = dc.
|
|
251
|
+
chain = dc.read_dataset("mistral_dataset")
|
|
252
252
|
|
|
253
253
|
cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
|
|
254
254
|
+ chain.sum("response.usage.completion_tokens")*0.000006
|
|
@@ -276,7 +276,7 @@ import datachain as dc
|
|
|
276
276
|
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
277
277
|
|
|
278
278
|
chain = (
|
|
279
|
-
dc.
|
|
279
|
+
dc.read_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
|
|
280
280
|
.map(label=lambda name: name.split(".")[0], params=["file.name"])
|
|
281
281
|
.select("file", "label").to_pytorch(
|
|
282
282
|
transform=processor.image_processor,
|
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
`File` is a special [`DataModel`](index.md#datachain.lib.data_model.DataModel),
|
|
4
4
|
which is automatically generated when a `DataChain` is created from files,
|
|
5
|
-
such as in [`dc.
|
|
5
|
+
such as in [`dc.read_storage`](../datachain.md#datachain.lib.dc.storage.read_storage):
|
|
6
6
|
|
|
7
7
|
```python
|
|
8
8
|
import datachain as dc
|
|
9
9
|
|
|
10
|
-
chain = dc.
|
|
10
|
+
chain = dc.read_storage("gs://datachain-demo/dogs-and-cats")
|
|
11
11
|
chain.print_schema()
|
|
12
12
|
```
|
|
13
13
|
|
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
`ImageFile` is inherited from [`File`](file.md) with additional methods for working with image files.
|
|
4
4
|
|
|
5
|
-
`ImageFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.storage.
|
|
5
|
+
`ImageFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.storage.read_storage), using `type="image"` param:
|
|
6
6
|
|
|
7
7
|
```python
|
|
8
8
|
import datachain as dc
|
|
9
9
|
|
|
10
|
-
chain = dc.
|
|
10
|
+
chain = dc.read_storage("s3://bucket-name/", type="image")
|
|
11
11
|
```
|
|
12
12
|
|
|
13
13
|
::: datachain.lib.file.ImageFile
|
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
`TextFile` is inherited from [`File`](file.md) with additional methods for working with text files.
|
|
4
4
|
|
|
5
|
-
`TextFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.storage.
|
|
5
|
+
`TextFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.storage.read_storage), using `type="text"` param:
|
|
6
6
|
|
|
7
7
|
```python
|
|
8
8
|
import datachain as dc
|
|
9
9
|
|
|
10
|
-
chain = dc.
|
|
10
|
+
chain = dc.read_storage("s3://bucket-name/", type="text")
|
|
11
11
|
```
|
|
12
12
|
|
|
13
13
|
::: datachain.lib.file.TextFile
|
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
`VideoFile` extends [`File`](file.md) and provides additional methods for working with video files.
|
|
4
4
|
|
|
5
|
-
`VideoFile` instances are created when a `DataChain` is initialized [from storage](../datachain.md#datachain.lib.dc.storage.
|
|
5
|
+
`VideoFile` instances are created when a `DataChain` is initialized [from storage](../datachain.md#datachain.lib.dc.storage.read_storage) with the `type="video"` parameter:
|
|
6
6
|
|
|
7
7
|
```python
|
|
8
8
|
import datachain as dc
|
|
9
9
|
|
|
10
|
-
chain = dc.
|
|
10
|
+
chain = dc.read_storage("s3://bucket-name/", type="video")
|
|
11
11
|
```
|
|
12
12
|
|
|
13
13
|
There are additional models for working with video files:
|
|
@@ -9,27 +9,27 @@ for examples of how to create a chain.
|
|
|
9
9
|
|
|
10
10
|
::: datachain.query.schema.Column
|
|
11
11
|
|
|
12
|
-
::: datachain.lib.dc.csv.
|
|
12
|
+
::: datachain.lib.dc.csv.read_csv
|
|
13
13
|
|
|
14
|
-
::: datachain.lib.dc.datasets.
|
|
14
|
+
::: datachain.lib.dc.datasets.read_dataset
|
|
15
15
|
|
|
16
16
|
::: datachain.lib.dc.datasets.datasets
|
|
17
17
|
|
|
18
|
-
::: datachain.lib.dc.hf.
|
|
18
|
+
::: datachain.lib.dc.hf.read_hf
|
|
19
19
|
|
|
20
|
-
::: datachain.lib.dc.json.
|
|
20
|
+
::: datachain.lib.dc.json.read_json
|
|
21
21
|
|
|
22
22
|
::: datachain.lib.dc.listings.listings
|
|
23
23
|
|
|
24
|
-
::: datachain.lib.dc.pandas.
|
|
24
|
+
::: datachain.lib.dc.pandas.read_pandas
|
|
25
25
|
|
|
26
|
-
::: datachain.lib.dc.parquet.
|
|
26
|
+
::: datachain.lib.dc.parquet.read_parquet
|
|
27
27
|
|
|
28
|
-
::: datachain.lib.dc.records.
|
|
28
|
+
::: datachain.lib.dc.records.read_records
|
|
29
29
|
|
|
30
|
-
::: datachain.lib.dc.storage.
|
|
30
|
+
::: datachain.lib.dc.storage.read_storage
|
|
31
31
|
|
|
32
|
-
::: datachain.lib.dc.values.
|
|
32
|
+
::: datachain.lib.dc.values.read_values
|
|
33
33
|
|
|
34
34
|
::: datachain.lib.dc.DataChain
|
|
35
35
|
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
# Interacting with remote storage
|
|
2
2
|
|
|
3
|
-
DataChain supports reading and writing data from different remote storages using methods like `dc.
|
|
3
|
+
DataChain supports reading data from and writing data to different remote storages using methods like `dc.read_storage` and `dc.to_storage`. The supported storages include: local file system, AWS S3 storage, Google Cloud Storage, Azure Blob Storage, Hugging Face and more.
|
|
4
4
|
|
|
5
5
|
Example implementation for reading and writing data from/to different remote storages:
|
|
6
6
|
|
|
7
7
|
```python
|
|
8
8
|
import datachain as dc
|
|
9
9
|
|
|
10
|
-
dc = dc.
|
|
10
|
+
dc = dc.read_storage("s3://bucket-name/path/to/data")
|
|
11
11
|
dc.to_storage("gs://bucket-name/path/to/data")
|
|
12
12
|
```
|
|
13
13
|
|
|
14
|
-
DataChain uses [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) to interact with different remote storages. You can pass the following fsspec-supported URIs to `
|
|
14
|
+
DataChain uses [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) to interact with different remote storages. You can pass the following fsspec-supported URIs to the `read_storage` and `to_storage` methods.
|
|
15
15
|
|
|
16
16
|
- Local file system: `file://path/to/data`
|
|
17
17
|
- AWS S3 storage: `s3://bucket-name/path/to/data`
|
|
@@ -134,8 +134,9 @@ DataChain uses [s3fs](https://s3fs.readthedocs.io/en/latest/) to interact with A
|
|
|
134
134
|
|
|
135
135
|
|
|
136
136
|
Example:
|
|
137
|
+
|
|
137
138
|
```python
|
|
138
|
-
chain = dc.
|
|
139
|
+
chain = dc.read_storage(
|
|
139
140
|
"s3://my-bucket/my-dir",
|
|
140
141
|
client_config = {
|
|
141
142
|
"endpoint_url": "<minio-endpoint-url>",
|
|
@@ -41,7 +41,7 @@ def openimage_detect(args):
|
|
|
41
41
|
source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
|
|
42
42
|
|
|
43
43
|
(
|
|
44
|
-
dc.
|
|
44
|
+
dc.read_storage(source)
|
|
45
45
|
.filter(dc.C("file.path").glob("*.jpg") | dc.C("file.path").glob("*.json"))
|
|
46
46
|
.agg(
|
|
47
47
|
openimage_detect,
|
|
@@ -10,7 +10,7 @@ def process_bboxes(yolo: YOLO, file: dc.File) -> YoloBBoxes:
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
(
|
|
13
|
-
dc.
|
|
13
|
+
dc.read_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
|
|
14
14
|
.filter(dc.C("file.path").glob("*.jpg"))
|
|
15
15
|
.limit(20)
|
|
16
16
|
.setup(yolo=lambda: YOLO("yolo11n.pt"))
|
|
@@ -10,7 +10,7 @@ def process_poses(yolo: YOLO, file: dc.File) -> YoloPoses:
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
(
|
|
13
|
-
dc.
|
|
13
|
+
dc.read_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
|
|
14
14
|
.filter(dc.C("file.path").glob("*.jpg"))
|
|
15
15
|
.limit(20)
|
|
16
16
|
.setup(yolo=lambda: YOLO("yolo11n-pose.pt"))
|
|
@@ -10,7 +10,7 @@ def process_segments(yolo: YOLO, file: dc.File) -> YoloSegments:
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
(
|
|
13
|
-
dc.
|
|
13
|
+
dc.read_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
|
|
14
14
|
.filter(dc.C("file.path").glob("*.jpg"))
|
|
15
15
|
.limit(20)
|
|
16
16
|
.setup(yolo=lambda: YOLO("yolo11n-seg.pt"))
|
|
@@ -9,7 +9,7 @@ def num_chars_udf(file):
|
|
|
9
9
|
return ([],)
|
|
10
10
|
|
|
11
11
|
|
|
12
|
-
chain = dc.
|
|
12
|
+
chain = dc.read_storage("gs://datachain-demo/dogs-and-cats/", anon=True)
|
|
13
13
|
chain.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
|
|
14
14
|
"file.path", "num_chars"
|
|
15
15
|
).show(5)
|
|
@@ -31,7 +31,7 @@ ChatFeature = ModelStore.register(ChatDialog)
|
|
|
31
31
|
def main():
|
|
32
32
|
# Dynamic JSONl schema from 2 objects
|
|
33
33
|
uri = "gs://datachain-demo/jsonl/object.jsonl"
|
|
34
|
-
jsonl_ds = dc.
|
|
34
|
+
jsonl_ds = dc.read_json(uri, format="jsonl", anon="True")
|
|
35
35
|
jsonl_ds.show()
|
|
36
36
|
|
|
37
37
|
# Dynamic JSON schema from 200 OpenImage json-pairs with validation errors
|
|
@@ -39,7 +39,7 @@ def main():
|
|
|
39
39
|
schema_uri = (
|
|
40
40
|
"gs://datachain-demo/openimages-v6-test-jsonpairs/08392c290ecc9d2a.json"
|
|
41
41
|
)
|
|
42
|
-
json_pairs_ds = dc.
|
|
42
|
+
json_pairs_ds = dc.read_json(
|
|
43
43
|
uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage", anon="True"
|
|
44
44
|
)
|
|
45
45
|
json_pairs_ds.show()
|
|
@@ -47,29 +47,29 @@ def main():
|
|
|
47
47
|
uri = "gs://datachain-demo/coco2017/annotations_captions/"
|
|
48
48
|
|
|
49
49
|
# Print JSON schema in Pydantic format from main COCO annotation
|
|
50
|
-
chain = dc.
|
|
50
|
+
chain = dc.read_storage(uri, anon="True").filter(dc.C("file.path").glob("*.json"))
|
|
51
51
|
file = next(chain.limit(1).collect("file"))
|
|
52
52
|
print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))
|
|
53
53
|
|
|
54
54
|
# Static JSON schema test parsing 3/7 objects
|
|
55
|
-
static_json_ds = dc.
|
|
55
|
+
static_json_ds = dc.read_json(
|
|
56
56
|
uri, jmespath="licenses", spec=LicenseFeature, nrows=3, anon="True"
|
|
57
57
|
)
|
|
58
58
|
static_json_ds.show()
|
|
59
59
|
|
|
60
60
|
# Dynamic JSON schema test parsing 5K objects
|
|
61
|
-
dynamic_json_ds = dc.
|
|
61
|
+
dynamic_json_ds = dc.read_json(uri, jmespath="images", anon="True")
|
|
62
62
|
print(dynamic_json_ds.to_pandas())
|
|
63
63
|
|
|
64
64
|
# Static CSV with header schema test parsing 3.5K objects
|
|
65
65
|
uri = "gs://datachain-demo/chatbot-csv/"
|
|
66
|
-
static_csv_ds = dc.
|
|
66
|
+
static_csv_ds = dc.read_csv(uri, output=ChatDialog, object_name="chat", anon="True")
|
|
67
67
|
static_csv_ds.print_schema()
|
|
68
68
|
static_csv_ds.show()
|
|
69
69
|
|
|
70
70
|
# Dynamic CSV with header schema test parsing 3/3M objects
|
|
71
71
|
uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
|
|
72
|
-
dynamic_csv_ds = dc.
|
|
72
|
+
dynamic_csv_ds = dc.read_csv(uri, object_name="laion", nrows=3, anon="True")
|
|
73
73
|
dynamic_csv_ds.print_schema()
|
|
74
74
|
dynamic_csv_ds.show()
|
|
75
75
|
|
|
@@ -34,7 +34,7 @@ class ImageEncoder(dc.Mapper):
|
|
|
34
34
|
if __name__ == "__main__":
|
|
35
35
|
# Run in chain
|
|
36
36
|
(
|
|
37
|
-
dc.
|
|
37
|
+
dc.read_storage("gs://datachain-demo/dogs-and-cats/", type="image")
|
|
38
38
|
.filter(dc.C("file.path").glob("*cat*.jpg"))
|
|
39
39
|
.settings(parallel=2)
|
|
40
40
|
.limit(5)
|
|
@@ -48,7 +48,7 @@ def eval_dialog(
|
|
|
48
48
|
# Save to HF as Parquet. Dataset can be previewed here:
|
|
49
49
|
# https://huggingface.co/datasets/dvcorg/test-datachain-llm-eval/viewer
|
|
50
50
|
(
|
|
51
|
-
dc.
|
|
51
|
+
dc.read_csv("hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv")
|
|
52
52
|
.settings(parallel=10)
|
|
53
53
|
.setup(client=lambda: InferenceClient("meta-llama/Llama-3.1-70B-Instruct"))
|
|
54
54
|
.map(response=eval_dialog)
|
|
@@ -58,7 +58,7 @@ def eval_dialog(
|
|
|
58
58
|
# Read it back to filter and show.
|
|
59
59
|
# It restores the Pydantic model from Parquet under the hood.
|
|
60
60
|
(
|
|
61
|
-
dc.
|
|
61
|
+
dc.read_parquet(
|
|
62
62
|
"hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet", source=False
|
|
63
63
|
)
|
|
64
64
|
.filter(dc.C("response.result") == "Failure")
|
|
@@ -9,8 +9,8 @@ source = "gs://datachain-demo/50k-laion-files/000000/00000000*"
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def create_dataset():
|
|
12
|
-
imgs = dc.
|
|
13
|
-
captions = dc.
|
|
12
|
+
imgs = dc.read_storage(source, type="image").filter(dc.C("file.path").glob("*.jpg"))
|
|
13
|
+
captions = dc.read_storage(source, type="text").filter(
|
|
14
14
|
dc.C("file.path").glob("*.txt")
|
|
15
15
|
)
|
|
16
16
|
return imgs.merge(
|
|
@@ -50,7 +50,7 @@ if __name__ == "__main__":
|
|
|
50
50
|
print("** HuggingFace pipeline helper model zoo demo **")
|
|
51
51
|
print("\nZero-shot object detection and classification:")
|
|
52
52
|
(
|
|
53
|
-
dc.
|
|
53
|
+
dc.read_storage(
|
|
54
54
|
image_source,
|
|
55
55
|
anon=True,
|
|
56
56
|
type="image",
|
|
@@ -72,7 +72,7 @@ if __name__ == "__main__":
|
|
|
72
72
|
|
|
73
73
|
print("\nNot-safe-for-work image detection:")
|
|
74
74
|
(
|
|
75
|
-
dc.
|
|
75
|
+
dc.read_storage(
|
|
76
76
|
image_source,
|
|
77
77
|
anon=True,
|
|
78
78
|
type="image",
|
|
@@ -95,7 +95,7 @@ if __name__ == "__main__":
|
|
|
95
95
|
try:
|
|
96
96
|
subprocess.run(["ffmpeg", "-L"], check=True) # noqa: S603, S607
|
|
97
97
|
(
|
|
98
|
-
dc.
|
|
98
|
+
dc.read_storage(
|
|
99
99
|
audio_source,
|
|
100
100
|
anon=True,
|
|
101
101
|
type="binary",
|
|
@@ -118,7 +118,7 @@ if __name__ == "__main__":
|
|
|
118
118
|
|
|
119
119
|
print("\nLong text summarization:")
|
|
120
120
|
(
|
|
121
|
-
dc.
|
|
121
|
+
dc.read_storage(
|
|
122
122
|
text_source,
|
|
123
123
|
anon=True,
|
|
124
124
|
type="text",
|
|
@@ -16,18 +16,18 @@ NPZ_METADATA = os.getenv(
|
|
|
16
16
|
)
|
|
17
17
|
|
|
18
18
|
wds_images = (
|
|
19
|
-
dc.
|
|
19
|
+
dc.read_storage(IMAGE_TARS, type="image")
|
|
20
20
|
.settings(cache=True)
|
|
21
21
|
.gen(laion=process_webdataset(spec=WDSLaion), params="file")
|
|
22
22
|
)
|
|
23
23
|
|
|
24
24
|
wds_with_pq = (
|
|
25
|
-
dc.
|
|
25
|
+
dc.read_parquet(PARQUET_METADATA)
|
|
26
26
|
.settings(cache=True)
|
|
27
27
|
.merge(wds_images, on="uid", right_on="laion.json.uid", inner=True)
|
|
28
28
|
)
|
|
29
29
|
|
|
30
|
-
wds_npz = dc.
|
|
30
|
+
wds_npz = dc.read_storage(NPZ_METADATA).settings(cache=True).gen(emd=process_laion_meta)
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
res = wds_npz.merge(
|
|
@@ -6,10 +6,10 @@ from datachain.lib.webdataset_laion import WDSLaion
|
|
|
6
6
|
|
|
7
7
|
name = "wds"
|
|
8
8
|
try:
|
|
9
|
-
wds = dc.
|
|
9
|
+
wds = dc.read_dataset(name=name)
|
|
10
10
|
except datachain.error.DatasetNotFoundError:
|
|
11
11
|
wds = (
|
|
12
|
-
dc.
|
|
12
|
+
dc.read_storage("gs://datachain-demo/datacomp-small/shards")
|
|
13
13
|
.filter(dc.C("file.path").glob("*/00000000.tar"))
|
|
14
14
|
.settings(cache=True)
|
|
15
15
|
.gen(laion=process_webdataset(spec=WDSLaion), params="file")
|