datachain 0.13.1__tar.gz → 0.14.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.13.1 → datachain-0.14.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.13.1 → datachain-0.14.0}/PKG-INFO +10 -10
- {datachain-0.13.1 → datachain-0.14.0}/README.rst +9 -9
- {datachain-0.13.1 → datachain-0.14.0}/docs/examples.md +15 -16
- {datachain-0.13.1 → datachain-0.14.0}/docs/quick-start.md +23 -20
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/file.md +4 -4
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/imagefile.md +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/textfile.md +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/videofile.md +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/datachain.md +22 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/remotes.md +4 -4
- {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/iptc_exif_xmp_lib.py +4 -4
- {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/llava2_image_desc_lib.py +4 -4
- {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/openimage-detect.py +4 -3
- {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/ultralytics-bbox.py +4 -4
- {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/ultralytics-pose.py +4 -4
- {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/ultralytics-segment.py +4 -4
- {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/common_sql_functions.py +14 -14
- {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/json-csv-reader.py +8 -12
- {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/torch-loader.py +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/udfs/parallel.py +2 -2
- {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/udfs/simple.py +2 -2
- {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/udfs/stateful.py +4 -4
- {datachain-0.13.1 → datachain-0.14.0}/examples/llm_and_nlp/claude-query.py +7 -7
- {datachain-0.13.1 → datachain-0.14.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +5 -7
- {datachain-0.13.1 → datachain-0.14.0}/examples/multimodal/clip_inference.py +6 -8
- {datachain-0.13.1 → datachain-0.14.0}/examples/multimodal/hf_pipeline.py +10 -10
- {datachain-0.13.1 → datachain-0.14.0}/examples/multimodal/openai_image_desc_lib.py +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/examples/multimodal/wds.py +4 -8
- {datachain-0.13.1 → datachain-0.14.0}/examples/multimodal/wds_filtered.py +5 -4
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/__init__.py +28 -1
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/catalog/catalog.py +5 -9
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/ls.py +2 -2
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/show.py +2 -3
- datachain-0.14.0/src/datachain/lib/dc/__init__.py +32 -0
- datachain-0.14.0/src/datachain/lib/dc/csv.py +127 -0
- datachain-0.13.1/src/datachain/lib/dc.py → datachain-0.14.0/src/datachain/lib/dc/datachain.py +144 -733
- datachain-0.14.0/src/datachain/lib/dc/datasets.py +149 -0
- datachain-0.14.0/src/datachain/lib/dc/hf.py +73 -0
- datachain-0.14.0/src/datachain/lib/dc/json.py +91 -0
- datachain-0.14.0/src/datachain/lib/dc/listings.py +43 -0
- datachain-0.14.0/src/datachain/lib/dc/pandas.py +56 -0
- datachain-0.14.0/src/datachain/lib/dc/parquet.py +65 -0
- datachain-0.14.0/src/datachain/lib/dc/records.py +90 -0
- datachain-0.14.0/src/datachain/lib/dc/storage.py +118 -0
- datachain-0.14.0/src/datachain/lib/dc/utils.py +128 -0
- datachain-0.14.0/src/datachain/lib/dc/values.py +53 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/meta_formats.py +2 -4
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/pytorch.py +2 -2
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/udf.py +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/toolkit/split.py +2 -2
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/PKG-INFO +10 -10
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/SOURCES.txt +13 -1
- {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/test_datachain.py +2 -2
- {datachain-0.13.1 → datachain-0.14.0}/tests/conftest.py +52 -4
- {datachain-0.13.1 → datachain-0.14.0}/tests/examples/test_wds_e2e.py +8 -8
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_catalog.py +15 -15
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_cloud_transfer.py +2 -2
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_data_storage.py +2 -2
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_datachain.py +193 -195
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_datachain_merge.py +5 -5
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_datasets.py +9 -13
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_feature_pickling.py +11 -11
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_file.py +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_hidden_field.py +6 -6
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_listing.py +4 -4
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_ls.py +2 -2
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_pull.py +2 -2
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_pytorch.py +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_query.py +5 -4
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_session.py +4 -4
- {datachain-0.13.1 → datachain-0.14.0}/tests/scripts/feature_class.py +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/tests/scripts/feature_class_exception.py +6 -6
- {datachain-0.13.1 → datachain-0.14.0}/tests/scripts/feature_class_parallel.py +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/tests/scripts/feature_class_parallel_data_model.py +3 -2
- {datachain-0.13.1 → datachain-0.14.0}/tests/scripts/name_len_slow.py +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/tests/test_import_time.py +10 -10
- {datachain-0.13.1 → datachain-0.14.0}/tests/test_telemetry.py +2 -2
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_arrow.py +3 -3
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_datachain.py +328 -358
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_datachain_bootstrap.py +6 -5
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_datachain_merge.py +23 -24
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_diff.py +36 -38
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_feature_utils.py +12 -12
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_schema.py +4 -4
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_func.py +169 -115
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_listing.py +4 -8
- {datachain-0.13.1 → datachain-0.14.0}/tests/utils.py +17 -5
- {datachain-0.13.1 → datachain-0.14.0}/.cruft.json +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.gitattributes +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.github/codecov.yaml +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.github/dependabot.yml +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.github/workflows/release.yml +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/.gitignore +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/LICENSE +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/contributing.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/index.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/overrides/main.html +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/func.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/index.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/toolkit.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/torch.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/references/udf.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/docs/tutorials.md +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/mkdocs.yml +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/noxfile.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/pyproject.toml +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/setup.cfg +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/__main__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/asyn.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cache.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/local.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/config.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/dataset.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/error.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/array.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/base.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/func.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/path.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/random.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/string.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/window.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/job.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/file.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/listing.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/node.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/progress.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/py.typed +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/dataset.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/params.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/session.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/studio.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain/utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/data.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/examples/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_client.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_hf.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_image.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_video.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/test_atomicity.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/test_cli_studio.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_client.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_config.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_query.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_session.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_warehouse.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.13.1
|
|
3
|
+
Version: 0.14.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -169,16 +169,16 @@ high confidence scores.
|
|
|
169
169
|
|
|
170
170
|
.. code:: py
|
|
171
171
|
|
|
172
|
-
|
|
172
|
+
import datachain as dc
|
|
173
173
|
|
|
174
|
-
meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
|
|
175
|
-
images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
|
|
174
|
+
meta = dc.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
|
|
175
|
+
images = dc.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
|
|
176
176
|
|
|
177
177
|
images_id = images.map(id=lambda file: file.path.split('.')[-2])
|
|
178
178
|
annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
179
179
|
|
|
180
|
-
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
181
|
-
& (Column("meta.inference.class_") == "cat"))
|
|
180
|
+
likely_cats = annotated.filter((dc.Column("meta.inference.confidence") > 0.93) \
|
|
181
|
+
& (dc.Column("meta.inference.class_") == "cat"))
|
|
182
182
|
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
183
183
|
|
|
184
184
|
|
|
@@ -199,11 +199,11 @@ Python code:
|
|
|
199
199
|
|
|
200
200
|
import os
|
|
201
201
|
from mistralai import Mistral
|
|
202
|
-
|
|
202
|
+
import datachain as dc
|
|
203
203
|
|
|
204
204
|
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
205
205
|
|
|
206
|
-
def eval_dialogue(file: File) -> bool:
|
|
206
|
+
def eval_dialogue(file: dc.File) -> bool:
|
|
207
207
|
client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
|
|
208
208
|
response = client.chat.complete(
|
|
209
209
|
model="open-mixtral-8x22b",
|
|
@@ -213,13 +213,13 @@ Python code:
|
|
|
213
213
|
return result.lower().startswith("success")
|
|
214
214
|
|
|
215
215
|
chain = (
|
|
216
|
-
|
|
216
|
+
dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
217
217
|
.settings(parallel=4, cache=True)
|
|
218
218
|
.map(is_success=eval_dialogue)
|
|
219
219
|
.save("mistral_files")
|
|
220
220
|
)
|
|
221
221
|
|
|
222
|
-
successful_chain = chain.filter(Column("is_success") == True)
|
|
222
|
+
successful_chain = chain.filter(dc.Column("is_success") == True)
|
|
223
223
|
successful_chain.to_storage("./output_mistral")
|
|
224
224
|
|
|
225
225
|
print(f"{successful_chain.count()} files were exported")
|
|
@@ -58,16 +58,16 @@ high confidence scores.
|
|
|
58
58
|
|
|
59
59
|
.. code:: py
|
|
60
60
|
|
|
61
|
-
|
|
61
|
+
import datachain as dc
|
|
62
62
|
|
|
63
|
-
meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
|
|
64
|
-
images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
|
|
63
|
+
meta = dc.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
|
|
64
|
+
images = dc.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
|
|
65
65
|
|
|
66
66
|
images_id = images.map(id=lambda file: file.path.split('.')[-2])
|
|
67
67
|
annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
68
68
|
|
|
69
|
-
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
70
|
-
& (Column("meta.inference.class_") == "cat"))
|
|
69
|
+
likely_cats = annotated.filter((dc.Column("meta.inference.confidence") > 0.93) \
|
|
70
|
+
& (dc.Column("meta.inference.class_") == "cat"))
|
|
71
71
|
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
72
72
|
|
|
73
73
|
|
|
@@ -88,11 +88,11 @@ Python code:
|
|
|
88
88
|
|
|
89
89
|
import os
|
|
90
90
|
from mistralai import Mistral
|
|
91
|
-
|
|
91
|
+
import datachain as dc
|
|
92
92
|
|
|
93
93
|
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
94
94
|
|
|
95
|
-
def eval_dialogue(file: File) -> bool:
|
|
95
|
+
def eval_dialogue(file: dc.File) -> bool:
|
|
96
96
|
client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
|
|
97
97
|
response = client.chat.complete(
|
|
98
98
|
model="open-mixtral-8x22b",
|
|
@@ -102,13 +102,13 @@ Python code:
|
|
|
102
102
|
return result.lower().startswith("success")
|
|
103
103
|
|
|
104
104
|
chain = (
|
|
105
|
-
|
|
105
|
+
dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
106
106
|
.settings(parallel=4, cache=True)
|
|
107
107
|
.map(is_success=eval_dialogue)
|
|
108
108
|
.save("mistral_files")
|
|
109
109
|
)
|
|
110
110
|
|
|
111
|
-
successful_chain = chain.filter(Column("is_success") == True)
|
|
111
|
+
successful_chain = chain.filter(dc.Column("is_success") == True)
|
|
112
112
|
successful_chain.to_storage("./output_mistral")
|
|
113
113
|
|
|
114
114
|
print(f"{successful_chain.count()} files were exported")
|
|
@@ -13,10 +13,10 @@ title: Examples
|
|
|
13
13
|
For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The code below takes images from the cloud, applies the PaliGemma model to caption the first five of them, and puts the results in the column “scene”:
|
|
14
14
|
|
|
15
15
|
```python
|
|
16
|
-
|
|
16
|
+
import datachain as dc # (1)!
|
|
17
17
|
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)!
|
|
18
18
|
|
|
19
|
-
images =
|
|
19
|
+
images = dc.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
|
|
20
20
|
|
|
21
21
|
model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-mix-224")
|
|
22
22
|
processor = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
|
|
@@ -80,7 +80,7 @@ In the below example, we are calling a Mixtral 8x22b model to judge the “servi
|
|
|
80
80
|
# $ export MISTRAL_API_KEY='your key'
|
|
81
81
|
|
|
82
82
|
import os
|
|
83
|
-
|
|
83
|
+
import datachain as dc
|
|
84
84
|
from mistralai.client import MistralClient
|
|
85
85
|
from mistralai.models.chat_completion import ChatMessage
|
|
86
86
|
from mistralai.models.chat_completion import ChatCompletionResponse as MistralModel
|
|
@@ -89,12 +89,12 @@ prompt = "Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in
|
|
|
89
89
|
api_key = os.environ["MISTRAL_API_KEY"]
|
|
90
90
|
|
|
91
91
|
## register the data model ###
|
|
92
|
-
DataModel.register(MistralModel)
|
|
92
|
+
dc.DataModel.register(MistralModel)
|
|
93
93
|
|
|
94
94
|
chain = (
|
|
95
|
-
|
|
95
|
+
dc
|
|
96
96
|
.from_storage("gs://datachain-demo/chatbot-KiT/", type="text")
|
|
97
|
-
.filter(Column("file.name").glob("*.txt"))
|
|
97
|
+
.filter(dc.Column("file.name").glob("*.txt"))
|
|
98
98
|
.limit(5)
|
|
99
99
|
.settings(parallel=4, cache=True)
|
|
100
100
|
.map(
|
|
@@ -145,13 +145,13 @@ The cost of 5 calls to Mixtral 8x22b : $0.0142
|
|
|
145
145
|
The “save” operation makes the chain's dataset persistent in the current (working) directory of the query. A hidden folder `.datachain/` holds the records. A persistent dataset can be accessed later to start a derivative chain:
|
|
146
146
|
|
|
147
147
|
```python
|
|
148
|
-
|
|
148
|
+
dc.from_dataset("rating").limit(2).save("dialog-rating")
|
|
149
149
|
```
|
|
150
150
|
|
|
151
151
|
Persistent datasets are immutable and automatically versioned. Here is how to access the dataset registry:
|
|
152
152
|
|
|
153
153
|
```python
|
|
154
|
-
mydatasets =
|
|
154
|
+
mydatasets = dc.datasets()
|
|
155
155
|
for ds in mydatasets.collect("dataset"):
|
|
156
156
|
print(f"{ds.name}@v{ds.version}")
|
|
157
157
|
|
|
@@ -167,7 +167,7 @@ dialog-rating@v2
|
|
|
167
167
|
By default, when a saved dataset is loaded, the latest version is fetched but another version can be requested:
|
|
168
168
|
|
|
169
169
|
```python
|
|
170
|
-
ds =
|
|
170
|
+
ds = dc.from_dataset("dialog-rating", version = 1)
|
|
171
171
|
```
|
|
172
172
|
|
|
173
173
|
### Chain execution, optimization and parallelism
|
|
@@ -190,7 +190,7 @@ Here is an example of reading a simple CSV file where schema is heuristically de
|
|
|
190
190
|
from datachain import DataChain
|
|
191
191
|
|
|
192
192
|
uri="gs://datachain-demo/chatbot-csv/"
|
|
193
|
-
csv_dataset =
|
|
193
|
+
csv_dataset = dc.from_csv(uri)
|
|
194
194
|
|
|
195
195
|
print(csv_dataset.to_pandas())
|
|
196
196
|
```
|
|
@@ -231,15 +231,14 @@ Note how complicated the setup is. Every image is references by the name, and th
|
|
|
231
231
|
However, Datachain can easily parse the entire COCO structure via several reading and merging operators:
|
|
232
232
|
|
|
233
233
|
```python
|
|
234
|
-
|
|
235
|
-
from datachain import Column, DataChain
|
|
234
|
+
import datachain as dc
|
|
236
235
|
|
|
237
236
|
images_uri="gs://datachain-demo/coco2017/images/val/"
|
|
238
237
|
captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json"
|
|
239
238
|
|
|
240
|
-
images =
|
|
241
|
-
meta =
|
|
242
|
-
captions =
|
|
239
|
+
images = dc.from_storage(images_uri)
|
|
240
|
+
meta = dc.from_json(captions_uri, jmespath = "images")
|
|
241
|
+
captions = dc.from_json(captions_uri, jmespath = "annotations")
|
|
243
242
|
|
|
244
243
|
images_meta = images.merge(meta, on="file.name", right_on="images.file_name")
|
|
245
244
|
captioned_images = images_meta.merge(captions, on="images.id", right_on="annotations.image_id")
|
|
@@ -248,7 +247,7 @@ captioned_images = images_meta.merge(captions, on="images.id", right_on="annotat
|
|
|
248
247
|
The resulting dataset has image entries as files decorated with all the metadata and captions:
|
|
249
248
|
|
|
250
249
|
```python
|
|
251
|
-
images_with_dogs = captioned_images.filter(Column("annotations.caption").glob("*dog*"))
|
|
250
|
+
images_with_dogs = captioned_images.filter(dc.Column("annotations.caption").glob("*dog*"))
|
|
252
251
|
images_with_dogs.select("annotations", "file.name").show()
|
|
253
252
|
```
|
|
254
253
|
|
|
@@ -37,16 +37,16 @@ Example of downloading only _`high-confidence cat`_ inferred images
|
|
|
37
37
|
using JSON metadata:
|
|
38
38
|
|
|
39
39
|
``` py
|
|
40
|
-
|
|
40
|
+
import datachain as dc
|
|
41
41
|
|
|
42
|
-
meta =
|
|
43
|
-
images =
|
|
42
|
+
meta = dc.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
|
|
43
|
+
images = dc.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
|
|
44
44
|
|
|
45
45
|
images_id = images.map(id=lambda file: file.path.split('.')[-2])
|
|
46
46
|
annotated = images_id.merge(meta, on="id", right_on="meta.id")
|
|
47
47
|
|
|
48
|
-
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
49
|
-
& (Column("meta.inference.class_") == "cat"))
|
|
48
|
+
likely_cats = annotated.filter((dc.Column("meta.inference.confidence") > 0.93) \
|
|
49
|
+
& (dc.Column("meta.inference.class_") == "cat"))
|
|
50
50
|
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
51
51
|
```
|
|
52
52
|
|
|
@@ -67,7 +67,7 @@ sentiment detected are then copied to the local directory.
|
|
|
67
67
|
|
|
68
68
|
``` py
|
|
69
69
|
from transformers import pipeline
|
|
70
|
-
|
|
70
|
+
import datachain as dc
|
|
71
71
|
|
|
72
72
|
classifier = pipeline("sentiment-analysis", device="cpu",
|
|
73
73
|
model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
|
|
@@ -77,7 +77,7 @@ def is_positive_dialogue_ending(file) -> bool:
|
|
|
77
77
|
return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
|
|
78
78
|
|
|
79
79
|
chain = (
|
|
80
|
-
|
|
80
|
+
dc.from_storage("gs://datachain-demo/chatbot-KiT/",
|
|
81
81
|
object_name="file", type="text", anon=True)
|
|
82
82
|
.settings(parallel=8, cache=True)
|
|
83
83
|
.map(is_positive=is_positive_dialogue_ending)
|
|
@@ -118,11 +118,11 @@ to 4 requests at the same time.
|
|
|
118
118
|
``` py
|
|
119
119
|
import os
|
|
120
120
|
from mistralai import Mistral
|
|
121
|
-
|
|
121
|
+
import datachain as dc
|
|
122
122
|
|
|
123
123
|
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
124
124
|
|
|
125
|
-
def eval_dialogue(file: File) -> bool:
|
|
125
|
+
def eval_dialogue(file: dc.File) -> bool:
|
|
126
126
|
client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
|
|
127
127
|
response = client.chat.complete(
|
|
128
128
|
model="open-mixtral-8x22b",
|
|
@@ -132,12 +132,12 @@ def eval_dialogue(file: File) -> bool:
|
|
|
132
132
|
return result.lower().startswith("success")
|
|
133
133
|
|
|
134
134
|
chain = (
|
|
135
|
-
|
|
135
|
+
dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
136
136
|
.map(is_success=eval_dialogue)
|
|
137
137
|
.save("mistral_files")
|
|
138
138
|
)
|
|
139
139
|
|
|
140
|
-
successful_chain = chain.filter(Column("is_success") == True)
|
|
140
|
+
successful_chain = chain.filter(dc.Column("is_success") == True)
|
|
141
141
|
successful_chain.to_storage("./output_mistral")
|
|
142
142
|
|
|
143
143
|
print(f"{successful_chain.count()} files were exported")
|
|
@@ -165,11 +165,11 @@ serialize the entire LLM response to the internal DB:
|
|
|
165
165
|
``` py
|
|
166
166
|
from mistralai import Mistral
|
|
167
167
|
from mistralai.models import ChatCompletionResponse
|
|
168
|
-
|
|
168
|
+
import datachain as dc
|
|
169
169
|
|
|
170
170
|
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
171
171
|
|
|
172
|
-
def eval_dialog(file: File) -> ChatCompletionResponse:
|
|
172
|
+
def eval_dialog(file: dc.File) -> ChatCompletionResponse:
|
|
173
173
|
client = MistralClient()
|
|
174
174
|
return client.chat(
|
|
175
175
|
model="open-mixtral-8x22b",
|
|
@@ -177,7 +177,7 @@ def eval_dialog(file: File) -> ChatCompletionResponse:
|
|
|
177
177
|
{"role": "user", "content": file.read()}])
|
|
178
178
|
|
|
179
179
|
chain = (
|
|
180
|
-
|
|
180
|
+
dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
|
|
181
181
|
.settings(parallel=4, cache=True)
|
|
182
182
|
.map(response=eval_dialog)
|
|
183
183
|
.map(status=lambda response: response.choices[0].message.content.lower()[:7])
|
|
@@ -186,7 +186,7 @@ chain = (
|
|
|
186
186
|
|
|
187
187
|
chain.select("file.name", "status", "response.usage").show(5)
|
|
188
188
|
|
|
189
|
-
success_rate = chain.filter(Column("status") == "success").count() / chain.count()
|
|
189
|
+
success_rate = chain.filter(dc.Column("status") == "success").count() / chain.count()
|
|
190
190
|
print(f"{100*success_rate:.1f}% dialogs were successful")
|
|
191
191
|
```
|
|
192
192
|
|
|
@@ -210,12 +210,14 @@ name usage usage usage
|
|
|
210
210
|
|
|
211
211
|
In the previous examples, datasets were saved in the embedded database
|
|
212
212
|
(`SQLite` in folder `.datachain` of the working directory). These datasets were automatically versioned, and
|
|
213
|
-
can be accessed using `
|
|
213
|
+
can be accessed using `dc.from_dataset("dataset_name")`.
|
|
214
214
|
|
|
215
215
|
Here is how to retrieve a saved dataset and iterate over the objects:
|
|
216
216
|
|
|
217
217
|
``` py
|
|
218
|
-
|
|
218
|
+
import datachain as dc
|
|
219
|
+
|
|
220
|
+
chain = dc.from_dataset("response")
|
|
219
221
|
|
|
220
222
|
# Iterating one-by-one: support out-of-memory workflow
|
|
221
223
|
for file, response in chain.limit(5).collect("file", "response"):
|
|
@@ -245,7 +247,8 @@ assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M
|
|
|
245
247
|
output tokens:
|
|
246
248
|
|
|
247
249
|
``` py
|
|
248
|
-
|
|
250
|
+
import datachain as dc
|
|
251
|
+
chain = dc.from_dataset("mistral_dataset")
|
|
249
252
|
|
|
250
253
|
cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
|
|
251
254
|
+ chain.sum("response.usage.completion_tokens")*0.000006
|
|
@@ -268,12 +271,12 @@ file name suffix, the following code will do it:
|
|
|
268
271
|
from torch.utils.data import DataLoader
|
|
269
272
|
from transformers import CLIPProcessor
|
|
270
273
|
|
|
271
|
-
|
|
274
|
+
import datachain as dc
|
|
272
275
|
|
|
273
276
|
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
274
277
|
|
|
275
278
|
chain = (
|
|
276
|
-
|
|
279
|
+
dc.from_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
|
|
277
280
|
.map(label=lambda name: name.split(".")[0], params=["file.name"])
|
|
278
281
|
.select("file", "label").to_pytorch(
|
|
279
282
|
transform=processor.image_processor,
|
|
@@ -2,13 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
`File` is a special [`DataModel`](index.md#datachain.lib.data_model.DataModel),
|
|
4
4
|
which is automatically generated when a `DataChain` is created from files,
|
|
5
|
-
such as in [`
|
|
5
|
+
such as in [`dc.from_storage`](../datachain.md#datachain.lib.dc.storage.from_storage):
|
|
6
6
|
|
|
7
7
|
```python
|
|
8
|
-
|
|
8
|
+
import datachain as dc
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
chain = dc.from_storage("gs://datachain-demo/dogs-and-cats")
|
|
11
|
+
chain.print_schema()
|
|
12
12
|
```
|
|
13
13
|
|
|
14
14
|
Output:
|
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
`ImageFile` is inherited from [`File`](file.md) with additional methods for working with image files.
|
|
4
4
|
|
|
5
|
-
`ImageFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.
|
|
5
|
+
`ImageFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.storage.from_storage), using `type="image"` param:
|
|
6
6
|
|
|
7
7
|
```python
|
|
8
|
-
|
|
8
|
+
import datachain as dc
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
chain = dc.from_storage("s3://bucket-name/", type="image")
|
|
11
11
|
```
|
|
12
12
|
|
|
13
13
|
::: datachain.lib.file.ImageFile
|
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
`TextFile` is inherited from [`File`](file.md) with additional methods for working with text files.
|
|
4
4
|
|
|
5
|
-
`TextFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.
|
|
5
|
+
`TextFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.storage.from_storage), using `type="text"` param:
|
|
6
6
|
|
|
7
7
|
```python
|
|
8
|
-
|
|
8
|
+
import datachain as dc
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
chain = dc.from_storage("s3://bucket-name/", type="text")
|
|
11
11
|
```
|
|
12
12
|
|
|
13
13
|
::: datachain.lib.file.TextFile
|
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
`VideoFile` extends [`File`](file.md) and provides additional methods for working with video files.
|
|
4
4
|
|
|
5
|
-
`VideoFile` instances are created when a `DataChain` is initialized [from storage](../datachain.md#datachain.lib.dc.
|
|
5
|
+
`VideoFile` instances are created when a `DataChain` is initialized [from storage](../datachain.md#datachain.lib.dc.storage.from_storage) with the `type="video"` parameter:
|
|
6
6
|
|
|
7
7
|
```python
|
|
8
|
-
|
|
8
|
+
import datachain as dc
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
chain = dc.from_storage("s3://bucket-name/", type="video")
|
|
11
11
|
```
|
|
12
12
|
|
|
13
13
|
There are additional models for working with video files:
|
|
@@ -9,6 +9,28 @@ for examples of how to create a chain.
|
|
|
9
9
|
|
|
10
10
|
::: datachain.query.schema.Column
|
|
11
11
|
|
|
12
|
+
::: datachain.lib.dc.csv.from_csv
|
|
13
|
+
|
|
14
|
+
::: datachain.lib.dc.datasets.from_dataset
|
|
15
|
+
|
|
16
|
+
::: datachain.lib.dc.datasets.datasets
|
|
17
|
+
|
|
18
|
+
::: datachain.lib.dc.hf.from_hf
|
|
19
|
+
|
|
20
|
+
::: datachain.lib.dc.json.from_json
|
|
21
|
+
|
|
22
|
+
::: datachain.lib.dc.listings.listings
|
|
23
|
+
|
|
24
|
+
::: datachain.lib.dc.pandas.from_pandas
|
|
25
|
+
|
|
26
|
+
::: datachain.lib.dc.parquet.from_parquet
|
|
27
|
+
|
|
28
|
+
::: datachain.lib.dc.records.from_records
|
|
29
|
+
|
|
30
|
+
::: datachain.lib.dc.storage.from_storage
|
|
31
|
+
|
|
32
|
+
::: datachain.lib.dc.values.from_values
|
|
33
|
+
|
|
12
34
|
::: datachain.lib.dc.DataChain
|
|
13
35
|
|
|
14
36
|
::: datachain.lib.utils.DataChainError
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
# Interacting with remote storage
|
|
2
2
|
|
|
3
|
-
DataChain supports reading and writing data from different remote storages using methods like `
|
|
3
|
+
DataChain supports reading and writing data from different remote storages using methods like `dc.from_storage` and `dc.to_storage`. The supported storages include: local file system, AWS S3 storage, Google Cloud Storage, Azure Blob Storage, Hugging Face and more.
|
|
4
4
|
|
|
5
5
|
Example implementation for reading and writing data from/to different remote storages:
|
|
6
6
|
|
|
7
7
|
```python
|
|
8
|
-
|
|
8
|
+
import datachain as dc
|
|
9
9
|
|
|
10
|
-
dc =
|
|
10
|
+
dc = dc.from_storage("s3://bucket-name/path/to/data")
|
|
11
11
|
dc.to_storage("gs://bucket-name/path/to/data")
|
|
12
12
|
```
|
|
13
13
|
|
|
@@ -135,7 +135,7 @@ DataChain uses [s3fs](https://s3fs.readthedocs.io/en/latest/) to interact with A
|
|
|
135
135
|
|
|
136
136
|
Example:
|
|
137
137
|
```python
|
|
138
|
-
chain =
|
|
138
|
+
chain = dc.from_storage(
|
|
139
139
|
"s3://my-bucket/my-dir",
|
|
140
140
|
client_config = {
|
|
141
141
|
"endpoint_url": "<minio-endpoint-url>",
|
|
@@ -13,7 +13,7 @@ from PIL import (
|
|
|
13
13
|
TiffImagePlugin,
|
|
14
14
|
)
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
import datachain as dc
|
|
17
17
|
|
|
18
18
|
source = "gs://datachain-demo/open-images-v6/"
|
|
19
19
|
|
|
@@ -67,9 +67,9 @@ def image_description(file):
|
|
|
67
67
|
|
|
68
68
|
if __name__ == "__main__":
|
|
69
69
|
(
|
|
70
|
-
|
|
70
|
+
dc.from_storage(source, type="image")
|
|
71
71
|
.settings(parallel=-1)
|
|
72
|
-
.filter(C("file.path").glob("*.jpg"))
|
|
72
|
+
.filter(dc.C("file.path").glob("*.jpg"))
|
|
73
73
|
.limit(5000)
|
|
74
74
|
.map(
|
|
75
75
|
image_description,
|
|
@@ -77,6 +77,6 @@ if __name__ == "__main__":
|
|
|
77
77
|
output={"xmp": dict, "exif": dict, "iptc": dict, "error": str},
|
|
78
78
|
)
|
|
79
79
|
.select("file.path", "xmp", "exif", "iptc", "error")
|
|
80
|
-
.filter((C("xmp") != "{}") | (C("exif") != "{}") | (C("iptc") != "{}"))
|
|
80
|
+
.filter((dc.C("xmp") != "{}") | (dc.C("exif") != "{}") | (dc.C("iptc") != "{}"))
|
|
81
81
|
.show()
|
|
82
82
|
)
|
|
@@ -11,7 +11,7 @@ from transformers import (
|
|
|
11
11
|
LlavaForConditionalGeneration,
|
|
12
12
|
)
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
import datachain as dc
|
|
15
15
|
|
|
16
16
|
model = "llava-hf/llava-1.5-7b-hf"
|
|
17
17
|
|
|
@@ -41,7 +41,7 @@ def infer_dtype(device):
|
|
|
41
41
|
return torch.float16
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
class LLaVADescribe(Mapper):
|
|
44
|
+
class LLaVADescribe(dc.Mapper):
|
|
45
45
|
def __init__(self, device="cpu", model="llava-hf/llava-1.5-7b-hf", max_tokens=300):
|
|
46
46
|
self.device = device
|
|
47
47
|
self.model_name = model
|
|
@@ -71,8 +71,8 @@ class LLaVADescribe(Mapper):
|
|
|
71
71
|
|
|
72
72
|
if __name__ == "__main__":
|
|
73
73
|
(
|
|
74
|
-
|
|
75
|
-
.filter(C("file.path").glob("*/cat*.jpg"))
|
|
74
|
+
dc.from_storage(source, type="image")
|
|
75
|
+
.filter(dc.C("file.path").glob("*/cat*.jpg"))
|
|
76
76
|
.map(
|
|
77
77
|
desc=LLaVADescribe(
|
|
78
78
|
device=device,
|
|
@@ -2,7 +2,8 @@ import json
|
|
|
2
2
|
|
|
3
3
|
from PIL import Image
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
import datachain as dc
|
|
6
|
+
from datachain import File, model
|
|
6
7
|
from datachain.func import path
|
|
7
8
|
|
|
8
9
|
|
|
@@ -40,8 +41,8 @@ def openimage_detect(args):
|
|
|
40
41
|
source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
|
|
41
42
|
|
|
42
43
|
(
|
|
43
|
-
|
|
44
|
-
.filter(C("file.path").glob("*.jpg") | C("file.path").glob("*.json"))
|
|
44
|
+
dc.from_storage(source)
|
|
45
|
+
.filter(dc.C("file.path").glob("*.jpg") | dc.C("file.path").glob("*.json"))
|
|
45
46
|
.agg(
|
|
46
47
|
openimage_detect,
|
|
47
48
|
partition_by=path.file_stem("file.path"),
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
from ultralytics import YOLO
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
import datachain as dc
|
|
4
4
|
from datachain.model.ultralytics import YoloBBoxes
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
def process_bboxes(yolo: YOLO, file: File) -> YoloBBoxes:
|
|
7
|
+
def process_bboxes(yolo: YOLO, file: dc.File) -> YoloBBoxes:
|
|
8
8
|
results = yolo(file.as_image_file().read(), verbose=False)
|
|
9
9
|
return YoloBBoxes.from_results(results)
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
(
|
|
13
|
-
|
|
14
|
-
.filter(C("file.path").glob("*.jpg"))
|
|
13
|
+
dc.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
|
|
14
|
+
.filter(dc.C("file.path").glob("*.jpg"))
|
|
15
15
|
.limit(20)
|
|
16
16
|
.setup(yolo=lambda: YOLO("yolo11n.pt"))
|
|
17
17
|
.map(boxes=process_bboxes)
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
from ultralytics import YOLO
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
import datachain as dc
|
|
4
4
|
from datachain.model.ultralytics import YoloPoses
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
def process_poses(yolo: YOLO, file: File) -> YoloPoses:
|
|
7
|
+
def process_poses(yolo: YOLO, file: dc.File) -> YoloPoses:
|
|
8
8
|
results = yolo(file.as_image_file().read(), verbose=False)
|
|
9
9
|
return YoloPoses.from_results(results)
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
(
|
|
13
|
-
|
|
14
|
-
.filter(C("file.path").glob("*.jpg"))
|
|
13
|
+
dc.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
|
|
14
|
+
.filter(dc.C("file.path").glob("*.jpg"))
|
|
15
15
|
.limit(20)
|
|
16
16
|
.setup(yolo=lambda: YOLO("yolo11n-pose.pt"))
|
|
17
17
|
.map(poses=process_poses)
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
from ultralytics import YOLO
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
import datachain as dc
|
|
4
4
|
from datachain.model.ultralytics import YoloSegments
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
def process_segments(yolo: YOLO, file: File) -> YoloSegments:
|
|
7
|
+
def process_segments(yolo: YOLO, file: dc.File) -> YoloSegments:
|
|
8
8
|
results = yolo(file.as_image_file().read(), verbose=False)
|
|
9
9
|
return YoloSegments.from_results(results)
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
(
|
|
13
|
-
|
|
14
|
-
.filter(C("file.path").glob("*.jpg"))
|
|
13
|
+
dc.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
|
|
14
|
+
.filter(dc.C("file.path").glob("*.jpg"))
|
|
15
15
|
.limit(20)
|
|
16
16
|
.setup(yolo=lambda: YOLO("yolo11n-seg.pt"))
|
|
17
17
|
.map(segments=process_segments)
|