datachain 0.2.5__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain-0.2.6/PKG-INFO +429 -0
- datachain-0.2.6/README.rst +346 -0
- datachain-0.2.6/src/datachain/__init__.py +34 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/feature.py +7 -2
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/feature_utils.py +35 -17
- datachain-0.2.6/src/datachain.egg-info/PKG-INFO +429 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/conftest.py +8 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_datachain_bootstrap.py +3 -3
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_feature_utils.py +37 -1
- datachain-0.2.6/tests/unit/test_module_exports.py +31 -0
- datachain-0.2.5/PKG-INFO +0 -376
- datachain-0.2.5/README.rst +0 -293
- datachain-0.2.5/src/datachain.egg-info/PKG-INFO +0 -376
- datachain-0.2.5/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.cruft.json +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.gitattributes +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.github/codecov.yaml +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.github/dependabot.yml +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.github/workflows/release.yml +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.github/workflows/tests.yml +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.gitignore +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.pre-commit-config.yaml +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/.reuse/dep5 +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/CONTRIBUTING.rst +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/LICENSE +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/LICENSES/Apache-2.0.txt +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/LICENSES/BSD-3-Clause.txt +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/LICENSES/Python-2.0.txt +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/docs/assets/datachain.png +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/docs/index.md +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/docs/references/catalog.md +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/docs/references/datachain.md +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/docs/tutorials/cv_intro.md +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/docs/tutorials/udfs.md +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/blip2_image_desc_lib.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/clip.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/common_sql_functions.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/README.md +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/hf_pipeline.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/json-csv-reader.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/llava2_image_desc_lib.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/llm-claude-aggregate-query.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/llm-claude-simple-query.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/llm-claude.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/loader.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/neurips/README +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/neurips/distance_to_query.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/neurips/llm_chat.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/neurips/requirements.txt +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/neurips/single_query.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/neurips/text_loaders.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/openai_image_desc_lib.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/openimage-detect.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/pose_detection.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/torch-loader.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/udfs/batching.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/udfs/image_transformation.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/udfs/parallel.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/udfs/simple.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/udfs/stateful.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/udfs/stateful_similarity.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/unstructured-text.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/wds.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/wds_filtered.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/zalando/zalando_clip.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/zalando/zalando_dir_as_class.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/zalando/zalando_splits_and_classes_ds.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/examples/zalando/zalando_splits_and_classes_output.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/mkdocs.yml +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/noxfile.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/pyproject.toml +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/setup.cfg +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/__main__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/asyn.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/cache.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/cli.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/cli_utils.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/client/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/client/azure.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/client/gcs.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/client/local.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/client/s3.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/config.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/dataset.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/error.py +0 -0
- {datachain-0.2.5/src/datachain → datachain-0.2.6/src/datachain/lib}/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/cached_stream.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/claude.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/clip.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/dc.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/feature_registry.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/file.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/gpt4_vision.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/hf_image_to_text.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/hf_pipeline.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/image.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/image_transform.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/iptc_exif_xmp.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/settings.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/text.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/udf.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/unstructured.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/utils.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/listing.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/node.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/progress.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/py.typed +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/query/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/query/batch.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/query/builtins.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/query/dataset.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/query/metrics.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/query/params.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/query/schema.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/query/session.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/query/udf.py +0 -0
- {datachain-0.2.5/src/datachain/lib → datachain-0.2.6/src/datachain/remote}/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/remote/studio.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/types.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/sql/utils.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/storage.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain/utils.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/__init__.py +0 -0
- {datachain-0.2.5/src/datachain/remote → datachain-0.2.6/tests/benchmarks}/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/data.py +0 -0
- {datachain-0.2.5/tests/benchmarks → datachain-0.2.6/tests/examples}/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/examples/wds_data.py +0 -0
- {datachain-0.2.5/tests/examples → datachain-0.2.6/tests/func}/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/func/test_catalog.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/func/test_client.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/func/test_datachain.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/func/test_datasets.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/func/test_ls.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/func/test_pull.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/func/test_pytorch.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/func/test_query.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/scripts/feature_class.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/scripts/name_len_normal.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/test_cli_e2e.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/test_query_e2e.py +0 -0
- {datachain-0.2.5/tests/func → datachain-0.2.6/tests/unit}/__init__.py +0 -0
- {datachain-0.2.5/tests/unit → datachain-0.2.6/tests/unit/lib}/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.2.5/tests/unit/lib → datachain-0.2.6/tests/unit/sql}/__init__.py +0 -0
- {datachain-0.2.5/tests/unit/sql → datachain-0.2.6/tests/unit/sql/sqlite}/__init__.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_asyn.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_cache.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_catalog.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_client.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_dataset.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_listing.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_metastore.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_query_params.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_serializer.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_session.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_storage.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_udf.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_utils.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.2.5 → datachain-0.2.6}/tests/utils.py +0 -0
datachain-0.2.6/PKG-INFO
ADDED
|
@@ -0,0 +1,429 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: datachain
|
|
3
|
+
Version: 0.2.6
|
|
4
|
+
Summary: Wrangle unstructured AI data at scale
|
|
5
|
+
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Documentation, https://datachain.dvc.ai
|
|
8
|
+
Project-URL: Issues, https://github.com/iterative/dvcx/issues
|
|
9
|
+
Project-URL: Source, https://github.com/iterative/dvcx
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Description-Content-Type: text/x-rst
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pyyaml
|
|
20
|
+
Requires-Dist: tomlkit
|
|
21
|
+
Requires-Dist: tqdm
|
|
22
|
+
Requires-Dist: numpy
|
|
23
|
+
Requires-Dist: numpy<2,>=1; sys_platform == "win32"
|
|
24
|
+
Requires-Dist: pandas>=2.0.0
|
|
25
|
+
Requires-Dist: pyarrow
|
|
26
|
+
Requires-Dist: typing-extensions
|
|
27
|
+
Requires-Dist: python-dateutil>=2
|
|
28
|
+
Requires-Dist: attrs>=21.3.0
|
|
29
|
+
Requires-Dist: s3fs>=2024.2.0
|
|
30
|
+
Requires-Dist: gcsfs>=2024.2.0
|
|
31
|
+
Requires-Dist: adlfs>=2024.2.0
|
|
32
|
+
Requires-Dist: dvc-data<4,>=3.10
|
|
33
|
+
Requires-Dist: dvc-objects<6,>=4
|
|
34
|
+
Requires-Dist: shtab<2,>=1.3.4
|
|
35
|
+
Requires-Dist: sqlalchemy>=2
|
|
36
|
+
Requires-Dist: multiprocess==0.70.16
|
|
37
|
+
Requires-Dist: dill==0.3.8
|
|
38
|
+
Requires-Dist: ujson>=5.9.0
|
|
39
|
+
Requires-Dist: pydantic<3,>=2
|
|
40
|
+
Requires-Dist: jmespath>=1.0
|
|
41
|
+
Requires-Dist: datamodel-code-generator>=0.25
|
|
42
|
+
Provides-Extra: docs
|
|
43
|
+
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
44
|
+
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
45
|
+
Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
|
|
46
|
+
Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
|
|
47
|
+
Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
|
|
48
|
+
Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
|
|
49
|
+
Provides-Extra: cv
|
|
50
|
+
Requires-Dist: Pillow<11,>=10.0.0; extra == "cv"
|
|
51
|
+
Requires-Dist: torch>=2.1.0; extra == "cv"
|
|
52
|
+
Requires-Dist: torchvision; extra == "cv"
|
|
53
|
+
Requires-Dist: transformers>=4.36.0; extra == "cv"
|
|
54
|
+
Provides-Extra: remote
|
|
55
|
+
Requires-Dist: lz4; extra == "remote"
|
|
56
|
+
Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
|
|
57
|
+
Requires-Dist: requests>=2.22.0; extra == "remote"
|
|
58
|
+
Provides-Extra: vector
|
|
59
|
+
Requires-Dist: usearch; extra == "vector"
|
|
60
|
+
Provides-Extra: tests
|
|
61
|
+
Requires-Dist: datachain[cv,remote,vector]; extra == "tests"
|
|
62
|
+
Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
63
|
+
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
64
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
65
|
+
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
66
|
+
Requires-Dist: pytest-servers[all]>=0.5.4; extra == "tests"
|
|
67
|
+
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
68
|
+
Requires-Dist: pytest-asyncio>=0.23.2; extra == "tests"
|
|
69
|
+
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
70
|
+
Requires-Dist: virtualenv; extra == "tests"
|
|
71
|
+
Requires-Dist: dulwich; extra == "tests"
|
|
72
|
+
Requires-Dist: hypothesis; extra == "tests"
|
|
73
|
+
Requires-Dist: open_clip_torch; extra == "tests"
|
|
74
|
+
Requires-Dist: aiotools>=1.7.0; extra == "tests"
|
|
75
|
+
Requires-Dist: requests-mock; extra == "tests"
|
|
76
|
+
Provides-Extra: dev
|
|
77
|
+
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
78
|
+
Requires-Dist: mypy==1.10.1; extra == "dev"
|
|
79
|
+
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
80
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
81
|
+
Requires-Dist: types-requests; extra == "dev"
|
|
82
|
+
Requires-Dist: types-ujson; extra == "dev"
|
|
83
|
+
|
|
84
|
+
|PyPI| |Python Version| |Codecov| |Tests|
|
|
85
|
+
|
|
86
|
+
.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
|
|
87
|
+
:target: https://pypi.org/project/datachain/
|
|
88
|
+
:alt: PyPI
|
|
89
|
+
.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
|
|
90
|
+
:target: https://pypi.org/project/datachain
|
|
91
|
+
:alt: Python Version
|
|
92
|
+
.. |Codecov| image:: https://codecov.io/gh/iterative/dvcx/branch/main/graph/badge.svg?token=VSCP2T9R5X
|
|
93
|
+
:target: https://app.codecov.io/gh/iterative/dvcx
|
|
94
|
+
:alt: Codecov
|
|
95
|
+
.. |Tests| image:: https://github.com/iterative/dvcx/workflows/Tests/badge.svg
|
|
96
|
+
:target: https://github.com/iterative/dvcx/actions?workflow=Tests
|
|
97
|
+
:alt: Tests
|
|
98
|
+
|
|
99
|
+
AI 🔗 DataChain
|
|
100
|
+
----------------
|
|
101
|
+
|
|
102
|
+
DataChain is an open-source Python data processing library for wrangling unstructured AI data at scale.
|
|
103
|
+
|
|
104
|
+
DataChain enables multimodal API calls and local AI inference to run in parallel over many samples as chained operations. The resulting datasets can be saved, versioned, and sent directly to PyTorch and TensorFlow for training. DataChain can persist features of Python objects returned by AI models, and enables vectorized analytical operations over them.
|
|
105
|
+
|
|
106
|
+
The typical use cases are data curation, LLM analytics and validation, image segmentation, pose detection, and GenAI alignment. DataChain is especially helpful if batch operations can be optimized – for instance, when synchronous API calls can be parallelized or where an LLM API offers batch processing.
|
|
107
|
+
|
|
108
|
+
.. code:: console
|
|
109
|
+
|
|
110
|
+
$ pip install datachain
|
|
111
|
+
|
|
112
|
+
Operation basics
|
|
113
|
+
----------------
|
|
114
|
+
|
|
115
|
+
DataChain is built by composing wrangling operations.
|
|
116
|
+
|
|
117
|
+
For example, let us consider a dataset from Karlsruhe Institute of Technology detailing dialogs between users and customer service chatbots. We can use the chain to read data from the cloud, map it onto the parallel API calls for LLM evaluation, and organize the output into a dataset:
|
|
118
|
+
|
|
119
|
+
.. code:: py
|
|
120
|
+
|
|
121
|
+
# pip install mistralai
|
|
122
|
+
# this example requires a free Mistral API key, get yours at https://console.mistral.ai
|
|
123
|
+
# add the key to your shell environment: $ export MISTRAL_API_KEY=<your key>
|
|
124
|
+
|
|
125
|
+
# pip install mistralai
|
|
126
|
+
# this example requires a free Mistral API key, get yours at https://console.mistral.ai
|
|
127
|
+
# add the key to your shell environment: $ export MISTRAL_API_KEY=<your key>
|
|
128
|
+
|
|
129
|
+
import os
|
|
130
|
+
|
|
131
|
+
from mistralai.client import MistralClient
|
|
132
|
+
from mistralai.models.chat_completion import ChatMessage
|
|
133
|
+
|
|
134
|
+
from datachain.lib.dc import DataChain, Column
|
|
135
|
+
|
|
136
|
+
PROMPT = "Was this bot dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
|
|
137
|
+
|
|
138
|
+
model = "mistral-large-latest"
|
|
139
|
+
api_key = os.environ["MISTRAL_API_KEY"]
|
|
140
|
+
|
|
141
|
+
chain = (
|
|
142
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
|
|
143
|
+
.limit(5)
|
|
144
|
+
.settings(cache=True, parallel=5)
|
|
145
|
+
.map(
|
|
146
|
+
mistral_response=lambda file: MistralClient(api_key=api_key)
|
|
147
|
+
.chat(
|
|
148
|
+
model=model,
|
|
149
|
+
response_format={"type": "json_object"},
|
|
150
|
+
messages=[
|
|
151
|
+
ChatMessage(role="user", content=f"{PROMPT}: {file.get_value()}")
|
|
152
|
+
],
|
|
153
|
+
)
|
|
154
|
+
.choices[0]
|
|
155
|
+
.message.content,
|
|
156
|
+
)
|
|
157
|
+
.save()
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
try:
|
|
161
|
+
print(chain.select("mistral_response").results())
|
|
162
|
+
except Exception as e:
|
|
163
|
+
print(f"do you have the right Mistral API key? {e}")
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
.. code:: shell
|
|
167
|
+
|
|
168
|
+
[('{"result": "Yes"}',), ('{"result": "No"}',), ... , ('{"result": "Yes"}',)]
|
|
169
|
+
|
|
170
|
+
Now we have parallel-processed an LLM API-based query over cloud data and persisted the results.
|
|
171
|
+
|
|
172
|
+
Vectorized analytics
|
|
173
|
+
--------------------
|
|
174
|
+
|
|
175
|
+
DataChain internally represents datasets as tables, so analytical queries on the chain are automatically vectorized:
|
|
176
|
+
|
|
177
|
+
.. code:: py
|
|
178
|
+
|
|
179
|
+
failed_dialogs = chain.filter(Column("mistral_response") == '{"result": "No"}')
|
|
180
|
+
failure_rate = failed_dialogs.count() / chain.count()
|
|
181
|
+
print(f"Chatbot dialog failure rate: {100*failure_rate:.2f}%")
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
.. code:: shell
|
|
185
|
+
|
|
186
|
+
"40.00%"
|
|
187
|
+
|
|
188
|
+
Note that DataChain represents file samples as pointers into their respective storage locations. This means a newly created dataset version does not duplicate files in storage, and storage remains the single source of truth for the original samples.
|
|
189
|
+
|
|
190
|
+
Handling Python objects
|
|
191
|
+
-----------------------
|
|
192
|
+
In addition to storing primitive Python data types, chain is also capable of using data models.
|
|
193
|
+
|
|
194
|
+
For example, instead of collecting just a text response from Mistral API, we might be interested in more fields of the Mistral response object. For this task, we can define a Pydantic-like model and populate it from the API replies:
|
|
195
|
+
|
|
196
|
+
.. code:: py
|
|
197
|
+
|
|
198
|
+
import os
|
|
199
|
+
|
|
200
|
+
from mistralai.client import MistralClient
|
|
201
|
+
from mistralai.models.chat_completion import ChatMessage
|
|
202
|
+
|
|
203
|
+
from datachain.lib.dc import DataChain
|
|
204
|
+
from datachain.lib.feature import Feature
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
PROMPT = (
|
|
208
|
+
"Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
model = "mistral-large-latest"
|
|
212
|
+
api_key = os.environ["MISTRAL_API_KEY"]
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
## define the data model ###
|
|
216
|
+
class Usage(Feature):
|
|
217
|
+
prompt_tokens: int = 0
|
|
218
|
+
completion_tokens: int = 0
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class MyChatMessage(Feature):
|
|
222
|
+
role: str = ""
|
|
223
|
+
content: str = ""
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
class CompletionResponseChoice(Feature):
|
|
227
|
+
message: MyChatMessage = MyChatMessage()
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class MistralModel(Feature):
|
|
231
|
+
id: str = ""
|
|
232
|
+
choices: list[CompletionResponseChoice]
|
|
233
|
+
usage: Usage = Usage()
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
## Populate model instances ###
|
|
237
|
+
chain = (
|
|
238
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
|
|
239
|
+
.limit(5)
|
|
240
|
+
.settings(cache=True, parallel=5)
|
|
241
|
+
.map(
|
|
242
|
+
mistral_response=lambda file: MistralModel(
|
|
243
|
+
**MistralClient(api_key=api_key)
|
|
244
|
+
.chat(
|
|
245
|
+
model=model,
|
|
246
|
+
response_format={"type": "json_object"},
|
|
247
|
+
messages=[
|
|
248
|
+
ChatMessage(role="user", content=f"{PROMPT}: {file.get_value()}")
|
|
249
|
+
],
|
|
250
|
+
)
|
|
251
|
+
.dict()
|
|
252
|
+
),
|
|
253
|
+
output=MistralModel,
|
|
254
|
+
)
|
|
255
|
+
.save("dialog-eval")
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
After the chain execution, we can collect the objects:
|
|
259
|
+
|
|
260
|
+
.. code:: py
|
|
261
|
+
|
|
262
|
+
for obj in responses:
|
|
263
|
+
assert isinstance(obj, MistralModel)
|
|
264
|
+
print(obj.dict())
|
|
265
|
+
|
|
266
|
+
.. code:: shell
|
|
267
|
+
|
|
268
|
+
{'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 610, 'completion_tokens': 6}}
|
|
269
|
+
{'choices': [{'message': {'role': 'assistant', 'content': '{"result": "No"}'}}], 'usage': {'prompt_tokens': 3983, 'completion_tokens': 6}}
|
|
270
|
+
{'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 706, 'completion_tokens': 6}}
|
|
271
|
+
{'choices': [{'message': {'role': 'assistant', 'content': '{"result": "No"}'}}], 'usage': {'prompt_tokens': 1250, 'completion_tokens': 6}}
|
|
272
|
+
{'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 1217, 'completion_tokens': 6}}
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
Dataset persistence
|
|
276
|
+
--------------------
|
|
277
|
+
|
|
278
|
+
The “save” operation makes a chain dataset persistent in the current (working) directory of the query. A hidden folder .datachain/ holds the records. A persistent dataset can be accessed later to start a derivative chain:
|
|
279
|
+
|
|
280
|
+
.. code:: py
|
|
281
|
+
|
|
282
|
+
DataChain.from_dataset("dialog-eval").limit(2).save("dialog-eval")
|
|
283
|
+
|
|
284
|
+
Persistent datasets are immutable and automatically versioned. Versions can be listed from shell:
|
|
285
|
+
|
|
286
|
+
.. code:: shell
|
|
287
|
+
|
|
288
|
+
$ datachain ls-datasets
|
|
289
|
+
|
|
290
|
+
dialog-eval (v1)
|
|
291
|
+
dialog-eval (v2)
|
|
292
|
+
|
|
293
|
+
By default, when a persistent dataset is loaded, the latest version is fetched, but another version can be requested:
|
|
294
|
+
|
|
295
|
+
.. code:: py
|
|
296
|
+
|
|
297
|
+
ds = DataChain.from_dataset("dialog-eval", version = 1)
|
|
298
|
+
|
|
299
|
+
Chain optimization and execution
|
|
300
|
+
--------------------------------
|
|
301
|
+
|
|
302
|
+
DataChain avoids redundant operations. Execution is triggered only when a downstream operation requests the processed results. However, it would be inefficient to run, say, LLM queries again every time you just want to collect several objects.
|
|
303
|
+
|
|
304
|
+
The “save” operation pins execution results and automatically reuses them every time downstream functions ask for data. Saving without an explicit name generates an auto-named dataset, which serves the same purpose.
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
Matching data with metadata
|
|
308
|
+
----------------------------
|
|
309
|
+
It is common for AI data to come with pre-computed metadata (annotations, classes, etc.).
|
|
310
|
+
|
|
311
|
+
The DataChain library understands common metadata formats (JSON, CSV, and Parquet), and can unite data samples from storage with side-loaded metadata. The schema for metadata can be set explicitly or be inferred.
|
|
312
|
+
|
|
313
|
+
Here is an example of reading a CSV file where the schema is heuristically derived from the header:
|
|
314
|
+
|
|
315
|
+
.. code:: py
|
|
316
|
+
|
|
317
|
+
from datachain.lib.dc import DataChain
|
|
318
|
+
csv_dataset = DataChain.from_csv("gs://datachain-demo/chatbot-csv/")
|
|
319
|
+
|
|
320
|
+
print(csv_dataset.to_pandas())
|
|
321
|
+
|
|
322
|
+
Reading metadata from JSON format is a more complicated scenario because a JSON-annotated dataset typically references data samples (e.g. images) in annotation arrays somewhere within JSON files.
|
|
323
|
+
|
|
324
|
+
Here is an example from the MS COCO “captions” JSON, which employs separate sections for image meta and captions:
|
|
325
|
+
|
|
326
|
+
.. code:: json
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
{
|
|
330
|
+
"images": [
|
|
331
|
+
{
|
|
332
|
+
"license": 4,
|
|
333
|
+
"file_name": "000000397133.jpg",
|
|
334
|
+
"coco_url": "http://images.cocodataset.org/val2017/000000397133.jpg",
|
|
335
|
+
"height": 427,
|
|
336
|
+
"width": 640,
|
|
337
|
+
"date_captured": "2013-11-14 17:02:52",
|
|
338
|
+
"flickr_url": "http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg",
|
|
339
|
+
"id": 397133
|
|
340
|
+
},
|
|
341
|
+
...
|
|
342
|
+
],
|
|
343
|
+
"annotations": [
|
|
344
|
+
{
|
|
345
|
+
"image_id" : "179765",
|
|
346
|
+
"id" : 38,
|
|
347
|
+
"caption" : "A black Honda motorcycle parked in front of a garage."
|
|
348
|
+
},
|
|
349
|
+
...
|
|
350
|
+
],
|
|
351
|
+
...
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
To deal with this layout, we can take the following steps:
|
|
355
|
+
|
|
356
|
+
1. Generate a dataset of raw image files from storage
|
|
357
|
+
2. Generate a meta-information dataset from the JSON section “images”
|
|
358
|
+
3. Join these datasets via the matching file name keys
|
|
359
|
+
|
|
360
|
+
.. code:: python
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
from datachain.lib.dc import DataChain
|
|
364
|
+
|
|
365
|
+
images = DataChain.from_storage("gs://datachain-demo/coco2017/images/val/")
|
|
366
|
+
meta = DataChain.from_json("gs://datachain-demo/coco2017/annotations_captions", jmespath = "images")
|
|
367
|
+
|
|
368
|
+
images_with_meta = images.merge(meta, on="file.name", right_on="images.file_name")
|
|
369
|
+
|
|
370
|
+
print(images_with_meta.limit(1).results())
|
|
371
|
+
|
|
372
|
+
.. code:: shell
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
Processed: 5000 rows [00:00, 15481.66 rows/s]
|
|
376
|
+
Processed: 1 rows [00:00, 1291.75 rows/s]
|
|
377
|
+
Processed: 1 rows [00:00, 4.70 rows/s]
|
|
378
|
+
Generated: 5000 rows [00:00, 27128.67 rows/s]
|
|
379
|
+
[(1, 2336066478558845549, '', 0, 'coco2017/images/val', '000000000139.jpg', 'CNvXoemj8IYDEAE=', '1719096046021595', 1, datetime.datetime(2024, 6, 22, 22, 40, 46, 70000, tzinfo=datetime.timezone.utc), 161811, '', '', None, 'gs://datachain-demo', 'gs://datachain-demo', 'coco2017/images/val', '000000000139.jpg', 161811, '1719096046021595', 'CNvXoemj8IYDEAE=', 1, datetime.datetime(1970, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), None, '', 4146, 6967063844996569113, 2, '000000000139.jpg', 'http://images.cocodataset.org/val2017/000000000139.jpg', 426, 640, '2013-11-21 01:34:01', 'http://farm9.staticflickr.com/8035/8024364858_9c41dc1666_z.jpg', 139)]
|
|
380
|
+
|
|
381
|
+
Passing data to training
|
|
382
|
+
------------------------
|
|
383
|
+
|
|
384
|
+
Chain results can be exported or passed directly to a PyTorch dataloader. For example, if we are interested in passing three columns to training, the following PyTorch code will do it:
|
|
385
|
+
|
|
386
|
+
.. code:: py
|
|
387
|
+
|
|
388
|
+
ds = train.select("file", "caption_choices", "label_ind").to_pytorch(
|
|
389
|
+
transform=preprocess,
|
|
390
|
+
tokenizer=clip.tokenize,
|
|
391
|
+
)
|
|
392
|
+
|
|
393
|
+
loader = DataLoader(ds, batch_size=2)
|
|
394
|
+
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
|
|
395
|
+
train(loader, model, optimizer)
|
|
396
|
+
|
|
397
|
+
Tutorials
|
|
398
|
+
------------------
|
|
399
|
+
|
|
400
|
+
* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvclive/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
|
|
401
|
+
|
|
402
|
+
Contributions
|
|
403
|
+
--------------------
|
|
404
|
+
|
|
405
|
+
Contributions are very welcome.
|
|
406
|
+
To learn more, see the `Contributor Guide`_.
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
License
|
|
410
|
+
-------
|
|
411
|
+
|
|
412
|
+
Distributed under the terms of the `Apache 2.0 license`_,
|
|
413
|
+
*DataChain* is free and open source software.
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
Issues
|
|
417
|
+
------
|
|
418
|
+
|
|
419
|
+
If you encounter any problems,
|
|
420
|
+
please `file an issue`_ along with a detailed description.
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
.. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0
|
|
424
|
+
.. _PyPI: https://pypi.org/
|
|
425
|
+
.. _file an issue: https://github.com/iterative/dvcx/issues
|
|
426
|
+
.. _pip: https://pip.pypa.io/
|
|
427
|
+
.. github-only
|
|
428
|
+
.. _Contributor Guide: CONTRIBUTING.rst
|
|
429
|
+
.. _Pydantic: https://github.com/pydantic/pydantic
|