datachain 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datachain-0.3.1 → datachain-0.3.2}/.github/workflows/benchmarks.yml +1 -1
- {datachain-0.3.1 → datachain-0.3.2}/.github/workflows/tests.yml +29 -0
- {datachain-0.3.1 → datachain-0.3.2}/.pre-commit-config.yaml +1 -1
- {datachain-0.3.1/src/datachain.egg-info → datachain-0.3.2}/PKG-INFO +74 -86
- {datachain-0.3.1 → datachain-0.3.2}/README.rst +64 -85
- {datachain-0.3.1 → datachain-0.3.2}/docs/index.md +5 -6
- {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/common_sql_functions.py +13 -11
- {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/torch-loader.py +3 -2
- {datachain-0.3.1 → datachain-0.3.2}/examples/llm_and_nlp/unstructured-text.py +15 -15
- {datachain-0.3.1 → datachain-0.3.2}/examples/multimodal/hf_pipeline.py +28 -19
- {datachain-0.3.1 → datachain-0.3.2}/examples/multimodal/wds.py +17 -6
- {datachain-0.3.1 → datachain-0.3.2}/examples/multimodal/wds_filtered.py +4 -2
- {datachain-0.3.1 → datachain-0.3.2}/noxfile.py +11 -0
- {datachain-0.3.1 → datachain-0.3.2}/pyproject.toml +17 -2
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/catalog/catalog.py +10 -1
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/schema.py +22 -8
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/sqlite.py +5 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/dc.py +27 -13
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/meta_formats.py +8 -2
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/node.py +1 -1
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/schema.py +4 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/default/base.py +3 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/sqlite/base.py +3 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/types.py +120 -11
- {datachain-0.3.1 → datachain-0.3.2/src/datachain.egg-info}/PKG-INFO +74 -86
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain.egg-info/SOURCES.txt +2 -3
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain.egg-info/requires.txt +10 -0
- datachain-0.3.2/tests/examples/test_examples.py +96 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_datachain.py +20 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_dataset_query.py +17 -38
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_datachain.py +91 -1
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_datachain_merge.py +8 -7
- datachain-0.3.2/tests/unit/lib/test_schema.py +22 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_data_storage.py +50 -1
- datachain-0.3.1/examples/get_started/json-metadata-tutorial.ipynb +0 -2020
- datachain-0.3.1/examples/llm/llm_chatbot_evaluation.ipynb +0 -683
- datachain-0.3.1/examples/multimodal/clip_fine_tuning.ipynb +0 -1948
- {datachain-0.3.1 → datachain-0.3.2}/.cruft.json +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/.gitattributes +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/.github/codecov.yaml +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/.github/dependabot.yml +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/.github/workflows/release.yml +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/.gitignore +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/LICENSE +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/docs/assets/datachain.png +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/docs/references/datachain.md +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/docs/references/datatype.md +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/docs/references/file.md +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/docs/references/index.md +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/docs/references/sql.md +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/docs/references/torch.md +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/docs/references/udf.md +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/README.md +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/llm_and_nlp/llm-claude.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/mkdocs.yml +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/setup.cfg +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/__main__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/asyn.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/cache.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/cli.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/local.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/config.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/dataset.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/error.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/job.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/file.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/listing.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/progress.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/py.typed +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/builtins.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/dataset.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/params.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/session.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/storage.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain/utils.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/conftest.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/data.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/examples/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/func/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_catalog.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_client.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_datasets.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_ls.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_pull.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_query.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_client.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_session.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_udf.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.1 → datachain-0.3.2}/tests/utils.py +0 -0
{datachain-0.3.1 → datachain-0.3.2}/.github/workflows/tests.yml

@@ -199,3 +199,32 @@ jobs:
           --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
           tests ../datachain/tests
         working-directory: backend/datachain_server
+
+
+  examples:
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest-16-cores, macos-latest, windows-latest-8-cores]
+        pyv: ['3.9', '3.12']
+        group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
+    steps:
+
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.pyv }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.pyv }}
+          cache: 'pip'
+
+      - name: Upgrade nox and uv
+        run: |
+          python -m pip install --upgrade 'nox[uv]'
+          nox --version
+          uv --version
+
+      - name: Run examples
+        run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
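The new CI job above drives the example scripts through a nox session named `examples`. The matching noxfile.py change (+11 -0) is listed in this release but its body is not shown in this diff, so the following is only a hedged sketch of what such a session could look like; the install target and pytest arguments are assumptions inferred from the CI invocation above and from the new "examples" extra declared in PKG-INFO below.

```python
# Hypothetical sketch of the "examples" nox session invoked by the CI job above.
# The actual noxfile.py change is not shown in this diff, so the install target
# and pytest arguments are assumptions.
import nox


@nox.session(python=["3.9", "3.12"])
def examples(session: nox.Session) -> None:
    # Install datachain together with the new "examples" extra from PKG-INFO.
    session.install(".[examples]")
    # CI forwards a marker expression after "--", e.g. -m "get_started",
    # which nox exposes as session.posargs.
    session.run("pytest", "-rA", "tests/examples", *session.posargs)
```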
{datachain-0.3.1/src/datachain.egg-info → datachain-0.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.1
+Version: 0.3.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -81,6 +81,15 @@ Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
 Requires-Dist: types-requests; extra == "dev"
+Provides-Extra: examples
+Requires-Dist: datachain[tests]; extra == "examples"
+Requires-Dist: numpy<2,>=1; extra == "examples"
+Requires-Dist: defusedxml; extra == "examples"
+Requires-Dist: accelerate; extra == "examples"
+Requires-Dist: unstructured[pdf]; extra == "examples"
+Requires-Dist: pdfplumber==0.11.3; extra == "examples"
+Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
+Requires-Dist: nltk==3.8.1; extra == "examples"

 |PyPI| |Python Version| |Codecov| |Tests|

@@ -100,102 +109,78 @@ Requires-Dist: types-requests; extra == "dev"
 AI 🔗 DataChain
 ----------------

-DataChain is a data-frame library designed for
-
-
+DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
+It is made to organize your unstructured data into datasets and wrangle it at scale on
+your local machine.

-
-
-Local), version and update datasets.
+Key Features
+============

-
-
-
+📂 **Storage as a Source of Truth.**
+   - Process unstructured data without redundant copies: S3, GCP, Azure, and local
+     file systems.
+   - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
+   - Join files and metadata together into persistent, versioned, columnar datasets.

-
-
+🐍 **Python-friendly data pipelines.**
+   - Operate on Python objects and object fields.
+   - Built-in parallelization and out-of-memory compute without a need in SQL or
+     Spark jobs.

-
-
-
+🧠 **Data Enrichment and Processing.**
+   - Generate metadata columns using local AI models and LLM APIs.
+   - Filter, join, and group by AI metadata. Vector similarity search.
+   - Pass datasets to Pytorch and Tensorflow, or export back into storage.

-
-
-
+🚀 **Efficiency.**
+   - Parallelization, out-of-memory workloads and data caching.
+   - Vectorized operations on Python object fields: sum, count, avg, etc.
+   - Vector search on embeddings.


+Quick Start
+-----------
+
 .. code:: console

    $ pip install datachain


-
-
-
-DataChain introduces expressive data structures tailored for AI-specific workload:
-
-- **Dataset:** Preserves the file-references and meta-information. Takes care of Python
-  object serialization, dataset versioning and difference. Operations on dataset:
-
-- **Transformations:** traditional data-frame or SQL operations such as filtering,
-  grouping, joining.
-- **Enrichments:** mapping, aggregating and generating using customer’s Python
-  code. This is needed to work with ML inference and LLM calls.
-
-- **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
-  mode - only when needed.
-
-DataChain name comes from these major data structures: dataset and chaining.
-
+Selecting files using JSON metadata
+======================================

-
-
+A storage consists of images of cats and dogs (`dog.1048.jpg`, `cat.1009.jpg`),
+annotated with ground truth and model inferences in the 'json-pairs' format,
+where each image has a matching JSON file like `cat.1009.json`:

-
-use-cases and at the same time to fit it into traditional data infrastructure.
+.. code:: json

-
-
-
-
-needed for distributed computations.
-- **Resuming data processing** (in development). Introduces idempotent operations,
-  allowing data processing to resume from the last successful process file/record/batch
-  if it fails due to issues like failed LLM calls, ML inference or file download.
+   {
+       "class": "cat", "id": "1009", "num_annotators": 8,
+       "inference": {"class": "dog", "confidence": 0.68}
+   }

-
+Example of downloading only high-confidence cat images using JSON metadata:

-- **Functional style data processing.** Using a functional/chaining approach to data
-  processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
-- **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
-  and implements data versioning, extending ideas from DVC (developed by the same team).

+.. code:: py

-
-======================
-
-- **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
-  `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
-  version.
-- **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
-  it delegates heavy data transformations to underlying data warehouses and focuses on
-  AI specific data enrichments and orchestrating all the pieces together.
-
+   from datachain import Column, DataChain

-
-
+   meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
+   images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")

-
-
+   images_id = images.map(id=lambda file: file.path.split('.')[-2])
+   annotated = images_id.merge(meta, on="id", right_on="meta.id")

-
-
-
-Our goal is to identify the successful dialogs.
+   likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
+                                  & (Column("meta.inference.class_") == "cat"))
+   likely_cats.export_files("high-confidence-cats/", signal="file")

-The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.

-
+Data curation with a local AI model
+===================================
+Batch inference with a simple sentiment model using the `transformers` library:

 .. code:: shell

@@ -246,30 +231,30 @@ LLM judging chatbots
 =============================

 LLMs can work as efficient universal classifiers. In the example below,
-we employ a free API from Mistral to judge the chatbot
+we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai

+
 .. code:: shell

-   $ pip install mistralai
+   $ pip install mistralai (Requires version >=1.0.0)
    $ export MISTRAL_API_KEY=_your_key_

 DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.

 .. code:: py

-   from mistralai
-   from mistralai.models.chat_completion import ChatMessage
+   from mistralai import Mistral
    from datachain import File, DataChain, Column

    PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."

    def eval_dialogue(file: File) -> bool:
-       client =
-       response = client.chat(
+       client = Mistral()
+       response = client.chat.complete(
            model="open-mixtral-8x22b",
-           messages=[
-
+           messages=[{"role": "system", "content": PROMPT},
+                     {"role": "user", "content": file.read()}])
        result = response.choices[0].message.content
        return result.lower().startswith("success")

@@ -309,8 +294,8 @@ Instead of extracting this information from the Mistral response data structure

 .. code:: py

-   from mistralai
-   from mistralai.models
+   from mistralai import Mistral
+   from mistralai.models import ChatCompletionResponse
    from datachain import File, DataChain, Column

    PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
@@ -319,8 +304,8 @@ Instead of extracting this information from the Mistral response data structure
        client = MistralClient()
        return client.chat(
            model="open-mixtral-8x22b",
-           messages=[
-
+           messages=[{"role": "system", "content": PROMPT},
+                     {"role": "user", "content": file.read()}])

    chain = (
        DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
@@ -438,7 +423,10 @@ Tutorials
 ---------

 * `Getting Started`_
-* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/
+* `Multimodal <https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`__)
+* `LLM evaluations <https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`__)
+* `Reading JSON metadata <https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`__)
+

 Contributions
 -------------
{datachain-0.3.1 → datachain-0.3.2}/README.rst

@@ -16,102 +16,78 @@
 AI 🔗 DataChain
 ----------------

-DataChain is a data-frame library designed for
-
-
+DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
+It is made to organize your unstructured data into datasets and wrangle it at scale on
+your local machine.

-
-
-Local), version and update datasets.
+Key Features
+============

-
-
-
+📂 **Storage as a Source of Truth.**
+   - Process unstructured data without redundant copies: S3, GCP, Azure, and local
+     file systems.
+   - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
+   - Join files and metadata together into persistent, versioned, columnar datasets.

-
-
+🐍 **Python-friendly data pipelines.**
+   - Operate on Python objects and object fields.
+   - Built-in parallelization and out-of-memory compute without a need in SQL or
+     Spark jobs.

-
-
-
+🧠 **Data Enrichment and Processing.**
+   - Generate metadata columns using local AI models and LLM APIs.
+   - Filter, join, and group by AI metadata. Vector similarity search.
+   - Pass datasets to Pytorch and Tensorflow, or export back into storage.

-
-
-
+🚀 **Efficiency.**
+   - Parallelization, out-of-memory workloads and data caching.
+   - Vectorized operations on Python object fields: sum, count, avg, etc.
+   - Vector search on embeddings.


+Quick Start
+-----------
+
 .. code:: console

    $ pip install datachain


-
-
-
-DataChain introduces expressive data structures tailored for AI-specific workload:
-
-- **Dataset:** Preserves the file-references and meta-information. Takes care of Python
-  object serialization, dataset versioning and difference. Operations on dataset:
-
-- **Transformations:** traditional data-frame or SQL operations such as filtering,
-  grouping, joining.
-- **Enrichments:** mapping, aggregating and generating using customer’s Python
-  code. This is needed to work with ML inference and LLM calls.
-
-- **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
-  mode - only when needed.
-
-DataChain name comes from these major data structures: dataset and chaining.
-
+Selecting files using JSON metadata
+======================================

-
-
+A storage consists of images of cats and dogs (`dog.1048.jpg`, `cat.1009.jpg`),
+annotated with ground truth and model inferences in the 'json-pairs' format,
+where each image has a matching JSON file like `cat.1009.json`:

-
-use-cases and at the same time to fit it into traditional data infrastructure.
+.. code:: json

-
-
-
-
-needed for distributed computations.
-- **Resuming data processing** (in development). Introduces idempotent operations,
-  allowing data processing to resume from the last successful process file/record/batch
-  if it fails due to issues like failed LLM calls, ML inference or file download.
+   {
+       "class": "cat", "id": "1009", "num_annotators": 8,
+       "inference": {"class": "dog", "confidence": 0.68}
+   }

-
+Example of downloading only high-confidence cat images using JSON metadata:

-- **Functional style data processing.** Using a functional/chaining approach to data
-  processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
-- **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
-  and implements data versioning, extending ideas from DVC (developed by the same team).

+.. code:: py

-
-======================
-
-- **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
-  `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
-  version.
-- **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
-  it delegates heavy data transformations to underlying data warehouses and focuses on
-  AI specific data enrichments and orchestrating all the pieces together.
-
+   from datachain import Column, DataChain

-
-
+   meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
+   images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")

-
-
+   images_id = images.map(id=lambda file: file.path.split('.')[-2])
+   annotated = images_id.merge(meta, on="id", right_on="meta.id")

-
-
-
-Our goal is to identify the successful dialogs.
+   likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
+                                  & (Column("meta.inference.class_") == "cat"))
+   likely_cats.export_files("high-confidence-cats/", signal="file")

-The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.

-
+Data curation with a local AI model
+===================================
+Batch inference with a simple sentiment model using the `transformers` library:

 .. code:: shell

@@ -162,30 +138,30 @@ LLM judging chatbots
 =============================

 LLMs can work as efficient universal classifiers. In the example below,
-we employ a free API from Mistral to judge the chatbot
+we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai

+
 .. code:: shell

-   $ pip install mistralai
+   $ pip install mistralai (Requires version >=1.0.0)
    $ export MISTRAL_API_KEY=_your_key_

 DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.

 .. code:: py

-   from mistralai
-   from mistralai.models.chat_completion import ChatMessage
+   from mistralai import Mistral
    from datachain import File, DataChain, Column

    PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."

    def eval_dialogue(file: File) -> bool:
-       client =
-       response = client.chat(
+       client = Mistral()
+       response = client.chat.complete(
            model="open-mixtral-8x22b",
-           messages=[
-
+           messages=[{"role": "system", "content": PROMPT},
+                     {"role": "user", "content": file.read()}])
        result = response.choices[0].message.content
        return result.lower().startswith("success")

@@ -225,8 +201,8 @@ Instead of extracting this information from the Mistral response data structure

 .. code:: py

-   from mistralai
-   from mistralai.models
+   from mistralai import Mistral
+   from mistralai.models import ChatCompletionResponse
    from datachain import File, DataChain, Column

    PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
@@ -235,8 +211,8 @@ Instead of extracting this information from the Mistral response data structure
        client = MistralClient()
        return client.chat(
            model="open-mixtral-8x22b",
-           messages=[
-
+           messages=[{"role": "system", "content": PROMPT},
+                     {"role": "user", "content": file.read()}])

    chain = (
        DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
@@ -354,7 +330,10 @@ Tutorials
 ---------

 * `Getting Started`_
-* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/
+* `Multimodal <https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`__)
+* `LLM evaluations <https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`__)
+* `Reading JSON metadata <https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`__)
+

 Contributions
 -------------
{datachain-0.3.1 → datachain-0.3.2}/docs/index.md

@@ -24,8 +24,7 @@ For example, let us consider the New Yorker Cartoon caption contest dataset, whe
 # pip install transformers
 #

-from datachain.lib.dc import Column, DataChain
-from datachain.lib.file import File
+from datachain.lib.dc import Column, DataChain, File
 from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

 images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
@@ -75,7 +74,7 @@ plt.show()

 If interested to see more multimodal examples for DataChain, please follow this tutorial:

-[https://github.com/iterative/datachain/blob/main/
+[https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb)

 ### Handling Python objects

@@ -134,7 +133,7 @@ chain = (

 If you are interested in more LLM evaluation examples for DataChain, please follow this tutorial:

-[https://github.com/iterative/datachain/blob/main/
+[https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb](https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb)

 ### Vectorized analytics

@@ -280,7 +279,7 @@ images_with_dogs.select("annotations", "file.name").show()
 ```
 For in-depth review of working with JSON metadata, please follow this tutorial:

-[https://github.com/iterative/datachain/blob/main/
+[https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb](https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb)

 ### Passing data to training

@@ -300,4 +299,4 @@ train(loader, model, optimizer)

 See a larger example for CLIP fine-tuning here:

-[https://github.com/iterative/datachain/blob/main/
+[https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb)
{datachain-0.3.1 → datachain-0.3.2}/examples/get_started/common_sql_functions.py

@@ -10,13 +10,13 @@ def num_chars_udf(file):
     return ([],)


-
-
+dc = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/")
+dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
     "file.path", "num_chars"
 ).show(5)

 (
-
+    dc.mutate(
         length=string.length(path.name(C("file.path"))),
         parts=string.split(path.name(C("file.path")), literal(".")),
     )
@@ -25,7 +25,7 @@ ds.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
 )

 (
-
+    dc.mutate(
         stem=path.file_stem(path.name(C("file.path"))),
         ext=path.file_ext(path.name(C("file.path"))),
     )
@@ -33,14 +33,16 @@ ds.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
     .show(5)
 )

+
+chain = dc.mutate(
+    a=array.length(string.split(C("file.path"), literal("/"))),
+    b=array.length(string.split(path.name(C("file.path")), literal("0"))),
+)
+
 (
-
-
-
-    )
-    .mutate(
-        greatest=greatest(C("a"), C("b")),
-        least=least(C("a"), C("b")),
+    chain.mutate(
+        greatest=greatest(chain.column("a"), C("b")),
+        least=least(chain.column("a"), C("b")),
     )
     .select("a", "b", "greatest", "least")
     .show(10)
{datachain-0.3.1 → datachain-0.3.2}/examples/get_started/torch-loader.py

@@ -1,5 +1,6 @@
 # pip install Pillow torchvision

+import os
 from posixpath import basename

 import torch
@@ -11,6 +12,7 @@ from datachain import C, DataChain
 from datachain.torch import label_to_int

 STORAGE = "gs://datachain-demo/dogs-and-cats/"
+NUM_EPOCHS = os.getenv("NUM_EPOCHS", "3")

 # Define transformation for data preprocessing
 transform = v2.Compose(
@@ -66,8 +68,7 @@ if __name__ == "__main__":
     optimizer = optim.Adam(model.parameters(), lr=0.001)

     # Train the model
-
-    for epoch in range(num_epochs):
+    for epoch in range(int(NUM_EPOCHS)):
         for i, data in enumerate(train_loader):
             inputs, labels = data
             optimizer.zero_grad()
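The torch-loader change above makes the epoch count configurable through the environment, which presumably lets the new `examples` CI job keep training runs short. A minimal sketch of the same pattern in isolation; the `run_training` / `train_one_epoch` names below are hypothetical stand-ins, not part of the example script.

```python
# Minimal sketch of the NUM_EPOCHS pattern introduced above: the value is read
# from the environment as a string with a default of "3" and converted to int
# at the point of use, mirroring range(int(NUM_EPOCHS)) in the diff.
import os

NUM_EPOCHS = os.getenv("NUM_EPOCHS", "3")


def run_training(train_one_epoch) -> None:
    # train_one_epoch is a hypothetical callable standing in for the loop body
    # of examples/get_started/torch-loader.py.
    for epoch in range(int(NUM_EPOCHS)):
        train_one_epoch(epoch)


if __name__ == "__main__":
    run_training(lambda epoch: print(f"epoch {epoch}"))
```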