datachain 0.2.10__tar.gz → 0.2.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.2.10 → datachain-0.2.12}/.github/workflows/tests.yml +1 -1
- {datachain-0.2.10 → datachain-0.2.12}/CONTRIBUTING.rst +3 -3
- datachain-0.2.12/PKG-INFO +412 -0
- datachain-0.2.12/README.rst +328 -0
- datachain-0.2.12/docs/assets/captioned_cartoons.png +0 -0
- datachain-0.2.12/docs/assets/flowchart.png +0 -0
- datachain-0.2.12/docs/index.md +304 -0
- datachain-0.2.12/docs/references/datachain.md +18 -0
- datachain-0.2.12/docs/references/datatype.md +19 -0
- datachain-0.2.12/docs/references/file.md +22 -0
- datachain-0.2.12/docs/references/index.md +8 -0
- datachain-0.2.12/docs/references/sql.md +18 -0
- datachain-0.2.12/docs/references/torch.md +17 -0
- datachain-0.2.12/docs/references/udf.md +18 -0
- datachain-0.2.12/examples/computer_vision/blip2_image_desc_lib.py +102 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/.gitignore +1 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +775 -1217
- datachain-0.2.12/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +4083 -0
- datachain-0.2.12/examples/computer_vision/fashion_product_images/3-train-model.ipynb +1080 -0
- datachain-0.2.12/examples/computer_vision/fashion_product_images/4-inference.ipynb +754 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/README.md +1 -1
- datachain-0.2.12/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +44 -0
- datachain-0.2.12/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +49 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +10 -18
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +5 -7
- datachain-0.2.12/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +52 -0
- datachain-0.2.12/examples/computer_vision/fashion_product_images/src/train.py +143 -0
- datachain-0.2.12/examples/computer_vision/iptc_exif_xmp_lib.py +75 -0
- datachain-0.2.12/examples/computer_vision/llava2_image_desc_lib.py +82 -0
- datachain-0.2.12/examples/computer_vision/openimage-detect.py +63 -0
- datachain-0.2.12/examples/get_started/common_sql_functions.py +93 -0
- {datachain-0.2.10/examples → datachain-0.2.12/examples/get_started}/json-csv-reader.py +14 -31
- {datachain-0.2.10/examples → datachain-0.2.12/examples/get_started}/torch-loader.py +9 -5
- datachain-0.2.12/examples/get_started/udfs/parallel.py +39 -0
- datachain-0.2.12/examples/get_started/udfs/simple.py +19 -0
- datachain-0.2.12/examples/get_started/udfs/stateful.py +43 -0
- {datachain-0.2.10/examples → datachain-0.2.12/examples/llm_and_nlp}/llm-claude-aggregate-query.py +3 -5
- {datachain-0.2.10/examples → datachain-0.2.12/examples/llm_and_nlp}/llm-claude-simple-query.py +10 -5
- {datachain-0.2.10/examples → datachain-0.2.12/examples/llm_and_nlp}/llm-claude.py +2 -5
- datachain-0.2.12/examples/llm_and_nlp/unstructured-text.py +63 -0
- {datachain-0.2.10/examples → datachain-0.2.12/examples/multimodal}/clip.py +6 -6
- {datachain-0.2.10 → datachain-0.2.12}/examples/multimodal/clip_fine_tuning.ipynb +532 -277
- datachain-0.2.12/examples/multimodal/hf_pipeline.py +124 -0
- datachain-0.2.12/examples/multimodal/openai_image_desc_lib.py +95 -0
- {datachain-0.2.10/examples → datachain-0.2.12/examples/multimodal}/wds.py +6 -6
- datachain-0.2.12/examples/multimodal/wds_filtered.py +38 -0
- {datachain-0.2.10 → datachain-0.2.12}/mkdocs.yml +10 -5
- {datachain-0.2.10 → datachain-0.2.12}/pyproject.toml +14 -5
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/__init__.py +3 -4
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/cache.py +10 -4
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/catalog/catalog.py +35 -15
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/cli.py +37 -32
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/metastore.py +24 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/warehouse.py +3 -1
- datachain-0.2.12/src/datachain/job.py +56 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/arrow.py +19 -7
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/clip.py +89 -66
- datachain-0.2.10/src/datachain/lib/convert/type_converter.py → datachain-0.2.12/src/datachain/lib/convert/python_to_sql.py +6 -6
- datachain-0.2.12/src/datachain/lib/convert/sql_to_python.py +23 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/convert/values_to_tuples.py +51 -33
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/data_model.py +6 -27
- datachain-0.2.12/src/datachain/lib/dataset_info.py +70 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/dc.py +646 -152
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/file.py +117 -15
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/image.py +1 -1
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/meta_formats.py +14 -2
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/model_store.py +3 -2
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/pytorch.py +10 -7
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/signal_schema.py +39 -14
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/text.py +2 -1
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/udf.py +56 -5
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/udf_signature.py +1 -1
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/webdataset.py +4 -3
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/node.py +11 -8
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/dataset.py +66 -147
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/dispatch.py +15 -13
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/schema.py +2 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/session.py +4 -4
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/array.py +12 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/string.py +8 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/torch/__init__.py +1 -1
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/utils.py +45 -0
- datachain-0.2.12/src/datachain.egg-info/PKG-INFO +412 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain.egg-info/SOURCES.txt +37 -48
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain.egg-info/requires.txt +2 -1
- {datachain-0.2.10 → datachain-0.2.12}/tests/examples/test_wds_e2e.py +5 -5
- {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_catalog.py +1 -1
- datachain-0.2.12/tests/func/test_datachain.py +217 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_dataset_query.py +156 -123
- datachain-0.2.12/tests/func/test_feature_pickling.py +209 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_query.py +2 -2
- {datachain-0.2.10 → datachain-0.2.12}/tests/scripts/feature_class_parallel.py +0 -1
- datachain-0.2.12/tests/scripts/feature_class_parallel_data_model.py +28 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/scripts/name_len_normal.py +1 -1
- {datachain-0.2.10 → datachain-0.2.12}/tests/test_query_e2e.py +57 -16
- datachain-0.2.12/tests/unit/lib/conftest.py +72 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_arrow.py +17 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_clip.py +2 -4
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_datachain.py +208 -77
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_datachain_bootstrap.py +5 -5
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_datachain_merge.py +14 -8
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_feature.py +1 -1
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_feature_utils.py +2 -2
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_file.py +115 -2
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_image.py +4 -5
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_signal_schema.py +39 -13
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_text.py +6 -8
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_utils.py +4 -4
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_catalog.py +13 -13
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_module_exports.py +0 -4
- datachain-0.2.10/.reuse/dep5 +0 -8
- datachain-0.2.10/LICENSES/Apache-2.0.txt +0 -73
- datachain-0.2.10/LICENSES/BSD-3-Clause.txt +0 -11
- datachain-0.2.10/LICENSES/Python-2.0.txt +0 -72
- datachain-0.2.10/PKG-INFO +0 -430
- datachain-0.2.10/README.rst +0 -347
- datachain-0.2.10/docs/index.md +0 -3
- datachain-0.2.10/docs/references/catalog.md +0 -3
- datachain-0.2.10/docs/references/datachain.md +0 -3
- datachain-0.2.10/examples/blip2_image_desc_lib.py +0 -35
- datachain-0.2.10/examples/common_sql_functions.py +0 -78
- datachain-0.2.10/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -3589
- datachain-0.2.10/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -91
- datachain-0.2.10/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -51
- datachain-0.2.10/examples/hf_pipeline.py +0 -98
- datachain-0.2.10/examples/iptc_exif_xmp_lib.py +0 -15
- datachain-0.2.10/examples/llava2_image_desc_lib.py +0 -43
- datachain-0.2.10/examples/loader.py +0 -31
- datachain-0.2.10/examples/neurips/README +0 -18
- datachain-0.2.10/examples/neurips/distance_to_query.py +0 -29
- datachain-0.2.10/examples/neurips/llm_chat.py +0 -46
- datachain-0.2.10/examples/neurips/requirements.txt +0 -9
- datachain-0.2.10/examples/neurips/single_query.py +0 -119
- datachain-0.2.10/examples/neurips/text_loaders.py +0 -80
- datachain-0.2.10/examples/openai_image_desc_lib.py +0 -29
- datachain-0.2.10/examples/openimage-detect.py +0 -72
- datachain-0.2.10/examples/pose_detection.py +0 -219
- datachain-0.2.10/examples/udfs/batching.py +0 -34
- datachain-0.2.10/examples/udfs/image_transformation.py +0 -45
- datachain-0.2.10/examples/udfs/parallel.py +0 -55
- datachain-0.2.10/examples/udfs/simple.py +0 -42
- datachain-0.2.10/examples/udfs/stateful.py +0 -44
- datachain-0.2.10/examples/udfs/stateful_similarity.py +0 -79
- datachain-0.2.10/examples/unstructured-text.py +0 -54
- datachain-0.2.10/examples/wds_filtered.py +0 -55
- datachain-0.2.10/examples/zalando/zalando_clip.py +0 -44
- datachain-0.2.10/examples/zalando/zalando_dir_as_class.py +0 -31
- datachain-0.2.10/examples/zalando/zalando_splits_and_classes_ds.py +0 -9
- datachain-0.2.10/examples/zalando/zalando_splits_and_classes_output.py +0 -17
- datachain-0.2.10/src/datachain/lib/feature_registry.py +0 -77
- datachain-0.2.10/src/datachain/lib/gpt4_vision.py +0 -97
- datachain-0.2.10/src/datachain/lib/hf_image_to_text.py +0 -97
- datachain-0.2.10/src/datachain/lib/hf_pipeline.py +0 -90
- datachain-0.2.10/src/datachain/lib/image_transform.py +0 -103
- datachain-0.2.10/src/datachain/lib/iptc_exif_xmp.py +0 -76
- datachain-0.2.10/src/datachain/lib/unstructured.py +0 -41
- datachain-0.2.10/src/datachain/text/__init__.py +0 -3
- datachain-0.2.10/src/datachain.egg-info/PKG-INFO +0 -430
- datachain-0.2.10/tests/func/test_datachain.py +0 -58
- datachain-0.2.10/tests/unit/lib/conftest.py +0 -21
- {datachain-0.2.10 → datachain-0.2.12}/.cruft.json +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/.gitattributes +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/.github/codecov.yaml +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/.github/dependabot.yml +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/.github/workflows/release.yml +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/.gitignore +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/.pre-commit-config.yaml +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/LICENSE +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/docs/assets/datachain.png +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/noxfile.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/setup.cfg +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/__main__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/asyn.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/cli_utils.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/azure.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/gcs.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/local.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/s3.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/config.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/dataset.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/error.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/settings.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/utils.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/listing.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/progress.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/py.typed +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/batch.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/builtins.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/metrics.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/params.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/udf.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/remote/studio.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/types.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/utils.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain/storage.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/conftest.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/data.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/examples/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/examples/wds_data.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/func/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_client.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_datasets.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_ls.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_pull.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_pytorch.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/scripts/feature_class.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/test_cli_e2e.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_asyn.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_cache.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_client.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_dataset.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_listing.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_metastore.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_query_params.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_serializer.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_session.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_storage.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_udf.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_utils.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.2.10 → datachain-0.2.12}/tests/utils.py +0 -0
|
@@ -191,7 +191,7 @@ jobs:
|
|
|
191
191
|
# Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
|
|
192
192
|
run: >
|
|
193
193
|
pytest
|
|
194
|
-
--config-file=pyproject.toml -
|
|
194
|
+
--config-file=pyproject.toml -rs
|
|
195
195
|
--splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
|
|
196
196
|
tests ../datachain/tests
|
|
197
197
|
working-directory: backend/datachain_server
|
|
@@ -13,9 +13,9 @@ Here is a list of important resources for contributors:
|
|
|
13
13
|
- `Code of Conduct`_
|
|
14
14
|
|
|
15
15
|
.. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0
|
|
16
|
-
.. _Source Code: https://github.com/iterative/
|
|
16
|
+
.. _Source Code: https://github.com/iterative/datachain
|
|
17
17
|
.. _Documentation: https://docs.dvc.ai/datachain
|
|
18
|
-
.. _Issue Tracker: https://github.com/iterative/
|
|
18
|
+
.. _Issue Tracker: https://github.com/iterative/datachain/issues
|
|
19
19
|
|
|
20
20
|
How to report a bug
|
|
21
21
|
-------------------
|
|
@@ -124,6 +124,6 @@ To run linting and code formatting checks, you can invoke a `lint` session in no
|
|
|
124
124
|
It is recommended to open an issue before starting work on anything.
|
|
125
125
|
This will allow a chance to talk it over with the owners and validate your approach.
|
|
126
126
|
|
|
127
|
-
.. _pull request: https://github.com/iterative/
|
|
127
|
+
.. _pull request: https://github.com/iterative/datachain/pulls
|
|
128
128
|
.. github-only
|
|
129
129
|
.. _Code of Conduct: CODE_OF_CONDUCT.rst
|
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: datachain
|
|
3
|
+
Version: 0.2.12
|
|
4
|
+
Summary: Wrangle unstructured AI data at scale
|
|
5
|
+
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Documentation, https://datachain.dvc.ai
|
|
8
|
+
Project-URL: Issues, https://github.com/iterative/datachain/issues
|
|
9
|
+
Project-URL: Source, https://github.com/iterative/datachain
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Description-Content-Type: text/x-rst
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pyyaml
|
|
20
|
+
Requires-Dist: tomlkit
|
|
21
|
+
Requires-Dist: tqdm
|
|
22
|
+
Requires-Dist: numpy
|
|
23
|
+
Requires-Dist: numpy<2,>=1; sys_platform == "win32"
|
|
24
|
+
Requires-Dist: pandas>=2.0.0
|
|
25
|
+
Requires-Dist: pyarrow
|
|
26
|
+
Requires-Dist: typing-extensions
|
|
27
|
+
Requires-Dist: python-dateutil>=2
|
|
28
|
+
Requires-Dist: attrs>=21.3.0
|
|
29
|
+
Requires-Dist: s3fs>=2024.2.0
|
|
30
|
+
Requires-Dist: gcsfs>=2024.2.0
|
|
31
|
+
Requires-Dist: adlfs>=2024.2.0
|
|
32
|
+
Requires-Dist: dvc-data<4,>=3.10
|
|
33
|
+
Requires-Dist: dvc-objects<6,>=4
|
|
34
|
+
Requires-Dist: shtab<2,>=1.3.4
|
|
35
|
+
Requires-Dist: sqlalchemy>=2
|
|
36
|
+
Requires-Dist: multiprocess==0.70.16
|
|
37
|
+
Requires-Dist: dill==0.3.8
|
|
38
|
+
Requires-Dist: cloudpickle
|
|
39
|
+
Requires-Dist: ujson>=5.9.0
|
|
40
|
+
Requires-Dist: pydantic<3,>=2
|
|
41
|
+
Requires-Dist: jmespath>=1.0
|
|
42
|
+
Requires-Dist: datamodel-code-generator>=0.25
|
|
43
|
+
Requires-Dist: Pillow<11,>=10.0.0
|
|
44
|
+
Provides-Extra: docs
|
|
45
|
+
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
46
|
+
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
47
|
+
Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
|
|
48
|
+
Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
|
|
49
|
+
Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
|
|
50
|
+
Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
|
|
51
|
+
Provides-Extra: torch
|
|
52
|
+
Requires-Dist: torch>=2.1.0; extra == "torch"
|
|
53
|
+
Requires-Dist: torchvision; extra == "torch"
|
|
54
|
+
Requires-Dist: transformers>=4.36.0; extra == "torch"
|
|
55
|
+
Provides-Extra: remote
|
|
56
|
+
Requires-Dist: lz4; extra == "remote"
|
|
57
|
+
Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
|
|
58
|
+
Requires-Dist: requests>=2.22.0; extra == "remote"
|
|
59
|
+
Provides-Extra: vector
|
|
60
|
+
Requires-Dist: usearch; extra == "vector"
|
|
61
|
+
Provides-Extra: tests
|
|
62
|
+
Requires-Dist: datachain[remote,torch,vector]; extra == "tests"
|
|
63
|
+
Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
64
|
+
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
65
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
66
|
+
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
67
|
+
Requires-Dist: pytest-servers[all]>=0.5.5; extra == "tests"
|
|
68
|
+
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
69
|
+
Requires-Dist: pytest-asyncio>=0.23.2; extra == "tests"
|
|
70
|
+
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
71
|
+
Requires-Dist: virtualenv; extra == "tests"
|
|
72
|
+
Requires-Dist: dulwich; extra == "tests"
|
|
73
|
+
Requires-Dist: hypothesis; extra == "tests"
|
|
74
|
+
Requires-Dist: open_clip_torch; extra == "tests"
|
|
75
|
+
Requires-Dist: aiotools>=1.7.0; extra == "tests"
|
|
76
|
+
Requires-Dist: requests-mock; extra == "tests"
|
|
77
|
+
Provides-Extra: dev
|
|
78
|
+
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
79
|
+
Requires-Dist: mypy==1.10.1; extra == "dev"
|
|
80
|
+
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
81
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
82
|
+
Requires-Dist: types-requests; extra == "dev"
|
|
83
|
+
Requires-Dist: types-ujson; extra == "dev"
|
|
84
|
+
|
|
85
|
+
|PyPI| |Python Version| |Codecov| |Tests|
|
|
86
|
+
|
|
87
|
+
.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
|
|
88
|
+
:target: https://pypi.org/project/datachain/
|
|
89
|
+
:alt: PyPI
|
|
90
|
+
.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
|
|
91
|
+
:target: https://pypi.org/project/datachain
|
|
92
|
+
:alt: Python Version
|
|
93
|
+
.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
|
|
94
|
+
:target: https://codecov.io/gh/iterative/datachain
|
|
95
|
+
:alt: Codecov
|
|
96
|
+
.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
|
|
97
|
+
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
98
|
+
:alt: Tests
|
|
99
|
+
|
|
100
|
+
AI 🔗 DataChain
|
|
101
|
+
----------------
|
|
102
|
+
|
|
103
|
+
DataChain is an open-source Python library for processing and curating unstructured
|
|
104
|
+
data at scale.
|
|
105
|
+
|
|
106
|
+
🤖 AI-Driven Data Curation: Use local ML models and LLM API calls to enrich your data.
|
|
107
|
+
|
|
108
|
+
🚀 GenAI Dataset scale: Handle tens of millions of files or file snippets.
|
|
109
|
+
|
|
110
|
+
🐍 Python-friendly: Use strictly typed `Pydantic`_ objects instead of JSON.
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
To ensure efficiency, DataChain supports parallel processing, parallel data
|
|
114
|
+
downloads, and out-of-memory computing. It excels at optimizing batch operations.
|
|
115
|
+
While most GenAI tools focus on online, real-time applications, DataChain is designed
|
|
116
|
+
for offline data processing, data curation and ETL.
|
|
117
|
+
|
|
118
|
+
The typical use cases are Computer Vision data curation, LLM analytics
|
|
119
|
+
and validation.
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
.. code:: console
|
|
123
|
+
|
|
124
|
+
$ pip install datachain
|
|
125
|
+
|
|
126
|
+
|Flowchart|
|
|
127
|
+
|
|
128
|
+
Quick Start
|
|
129
|
+
-----------
|
|
130
|
+
|
|
131
|
+
Basic evaluation
|
|
132
|
+
================
|
|
133
|
+
|
|
134
|
+
We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
|
|
135
|
+
- 50 files total in the example.
|
|
136
|
+
These dialogs involve users looking for better wireless plans chatting with a bot.
|
|
137
|
+
Our goal is to identify successful dialogs.
|
|
138
|
+
|
|
139
|
+
The data used in the examples is publicly available. Please feel free to run this code.
|
|
140
|
+
|
|
141
|
+
First, we'll use a simple sentiment analysis model. Please install transformers.
|
|
142
|
+
|
|
143
|
+
.. code:: shell
|
|
144
|
+
|
|
145
|
+
pip install transformers
|
|
146
|
+
|
|
147
|
+
The code below downloads files from the cloud and applies the function
|
|
148
|
+
`is_positive_dialogue_ending()` to each. All files with a positive sentiment
|
|
149
|
+
are copied to local directory `output/`.
|
|
150
|
+
|
|
151
|
+
.. code:: py
|
|
152
|
+
|
|
153
|
+
from transformers import pipeline
|
|
154
|
+
from datachain import DataChain, Column
|
|
155
|
+
|
|
156
|
+
classifier = pipeline("sentiment-analysis", device="cpu",
|
|
157
|
+
model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
|
|
158
|
+
|
|
159
|
+
def is_positive_dialogue_ending(file) -> bool:
|
|
160
|
+
dialogue_ending = file.read()[-512:]
|
|
161
|
+
return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
|
|
162
|
+
|
|
163
|
+
chain = (
|
|
164
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
|
|
165
|
+
object_name="file", type="text")
|
|
166
|
+
.settings(parallel=8, cache=True)
|
|
167
|
+
.map(is_positive=is_positive_dialogue_ending)
|
|
168
|
+
.save("file_response")
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
positive_chain = chain.filter(Column("is_positive") == True)
|
|
172
|
+
positive_chain.export_files("./output")
|
|
173
|
+
|
|
174
|
+
print(f"{positive_chain.count()} files were exported")
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
13 files were exported
|
|
179
|
+
|
|
180
|
+
.. code:: shell
|
|
181
|
+
|
|
182
|
+
$ ls output/datachain-demo/chatbot-KiT/
|
|
183
|
+
15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ...
|
|
184
|
+
$ ls output/datachain-demo/chatbot-KiT/ | wc -l
|
|
185
|
+
13
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
LLM judging LLMs dialogs
|
|
189
|
+
==========================
|
|
190
|
+
|
|
191
|
+
Finding good dialogs using an LLM can be more efficient. In this example,
|
|
192
|
+
we use Mistral with a free API. Please install the package and get a free
|
|
193
|
+
Mistral API key at https://console.mistral.ai
|
|
194
|
+
|
|
195
|
+
.. code:: shell
|
|
196
|
+
|
|
197
|
+
$ pip install mistralai
|
|
198
|
+
$ export MISTRAL_API_KEY=_your_key_
|
|
199
|
+
|
|
200
|
+
Below is a similar code example, but this time using an LLM to evaluate the dialogs.
|
|
201
|
+
Note that only 4 threads are used in this example (`parallel=4`) due to a limitation of
|
|
202
|
+
the free LLM service.
|
|
203
|
+
|
|
204
|
+
.. code:: py
|
|
205
|
+
|
|
206
|
+
from mistralai.client import MistralClient
|
|
207
|
+
from mistralai.models.chat_completion import ChatMessage
|
|
208
|
+
from datachain import File, DataChain, Column
|
|
209
|
+
|
|
210
|
+
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
211
|
+
|
|
212
|
+
def eval_dialogue(file: File) -> bool:
|
|
213
|
+
client = MistralClient()
|
|
214
|
+
response = client.chat(
|
|
215
|
+
model="open-mixtral-8x22b",
|
|
216
|
+
messages=[ChatMessage(role="system", content=PROMPT),
|
|
217
|
+
ChatMessage(role="user", content=file.read())])
|
|
218
|
+
result = response.choices[0].message.content
|
|
219
|
+
return result.lower().startswith("success")
|
|
220
|
+
|
|
221
|
+
chain = (
|
|
222
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
|
|
223
|
+
.settings(parallel=4, cache=True)
|
|
224
|
+
.map(is_success=eval_dialogue)
|
|
225
|
+
.save("mistral_files")
|
|
226
|
+
)
|
|
227
|
+
|
|
228
|
+
successful_chain = chain.filter(Column("is_success") == True)
|
|
229
|
+
successful_chain.export_files("./output_mistral")
|
|
230
|
+
|
|
231
|
+
print(f"{successful_chain.count()} files were exported")
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
With the current prompt, we found 31 files considered successful dialogs:
|
|
235
|
+
|
|
236
|
+
.. code:: shell
|
|
237
|
+
|
|
238
|
+
$ ls output_mistral/datachain-demo/chatbot-KiT/
|
|
239
|
+
1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
|
|
240
|
+
$ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
|
|
241
|
+
31
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
Serializing Python objects
|
|
246
|
+
==========================
|
|
247
|
+
|
|
248
|
+
LLM responses contain valuable information for analytics, such as tokens used and the
|
|
249
|
+
model. Preserving this information can be beneficial.
|
|
250
|
+
|
|
251
|
+
Instead of extracting this information from the Mistral data structure (class
|
|
252
|
+
`ChatCompletionResponse`), we serialize the entire Python object to the internal DB.
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
.. code:: py
|
|
256
|
+
|
|
257
|
+
from mistralai.client import MistralClient
|
|
258
|
+
from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
|
|
259
|
+
from datachain import File, DataChain, Column
|
|
260
|
+
|
|
261
|
+
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
262
|
+
|
|
263
|
+
def eval_dialog(file: File) -> ChatCompletionResponse:
|
|
264
|
+
client = MistralClient()
|
|
265
|
+
return client.chat(
|
|
266
|
+
model="open-mixtral-8x22b",
|
|
267
|
+
messages=[ChatMessage(role="system", content=PROMPT),
|
|
268
|
+
ChatMessage(role="user", content=file.read())])
|
|
269
|
+
|
|
270
|
+
chain = (
|
|
271
|
+
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
|
|
272
|
+
.settings(parallel=4, cache=True)
|
|
273
|
+
.map(response=eval_dialog)
|
|
274
|
+
.map(status=lambda response: response.choices[0].message.content.lower()[:7])
|
|
275
|
+
.save("response")
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
chain.select("file.name", "status", "response.usage").show(5)
|
|
279
|
+
|
|
280
|
+
success_rate = chain.filter(Column("status") == "success").count() / chain.count()
|
|
281
|
+
print(f"{100*success_rate:.1f}% dialogs were successful")
|
|
282
|
+
|
|
283
|
+
Output:
|
|
284
|
+
|
|
285
|
+
.. code:: shell
|
|
286
|
+
|
|
287
|
+
file status response response response
|
|
288
|
+
name usage usage usage
|
|
289
|
+
prompt_tokens total_tokens completion_tokens
|
|
290
|
+
0 1.txt success 547 548 1
|
|
291
|
+
1 10.txt failure 3576 3578 2
|
|
292
|
+
2 11.txt failure 626 628 2
|
|
293
|
+
3 12.txt failure 1144 1182 38
|
|
294
|
+
4 13.txt success 1100 1101 1
|
|
295
|
+
|
|
296
|
+
[Limited by 5 rows]
|
|
297
|
+
64.0% dialogs were successful
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
Complex Python data structures
|
|
301
|
+
=============================================
|
|
302
|
+
|
|
303
|
+
In the previous examples, a few datasets were saved in the embedded database
|
|
304
|
+
(`SQLite`_ in directory `.datachain`).
|
|
305
|
+
These datasets are versioned, and can be accessed using
|
|
306
|
+
`DataChain.from_dataset("dataset_name")`.
|
|
307
|
+
|
|
308
|
+
.. code:: py
|
|
309
|
+
|
|
310
|
+
chain = DataChain.from_dataset("response")
|
|
311
|
+
|
|
312
|
+
# Iterating one-by-one: supports out-of-memory workflows
|
|
313
|
+
for file, response in chain.limit(5).collect("file", "response"):
|
|
314
|
+
# You work with Python objects
|
|
315
|
+
assert isinstance(response, ChatCompletionResponse)
|
|
316
|
+
|
|
317
|
+
status = response.choices[0].message.content[:7]
|
|
318
|
+
tokens = response.usage.total_tokens
|
|
319
|
+
print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}")
|
|
320
|
+
|
|
321
|
+
Output:
|
|
322
|
+
|
|
323
|
+
.. code:: shell
|
|
324
|
+
|
|
325
|
+
gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548
|
|
326
|
+
gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578
|
|
327
|
+
gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628
|
|
328
|
+
gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207
|
|
329
|
+
gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
Vectorized analytics over Python objects
|
|
333
|
+
========================================
|
|
334
|
+
|
|
335
|
+
Some operations can be efficiently run inside the DB without deserializing Python objects.
|
|
336
|
+
Let's calculate the cost of using LLM APIs in a vectorized way.
|
|
337
|
+
Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
|
|
338
|
+
|
|
339
|
+
.. code:: py
|
|
340
|
+
|
|
341
|
+
chain = DataChain.from_dataset("response")
|
|
342
|
+
|
|
343
|
+
cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
|
|
344
|
+
+ chain.sum("response.usage.completion_tokens")*0.000006
|
|
345
|
+
print(f"Spent ${cost:.2f} on {chain.count()} calls")
|
|
346
|
+
|
|
347
|
+
Output:
|
|
348
|
+
|
|
349
|
+
.. code:: shell
|
|
350
|
+
|
|
351
|
+
Spent $0.08 on 50 calls
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
PyTorch data loader
|
|
355
|
+
===================
|
|
356
|
+
|
|
357
|
+
Chain results can be exported or passed directly to a PyTorch dataloader.
|
|
358
|
+
For example, if we are interested in passing an image and a label based on the file
|
|
359
|
+
name prefix, the following code will do it:
|
|
360
|
+
|
|
361
|
+
.. code:: py
|
|
362
|
+
|
|
363
|
+
from torch.utils.data import DataLoader
|
|
364
|
+
from transformers import CLIPProcessor
|
|
365
|
+
|
|
366
|
+
from datachain import C, DataChain
|
|
367
|
+
|
|
368
|
+
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
369
|
+
|
|
370
|
+
chain = (
|
|
371
|
+
DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
|
|
372
|
+
.map(label=lambda name: name.split(".")[0], params=["file.name"])
|
|
373
|
+
.select("file", "label").to_pytorch(
|
|
374
|
+
transform=processor.image_processor,
|
|
375
|
+
tokenizer=processor.tokenizer,
|
|
376
|
+
)
|
|
377
|
+
)
|
|
378
|
+
loader = DataLoader(chain, batch_size=1)
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
Tutorials
|
|
382
|
+
---------
|
|
383
|
+
|
|
384
|
+
* `Getting Started`_
|
|
385
|
+
* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
|
|
386
|
+
|
|
387
|
+
Contributions
|
|
388
|
+
-------------
|
|
389
|
+
|
|
390
|
+
Contributions are very welcome.
|
|
391
|
+
To learn more, see the `Contributor Guide`_.
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
Community and Support
|
|
395
|
+
---------------------
|
|
396
|
+
|
|
397
|
+
* `Docs <https://datachain.dvc.ai/>`_
|
|
398
|
+
* `File an issue`_ if you encounter any problems
|
|
399
|
+
* `Discord Chat <https://dvc.org/chat>`_
|
|
400
|
+
* `Email <mailto:support@dvc.org>`_
|
|
401
|
+
* `Twitter <https://twitter.com/DVCorg>`_
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
.. _PyPI: https://pypi.org/
|
|
405
|
+
.. _file an issue: https://github.com/iterative/datachain/issues
|
|
406
|
+
.. github-only
|
|
407
|
+
.. _Contributor Guide: CONTRIBUTING.rst
|
|
408
|
+
.. _Pydantic: https://github.com/pydantic/pydantic
|
|
409
|
+
.. _SQLite: https://www.sqlite.org/
|
|
410
|
+
.. _Getting Started: https://datachain.dvc.ai/
|
|
411
|
+
.. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
|
|
412
|
+
:alt: DataChain FlowChart
|