datachain 0.2.12__tar.gz → 0.2.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.2.12 → datachain-0.2.13}/.github/workflows/tests.yml +8 -23
- {datachain-0.2.12 → datachain-0.2.13}/.pre-commit-config.yaml +2 -0
- {datachain-0.2.12/src/datachain.egg-info → datachain-0.2.13}/PKG-INFO +41 -42
- {datachain-0.2.12 → datachain-0.2.13}/README.rst +39 -41
- datachain-0.2.13/examples/llm/llm_chatbot_evaluation.ipynb +772 -0
- {datachain-0.2.12 → datachain-0.2.13}/pyproject.toml +1 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/catalog/catalog.py +7 -1
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/cli.py +11 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/file.py +17 -11
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/query/dataset.py +10 -2
- {datachain-0.2.12 → datachain-0.2.13/src/datachain.egg-info}/PKG-INFO +41 -42
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain.egg-info/requires.txt +1 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/conftest.py +42 -26
- {datachain-0.2.12 → datachain-0.2.13}/tests/func/test_catalog.py +39 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/func/test_datachain.py +59 -1
- {datachain-0.2.12 → datachain-0.2.13}/tests/func/test_dataset_query.py +29 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_datachain.py +1 -1
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_file.py +57 -1
- {datachain-0.2.12 → datachain-0.2.13}/tests/utils.py +6 -0
- {datachain-0.2.12 → datachain-0.2.13}/.cruft.json +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/.gitattributes +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/.github/codecov.yaml +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/.github/dependabot.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/.github/workflows/release.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/.gitignore +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/CONTRIBUTING.rst +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/LICENSE +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/docs/assets/datachain.png +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/docs/assets/flowchart.png +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/docs/index.md +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/docs/references/datachain.md +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/docs/references/datatype.md +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/docs/references/file.md +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/docs/references/index.md +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/docs/references/sql.md +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/docs/references/torch.md +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/docs/references/udf.md +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/README.md +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/llm_and_nlp/llm-claude.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/multimodal/clip.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/multimodal/wds.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/mkdocs.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/noxfile.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/setup.cfg +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/__main__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/asyn.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/cache.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/cli_utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/client/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/client/azure.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/client/gcs.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/client/local.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/client/s3.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/config.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/dataset.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/error.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/job.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/clip.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/dc.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/image.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/settings.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/text.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/udf.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/listing.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/node.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/progress.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/py.typed +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/query/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/query/batch.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/query/builtins.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/query/metrics.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/query/params.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/query/schema.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/query/session.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/query/udf.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/remote/studio.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/types.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/sql/utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/storage.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain/utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/data.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/examples/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/examples/wds_data.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/func/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/func/test_client.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/func/test_datasets.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/func/test_ls.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/func/test_pull.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/func/test_pytorch.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/func/test_query.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/scripts/feature_class.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/scripts/name_len_normal.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/test_cli_e2e.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/test_query_e2e.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_asyn.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_cache.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_catalog.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_client.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_dataset.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_listing.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_metastore.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_query_params.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_serializer.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_session.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_storage.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_udf.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.13}/tests/unit/test_warehouse.py +0 -0
|
@@ -69,26 +69,6 @@ jobs:
|
|
|
69
69
|
pyv: '3.12'
|
|
70
70
|
|
|
71
71
|
steps:
|
|
72
|
-
|
|
73
|
-
# https://github.com/iterative/pytest-servers/pull/122
|
|
74
|
-
# https://github.com/abiosoft/colima/issues/468
|
|
75
|
-
# https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
|
|
76
|
-
# colima v0.5.6 seems to run more stable than the latest - that has occasional network failures (ports are not open)
|
|
77
|
-
# see: https://github.com/abiosoft/colima/issues/962
|
|
78
|
-
- name: Use colima as default docker host on MacOS
|
|
79
|
-
if: runner.os == 'macOS'
|
|
80
|
-
run: |
|
|
81
|
-
brew install docker lima || true # avoid non-zero exit code if brew link fails
|
|
82
|
-
sudo curl -L -o /usr/local/bin/colima https://github.com/abiosoft/colima/releases/download/v0.5.6/colima-Darwin-x86_64
|
|
83
|
-
sudo chmod +x /usr/local/bin/colima
|
|
84
|
-
colima start
|
|
85
|
-
sudo ln -vsf "${HOME}"/.colima/default/docker.sock /var/run/docker.sock
|
|
86
|
-
env:
|
|
87
|
-
HOMEBREW_NO_AUTO_UPDATE: true
|
|
88
|
-
HOMEBREW_NO_INSTALL_CLEANUP: true
|
|
89
|
-
HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK: true
|
|
90
|
-
HOMEBREW_NO_INSTALL_UPGRADE: true
|
|
91
|
-
|
|
92
72
|
- name: Check out the repository
|
|
93
73
|
uses: actions/checkout@v4
|
|
94
74
|
with:
|
|
@@ -106,12 +86,17 @@ jobs:
|
|
|
106
86
|
nox --version
|
|
107
87
|
uv --version
|
|
108
88
|
|
|
109
|
-
- name: Skip flaky azure, gs remotes
|
|
89
|
+
- name: Skip flaky azure, gs remotes on macOS
|
|
110
90
|
if: runner.os == 'macOS'
|
|
111
|
-
run: echo '
|
|
91
|
+
run: echo 'DISABLE_REMOTES_ARG=--disable-remotes=azure,gs' >> "$GITHUB_ENV"
|
|
92
|
+
|
|
93
|
+
- name: Skip all remotes on Windows
|
|
94
|
+
if: runner.os == 'Windows'
|
|
95
|
+
run: echo 'DISABLE_REMOTES_ARG=--disable-remotes=azure,gs' >> $env:GITHUB_ENV
|
|
112
96
|
|
|
113
97
|
- name: Run tests
|
|
114
|
-
run: nox -s tests-${{ matrix.pyv }}
|
|
98
|
+
run: nox -s tests-${{ matrix.pyv }} -- $DISABLE_REMOTES_ARG
|
|
99
|
+
shell: bash
|
|
115
100
|
|
|
116
101
|
- name: Upload coverage report
|
|
117
102
|
uses: codecov/codecov-action@v4
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.13
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -78,6 +78,7 @@ Provides-Extra: dev
|
|
|
78
78
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
79
79
|
Requires-Dist: mypy==1.10.1; extra == "dev"
|
|
80
80
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
81
|
+
Requires-Dist: types-pytz; extra == "dev"
|
|
81
82
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
82
83
|
Requires-Dist: types-requests; extra == "dev"
|
|
83
84
|
Requires-Dist: types-ujson; extra == "dev"
|
|
@@ -103,20 +104,18 @@ AI 🔗 DataChain
|
|
|
103
104
|
DataChain is an open-source Python library for processing and curating unstructured
|
|
104
105
|
data at scale.
|
|
105
106
|
|
|
106
|
-
🤖 AI-Driven Data Curation: Use local ML models
|
|
107
|
+
🤖 AI-Driven Data Curation: Use local ML models or LLM API calls to enrich your data.
|
|
107
108
|
|
|
108
|
-
🚀 GenAI Dataset scale: Handle
|
|
109
|
+
🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
|
|
109
110
|
|
|
110
|
-
🐍 Python-friendly: Use strictly
|
|
111
|
+
🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
|
|
111
112
|
|
|
112
113
|
|
|
113
|
-
|
|
114
|
-
downloads, and out-of-memory computing. It excels at optimizing batch operations.
|
|
115
|
-
While most GenAI tools focus on online applications and realtime, DataChain is designed
|
|
116
|
-
for offline data processing, data curation and ETL.
|
|
114
|
+
DataChain supports parallel processing, parallel data
|
|
115
|
+
downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
|
|
117
116
|
|
|
118
|
-
The typical use cases
|
|
119
|
-
and validation.
|
|
117
|
+
The typical use cases include Computer Vision data curation, LLM analytics,
|
|
118
|
+
and validation of multimodal AI applications.
|
|
120
119
|
|
|
121
120
|
|
|
122
121
|
.. code:: console
|
|
@@ -128,25 +127,25 @@ and validation.
|
|
|
128
127
|
Quick Start
|
|
129
128
|
-----------
|
|
130
129
|
|
|
131
|
-
|
|
132
|
-
|
|
130
|
+
Data curation with a local model
|
|
131
|
+
=================================
|
|
133
132
|
|
|
134
133
|
We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
|
|
135
|
-
- 50 files total in
|
|
136
|
-
These dialogs involve users looking for better wireless plans
|
|
137
|
-
Our goal is to identify successful dialogs.
|
|
134
|
+
- 50 files total in this example.
|
|
135
|
+
These dialogs involve users chatting with a bot while looking for better wireless plans.
|
|
136
|
+
Our goal is to identify the successful dialogs.
|
|
138
137
|
|
|
139
|
-
The data used in the examples is publicly available.
|
|
138
|
+
The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
|
|
140
139
|
|
|
141
|
-
First, we'll
|
|
140
|
+
First, we'll show batch inference with a simple sentiment model using the `transformers` library:
|
|
142
141
|
|
|
143
142
|
.. code:: shell
|
|
144
143
|
|
|
145
144
|
pip install transformers
|
|
146
145
|
|
|
147
|
-
The code below downloads files the cloud, applies function
|
|
148
|
-
|
|
149
|
-
are copied to local directory
|
|
146
|
+
The code below downloads files from the cloud and applies a user-defined function
|
|
147
|
+
to each one of them. All files with a positive sentiment
|
|
148
|
+
detected are then copied to the local directory.
|
|
150
149
|
|
|
151
150
|
.. code:: py
|
|
152
151
|
|
|
@@ -169,7 +168,7 @@ are copied to local directory `output/`.
|
|
|
169
168
|
)
|
|
170
169
|
|
|
171
170
|
positive_chain = chain.filter(Column("is_positive") == True)
|
|
172
|
-
positive_chain.export_files("./
|
|
171
|
+
positive_chain.export_files("./output")
|
|
173
172
|
|
|
174
173
|
print(f"{positive_chain.count()} files were exported")
|
|
175
174
|
|
|
@@ -185,11 +184,11 @@ are copied to local directory `output/`.
|
|
|
185
184
|
13
|
|
186
185
|
|
|
187
186
|
|
|
188
|
-
LLM judging
|
|
189
|
-
|
|
187
|
+
LLM judging chatbots
|
|
188
|
+
=============================
|
|
190
189
|
|
|
191
|
-
|
|
192
|
-
we
|
|
190
|
+
LLMs can work as efficient universal classifiers. In the example below,
|
|
191
|
+
we employ a free API from Mistral to judge the chatbot performance. Please get a free
|
|
193
192
|
Mistral API key at https://console.mistral.ai
|
|
194
193
|
|
|
195
194
|
.. code:: shell
|
|
@@ -197,9 +196,7 @@ Mistral API key at https://console.mistral.ai
|
|
|
197
196
|
$ pip install mistralai
|
|
198
197
|
$ export MISTRAL_API_KEY=_your_key_
|
|
199
198
|
|
|
200
|
-
|
|
201
|
-
Note, only 4 threads were used in this example `parallel=4` due to a limitation of
|
|
202
|
-
the free LLM service.
|
|
199
|
+
DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
|
|
203
200
|
|
|
204
201
|
.. code:: py
|
|
205
202
|
|
|
@@ -231,7 +228,7 @@ the free LLM service.
|
|
|
231
228
|
print(f"{successful_chain.count()} files were exported")
|
|
232
229
|
|
|
233
230
|
|
|
234
|
-
With the
|
|
231
|
+
With the instruction above, the Mistral model considers 31 of the 50 files to contain successful dialogs:
|
|
235
232
|
|
|
236
233
|
.. code:: shell
|
|
237
234
|
|
|
@@ -245,11 +242,11 @@ With the current prompt, we found 31 files considered successful dialogs:
|
|
|
245
242
|
Serializing Python-objects
|
|
246
243
|
==========================
|
|
247
244
|
|
|
248
|
-
LLM responses contain valuable information for analytics
|
|
249
|
-
model
|
|
245
|
+
LLM responses may contain valuable information for analytics – such as the number of tokens used, or the
|
|
246
|
+
model performance parameters.
|
|
250
247
|
|
|
251
|
-
Instead of extracting this information from the Mistral data structure (class
|
|
252
|
-
`ChatCompletionResponse`),
|
|
248
|
+
Instead of extracting this information from the Mistral response data structure (class
|
|
249
|
+
`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
|
|
253
250
|
|
|
254
251
|
|
|
255
252
|
.. code:: py
|
|
@@ -297,21 +294,23 @@ Output:
|
|
|
297
294
|
64.0% dialogs were successful
|
|
298
295
|
|
|
299
296
|
|
|
300
|
-
|
|
297
|
+
Iterating over Python data structures
|
|
301
298
|
=============================================
|
|
302
299
|
|
|
303
|
-
In the previous examples,
|
|
304
|
-
(`SQLite`_ in
|
|
305
|
-
These datasets
|
|
300
|
+
In the previous examples, datasets were saved in the embedded database
|
|
301
|
+
(`SQLite`_ in folder `.datachain` of the working directory).
|
|
302
|
+
These datasets were automatically versioned, and can be accessed using
|
|
306
303
|
`DataChain.from_dataset("dataset_name")`.
|
|
307
304
|
|
|
305
|
+
Here is how to retrieve a saved dataset and iterate over the objects:
|
|
306
|
+
|
|
308
307
|
.. code:: py
|
|
309
308
|
|
|
310
309
|
chain = DataChain.from_dataset("response")
|
|
311
310
|
|
|
312
|
-
# Iterating one-by-one: out
|
|
311
|
+
# Iterating one-by-one: support out-of-memory workflow
|
|
313
312
|
for file, response in chain.limit(5).collect("file", "response"):
|
|
314
|
-
#
|
|
313
|
+
# verify the collected Python objects
|
|
315
314
|
assert isinstance(response, ChatCompletionResponse)
|
|
316
315
|
|
|
317
316
|
status = response.choices[0].message.content[:7]
|
|
@@ -332,9 +331,8 @@ Output:
|
|
|
332
331
|
Vectorized analytics over Python objects
|
|
333
332
|
========================================
|
|
334
333
|
|
|
335
|
-
Some operations can
|
|
336
|
-
|
|
337
|
-
Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
|
|
334
|
+
Some operations can run inside the DB without deserialization.
|
|
335
|
+
For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
|
|
338
336
|
|
|
339
337
|
.. code:: py
|
|
340
338
|
|
|
@@ -406,6 +404,7 @@ Community and Support
|
|
|
406
404
|
.. github-only
|
|
407
405
|
.. _Contributor Guide: CONTRIBUTING.rst
|
|
408
406
|
.. _Pydantic: https://github.com/pydantic/pydantic
|
|
407
|
+
.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
|
|
409
408
|
.. _SQLite: https://www.sqlite.org/
|
|
410
409
|
.. _Getting Started: https://datachain.dvc.ai/
|
|
411
410
|
.. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
|
|
@@ -19,20 +19,18 @@ AI 🔗 DataChain
|
|
|
19
19
|
DataChain is an open-source Python library for processing and curating unstructured
|
|
20
20
|
data at scale.
|
|
21
21
|
|
|
22
|
-
🤖 AI-Driven Data Curation: Use local ML models
|
|
22
|
+
🤖 AI-Driven Data Curation: Use local ML models or LLM API calls to enrich your data.
|
|
23
23
|
|
|
24
|
-
🚀 GenAI Dataset scale: Handle
|
|
24
|
+
🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
|
|
25
25
|
|
|
26
|
-
🐍 Python-friendly: Use strictly
|
|
26
|
+
🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
downloads, and out-of-memory computing. It excels at optimizing batch operations.
|
|
31
|
-
While most GenAI tools focus on online applications and realtime, DataChain is designed
|
|
32
|
-
for offline data processing, data curation and ETL.
|
|
29
|
+
DataChain supports parallel processing, parallel data
|
|
30
|
+
downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
|
|
33
31
|
|
|
34
|
-
The typical use cases
|
|
35
|
-
and validation.
|
|
32
|
+
The typical use cases include Computer Vision data curation, LLM analytics,
|
|
33
|
+
and validation of multimodal AI applications.
|
|
36
34
|
|
|
37
35
|
|
|
38
36
|
.. code:: console
|
|
@@ -44,25 +42,25 @@ and validation.
|
|
|
44
42
|
Quick Start
|
|
45
43
|
-----------
|
|
46
44
|
|
|
47
|
-
|
|
48
|
-
|
|
45
|
+
Data curation with a local model
|
|
46
|
+
=================================
|
|
49
47
|
|
|
50
48
|
We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
|
|
51
|
-
- 50 files total in
|
|
52
|
-
These dialogs involve users looking for better wireless plans
|
|
53
|
-
Our goal is to identify successful dialogs.
|
|
49
|
+
- 50 files total in this example.
|
|
50
|
+
These dialogs involve users chatting with a bot while looking for better wireless plans.
|
|
51
|
+
Our goal is to identify the successful dialogs.
|
|
54
52
|
|
|
55
|
-
The data used in the examples is publicly available.
|
|
53
|
+
The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
|
|
56
54
|
|
|
57
|
-
First, we'll
|
|
55
|
+
First, we'll show batch inference with a simple sentiment model using the `transformers` library:
|
|
58
56
|
|
|
59
57
|
.. code:: shell
|
|
60
58
|
|
|
61
59
|
pip install transformers
|
|
62
60
|
|
|
63
|
-
The code below downloads files the cloud, applies function
|
|
64
|
-
|
|
65
|
-
are copied to local directory
|
|
61
|
+
The code below downloads files from the cloud, and applies a user-defined function
|
|
62
|
+
to each one of them. All files with a positive sentiment
|
|
63
|
+
detected are then copied to the local directory.
|
|
66
64
|
|
|
67
65
|
.. code:: py
|
|
68
66
|
|
|
@@ -85,7 +83,7 @@ are copied to local directory `output/`.
|
|
|
85
83
|
)
|
|
86
84
|
|
|
87
85
|
positive_chain = chain.filter(Column("is_positive") == True)
|
|
88
|
-
positive_chain.export_files("./
|
|
86
|
+
positive_chain.export_files("./output")
|
|
89
87
|
|
|
90
88
|
print(f"{positive_chain.count()} files were exported")
|
|
91
89
|
|
|
@@ -101,11 +99,11 @@ are copied to local directory `output/`.
|
|
|
101
99
|
13
|
|
102
100
|
|
|
103
101
|
|
|
104
|
-
LLM judging
|
|
105
|
-
|
|
102
|
+
LLM judging chatbots
|
|
103
|
+
=============================
|
|
106
104
|
|
|
107
|
-
|
|
108
|
-
we
|
|
105
|
+
LLMs can work as efficient universal classifiers. In the example below,
|
|
106
|
+
we employ a free API from Mistral to judge the chatbot performance. Please get a free
|
|
109
107
|
Mistral API key at https://console.mistral.ai
|
|
110
108
|
|
|
111
109
|
.. code:: shell
|
|
@@ -113,9 +111,7 @@ Mistral API key at https://console.mistral.ai
|
|
|
113
111
|
$ pip install mistralai
|
|
114
112
|
$ export MISTRAL_API_KEY=_your_key_
|
|
115
113
|
|
|
116
|
-
|
|
117
|
-
Note, only 4 threads were used in this example `parallel=4` due to a limitation of
|
|
118
|
-
the free LLM service.
|
|
114
|
+
DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
|
|
119
115
|
|
|
120
116
|
.. code:: py
|
|
121
117
|
|
|
@@ -147,7 +143,7 @@ the free LLM service.
|
|
|
147
143
|
print(f"{successful_chain.count()} files were exported")
|
|
148
144
|
|
|
149
145
|
|
|
150
|
-
With the
|
|
146
|
+
With the instruction above, the Mistral model considers 31 of the 50 files to contain successful dialogs:
|
|
151
147
|
|
|
152
148
|
.. code:: shell
|
|
153
149
|
|
|
@@ -161,11 +157,11 @@ With the current prompt, we found 31 files considered successful dialogs:
|
|
|
161
157
|
Serializing Python-objects
|
|
162
158
|
==========================
|
|
163
159
|
|
|
164
|
-
LLM responses contain valuable information for analytics
|
|
165
|
-
model
|
|
160
|
+
LLM responses may contain valuable information for analytics – such as the number of tokens used, or the
|
|
161
|
+
model performance parameters.
|
|
166
162
|
|
|
167
|
-
Instead of extracting this information from the Mistral data structure (class
|
|
168
|
-
`ChatCompletionResponse`),
|
|
163
|
+
Instead of extracting this information from the Mistral response data structure (class
|
|
164
|
+
`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
|
|
169
165
|
|
|
170
166
|
|
|
171
167
|
.. code:: py
|
|
@@ -213,21 +209,23 @@ Output:
|
|
|
213
209
|
64.0% dialogs were successful
|
|
214
210
|
|
|
215
211
|
|
|
216
|
-
|
|
212
|
+
Iterating over Python data structures
|
|
217
213
|
=============================================
|
|
218
214
|
|
|
219
|
-
In the previous examples,
|
|
220
|
-
(`SQLite`_ in
|
|
221
|
-
These datasets
|
|
215
|
+
In the previous examples, datasets were saved in the embedded database
|
|
216
|
+
(`SQLite`_ in folder `.datachain` of the working directory).
|
|
217
|
+
These datasets were automatically versioned, and can be accessed using
|
|
222
218
|
`DataChain.from_dataset("dataset_name")`.
|
|
223
219
|
|
|
220
|
+
Here is how to retrieve a saved dataset and iterate over the objects:
|
|
221
|
+
|
|
224
222
|
.. code:: py
|
|
225
223
|
|
|
226
224
|
chain = DataChain.from_dataset("response")
|
|
227
225
|
|
|
228
|
-
# Iterating one-by-one: out
|
|
226
|
+
# Iterating one-by-one: supports out-of-memory workflows
|
|
229
227
|
for file, response in chain.limit(5).collect("file", "response"):
|
|
230
|
-
#
|
|
228
|
+
# verify the collected Python objects
|
|
231
229
|
assert isinstance(response, ChatCompletionResponse)
|
|
232
230
|
|
|
233
231
|
status = response.choices[0].message.content[:7]
|
|
@@ -248,9 +246,8 @@ Output:
|
|
|
248
246
|
Vectorized analytics over Python objects
|
|
249
247
|
========================================
|
|
250
248
|
|
|
251
|
-
Some operations can
|
|
252
|
-
|
|
253
|
-
Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
|
|
249
|
+
Some operations can run inside the DB without deserialization.
|
|
250
|
+
For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
|
|
254
251
|
|
|
255
252
|
.. code:: py
|
|
256
253
|
|
|
@@ -322,6 +319,7 @@ Community and Support
|
|
|
322
319
|
.. github-only
|
|
323
320
|
.. _Contributor Guide: CONTRIBUTING.rst
|
|
324
321
|
.. _Pydantic: https://github.com/pydantic/pydantic
|
|
322
|
+
.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
|
|
325
323
|
.. _SQLite: https://www.sqlite.org/
|
|
326
324
|
.. _Getting Started: https://datachain.dvc.ai/
|
|
327
325
|
.. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
|