datachain 0.2.12__tar.gz → 0.2.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.2.12 → datachain-0.2.14}/.github/workflows/tests.yml +8 -23
- {datachain-0.2.12 → datachain-0.2.14}/.pre-commit-config.yaml +2 -0
- {datachain-0.2.12/src/datachain.egg-info → datachain-0.2.14}/PKG-INFO +42 -44
- {datachain-0.2.12 → datachain-0.2.14}/README.rst +39 -41
- {datachain-0.2.12 → datachain-0.2.14}/docs/index.md +2 -2
- datachain-0.2.14/examples/llm/llm_chatbot_evaluation.ipynb +772 -0
- {datachain-0.2.12 → datachain-0.2.14}/pyproject.toml +8 -3
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/catalog/catalog.py +7 -1
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/cli.py +11 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/metastore.py +0 -4
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/schema.py +7 -3
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/sqlite.py +1 -4
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/warehouse.py +1 -24
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/flatten.py +4 -4
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/values_to_tuples.py +4 -1
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/dc.py +100 -5
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/file.py +23 -22
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/meta_formats.py +6 -5
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/dataset.py +29 -23
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/sqlite/base.py +3 -3
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/sqlite/types.py +5 -13
- {datachain-0.2.12 → datachain-0.2.14/src/datachain.egg-info}/PKG-INFO +42 -44
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain.egg-info/requires.txt +2 -2
- {datachain-0.2.12 → datachain-0.2.14}/tests/conftest.py +42 -26
- {datachain-0.2.12 → datachain-0.2.14}/tests/examples/test_wds_e2e.py +1 -1
- {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_catalog.py +39 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_datachain.py +61 -7
- {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_dataset_query.py +29 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_datasets.py +7 -6
- {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_feature_pickling.py +10 -3
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_datachain.py +57 -1
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_datachain_merge.py +19 -19
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_feature.py +7 -7
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_file.py +57 -1
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_module_exports.py +25 -18
- {datachain-0.2.12 → datachain-0.2.14}/tests/utils.py +6 -0
- {datachain-0.2.12 → datachain-0.2.14}/.cruft.json +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/.gitattributes +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/.github/codecov.yaml +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/.github/dependabot.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/.github/workflows/release.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/.gitignore +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/CONTRIBUTING.rst +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/LICENSE +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/docs/assets/datachain.png +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/docs/assets/flowchart.png +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/docs/references/datachain.md +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/docs/references/datatype.md +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/docs/references/file.md +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/docs/references/index.md +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/docs/references/sql.md +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/docs/references/torch.md +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/docs/references/udf.md +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/README.md +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/clip.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/wds.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/mkdocs.yml +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/noxfile.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/setup.cfg +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/__main__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/asyn.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/cache.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/cli_utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/azure.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/gcs.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/local.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/s3.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/config.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/dataset.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/error.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/job.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/clip.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/image.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/settings.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/text.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/udf.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/listing.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/node.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/progress.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/py.typed +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/batch.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/builtins.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/metrics.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/params.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/schema.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/session.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/udf.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/remote/studio.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/types.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/storage.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain/utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/data.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/examples/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/examples/wds_data.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/func/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_client.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_ls.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_pull.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_pytorch.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_query.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/scripts/feature_class.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/scripts/name_len_normal.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/test_cli_e2e.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/test_query_e2e.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_asyn.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_cache.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_catalog.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_client.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_dataset.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_listing.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_metastore.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_query_params.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_serializer.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_session.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_storage.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_udf.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_utils.py +0 -0
- {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_warehouse.py +0 -0
|
@@ -69,26 +69,6 @@ jobs:
|
|
|
69
69
|
pyv: '3.12'
|
|
70
70
|
|
|
71
71
|
steps:
|
|
72
|
-
|
|
73
|
-
# https://github.com/iterative/pytest-servers/pull/122
|
|
74
|
-
# https://github.com/abiosoft/colima/issues/468
|
|
75
|
-
# https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
|
|
76
|
-
# colima v0.5.6 seems to run more stable than the latest - that has occasional network failures (ports are not open)
|
|
77
|
-
# see: https://github.com/abiosoft/colima/issues/962
|
|
78
|
-
- name: Use colima as default docker host on MacOS
|
|
79
|
-
if: runner.os == 'macOS'
|
|
80
|
-
run: |
|
|
81
|
-
brew install docker lima || true # avoid non-zero exit code if brew link fails
|
|
82
|
-
sudo curl -L -o /usr/local/bin/colima https://github.com/abiosoft/colima/releases/download/v0.5.6/colima-Darwin-x86_64
|
|
83
|
-
sudo chmod +x /usr/local/bin/colima
|
|
84
|
-
colima start
|
|
85
|
-
sudo ln -vsf "${HOME}"/.colima/default/docker.sock /var/run/docker.sock
|
|
86
|
-
env:
|
|
87
|
-
HOMEBREW_NO_AUTO_UPDATE: true
|
|
88
|
-
HOMEBREW_NO_INSTALL_CLEANUP: true
|
|
89
|
-
HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK: true
|
|
90
|
-
HOMEBREW_NO_INSTALL_UPGRADE: true
|
|
91
|
-
|
|
92
72
|
- name: Check out the repository
|
|
93
73
|
uses: actions/checkout@v4
|
|
94
74
|
with:
|
|
@@ -106,12 +86,17 @@ jobs:
|
|
|
106
86
|
nox --version
|
|
107
87
|
uv --version
|
|
108
88
|
|
|
109
|
-
- name: Skip flaky azure, gs remotes
|
|
89
|
+
- name: Skip flaky azure, gs remotes on macOS
|
|
110
90
|
if: runner.os == 'macOS'
|
|
111
|
-
run: echo '
|
|
91
|
+
run: echo 'DISABLE_REMOTES_ARG=--disable-remotes=azure,gs' >> "$GITHUB_ENV"
|
|
92
|
+
|
|
93
|
+
- name: Skip all remotes on Windows
|
|
94
|
+
if: runner.os == 'Windows'
|
|
95
|
+
run: echo 'DISABLE_REMOTES_ARG=--disable-remotes=azure,gs' >> $env:GITHUB_ENV
|
|
112
96
|
|
|
113
97
|
- name: Run tests
|
|
114
|
-
run: nox -s tests-${{ matrix.pyv }}
|
|
98
|
+
run: nox -s tests-${{ matrix.pyv }} -- $DISABLE_REMOTES_ARG
|
|
99
|
+
shell: bash
|
|
115
100
|
|
|
116
101
|
- name: Upload coverage report
|
|
117
102
|
uses: codecov/codecov-action@v4
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.14
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -36,7 +36,7 @@ Requires-Dist: sqlalchemy>=2
|
|
|
36
36
|
Requires-Dist: multiprocess==0.70.16
|
|
37
37
|
Requires-Dist: dill==0.3.8
|
|
38
38
|
Requires-Dist: cloudpickle
|
|
39
|
-
Requires-Dist:
|
|
39
|
+
Requires-Dist: orjson>=3.10.5
|
|
40
40
|
Requires-Dist: pydantic<3,>=2
|
|
41
41
|
Requires-Dist: jmespath>=1.0
|
|
42
42
|
Requires-Dist: datamodel-code-generator>=0.25
|
|
@@ -78,9 +78,9 @@ Provides-Extra: dev
|
|
|
78
78
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
79
79
|
Requires-Dist: mypy==1.10.1; extra == "dev"
|
|
80
80
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
81
|
+
Requires-Dist: types-pytz; extra == "dev"
|
|
81
82
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
82
83
|
Requires-Dist: types-requests; extra == "dev"
|
|
83
|
-
Requires-Dist: types-ujson; extra == "dev"
|
|
84
84
|
|
|
85
85
|
|PyPI| |Python Version| |Codecov| |Tests|
|
|
86
86
|
|
|
@@ -103,20 +103,18 @@ AI 🔗 DataChain
|
|
|
103
103
|
DataChain is an open-source Python library for processing and curating unstructured
|
|
104
104
|
data at scale.
|
|
105
105
|
|
|
106
|
-
🤖 AI-Driven Data Curation: Use local ML models
|
|
106
|
+
🤖 AI-Driven Data Curation: Use local ML models or LLM API calls to enrich your data.
|
|
107
107
|
|
|
108
|
-
🚀 GenAI Dataset scale: Handle
|
|
108
|
+
🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
|
|
109
109
|
|
|
110
|
-
🐍 Python-friendly: Use strictly
|
|
110
|
+
🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
|
|
111
111
|
|
|
112
112
|
|
|
113
|
-
|
|
114
|
-
downloads, and out-of-memory computing. It excels at optimizing batch operations.
|
|
115
|
-
While most GenAI tools focus on online applications and realtime, DataChain is designed
|
|
116
|
-
for offline data processing, data curation and ETL.
|
|
113
|
+
DataChain supports parallel processing, parallel data
|
|
114
|
+
downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
|
|
117
115
|
|
|
118
|
-
The typical use cases
|
|
119
|
-
and validation.
|
|
116
|
+
The typical use cases include Computer Vision data curation, LLM analytics,
|
|
117
|
+
and validation of multimodal AI applications.
|
|
120
118
|
|
|
121
119
|
|
|
122
120
|
.. code:: console
|
|
@@ -128,25 +126,25 @@ and validation.
|
|
|
128
126
|
Quick Start
|
|
129
127
|
-----------
|
|
130
128
|
|
|
131
|
-
|
|
132
|
-
|
|
129
|
+
Data curation with a local model
|
|
130
|
+
=================================
|
|
133
131
|
|
|
134
132
|
We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
|
|
135
|
-
- 50 files total in
|
|
136
|
-
These dialogs involve users looking for better wireless plans
|
|
137
|
-
Our goal is to identify successful dialogs.
|
|
133
|
+
- 50 files total in this example.
|
|
134
|
+
These dialogs involve users chatting with a bot while looking for better wireless plans.
|
|
135
|
+
Our goal is to identify the successful dialogs.
|
|
138
136
|
|
|
139
|
-
The data used in the examples is publicly available.
|
|
137
|
+
The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
|
|
140
138
|
|
|
141
|
-
First, we'll
|
|
139
|
+
First, we'll show batch inference with a simple sentiment model using the `transformers` library:
|
|
142
140
|
|
|
143
141
|
.. code:: shell
|
|
144
142
|
|
|
145
143
|
pip install transformers
|
|
146
144
|
|
|
147
|
-
The code below downloads files the cloud, applies function
|
|
148
|
-
|
|
149
|
-
are copied to local directory
|
|
145
|
+
The code below downloads files from the cloud, and applies a user-defined function
|
|
146
|
+
to each one of them. All files with a positive sentiment
|
|
147
|
+
detected are then copied to the local directory.
|
|
150
148
|
|
|
151
149
|
.. code:: py
|
|
152
150
|
|
|
@@ -169,7 +167,7 @@ are copied to local directory `output/`.
|
|
|
169
167
|
)
|
|
170
168
|
|
|
171
169
|
positive_chain = chain.filter(Column("is_positive") == True)
|
|
172
|
-
positive_chain.export_files("./
|
|
170
|
+
positive_chain.export_files("./output")
|
|
173
171
|
|
|
174
172
|
print(f"{positive_chain.count()} files were exported")
|
|
175
173
|
|
|
@@ -185,11 +183,11 @@ are copied to local directory `output/`.
|
|
|
185
183
|
13
|
|
186
184
|
|
|
187
185
|
|
|
188
|
-
LLM judging
|
|
189
|
-
|
|
186
|
+
LLM judging chatbots
|
|
187
|
+
=============================
|
|
190
188
|
|
|
191
|
-
|
|
192
|
-
we
|
|
189
|
+
LLMs can work as efficient universal classifiers. In the example below,
|
|
190
|
+
we employ a free API from Mistral to judge the chatbot performance. Please get a free
|
|
193
191
|
Mistral API key at https://console.mistral.ai
|
|
194
192
|
|
|
195
193
|
.. code:: shell
|
|
@@ -197,9 +195,7 @@ Mistral API key at https://console.mistral.ai
|
|
|
197
195
|
$ pip install mistralai
|
|
198
196
|
$ export MISTRAL_API_KEY=_your_key_
|
|
199
197
|
|
|
200
|
-
|
|
201
|
-
Note, only 4 threads were used in this example `parallel=4` due to a limitation of
|
|
202
|
-
the free LLM service.
|
|
198
|
+
DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
|
|
203
199
|
|
|
204
200
|
.. code:: py
|
|
205
201
|
|
|
@@ -231,7 +227,7 @@ the free LLM service.
|
|
|
231
227
|
print(f"{successful_chain.count()} files were exported")
|
|
232
228
|
|
|
233
229
|
|
|
234
|
-
With the
|
|
230
|
+
With the instruction above, the Mistral model considers 31 of the 50 files to contain successful dialogues:
|
|
235
231
|
|
|
236
232
|
.. code:: shell
|
|
237
233
|
|
|
@@ -245,11 +241,11 @@ With the current prompt, we found 31 files considered successful dialogs:
|
|
|
245
241
|
Serializing Python-objects
|
|
246
242
|
==========================
|
|
247
243
|
|
|
248
|
-
LLM responses contain valuable information for analytics
|
|
249
|
-
model
|
|
244
|
+
LLM responses may contain valuable information for analytics – such as the number of tokens used, or the
|
|
245
|
+
model performance parameters.
|
|
250
246
|
|
|
251
|
-
Instead of extracting this information from the Mistral data structure (class
|
|
252
|
-
`ChatCompletionResponse`),
|
|
247
|
+
Instead of extracting this information from the Mistral response data structure (class
|
|
248
|
+
`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
|
|
253
249
|
|
|
254
250
|
|
|
255
251
|
.. code:: py
|
|
@@ -297,21 +293,23 @@ Output:
|
|
|
297
293
|
64.0% dialogs were successful
|
|
298
294
|
|
|
299
295
|
|
|
300
|
-
|
|
296
|
+
Iterating over Python data structures
|
|
301
297
|
=============================================
|
|
302
298
|
|
|
303
|
-
In the previous examples,
|
|
304
|
-
(`SQLite`_ in
|
|
305
|
-
These datasets
|
|
299
|
+
In the previous examples, datasets were saved in the embedded database
|
|
300
|
+
(`SQLite`_ in folder `.datachain` of the working directory).
|
|
301
|
+
These datasets were automatically versioned, and can be accessed using
|
|
306
302
|
`DataChain.from_dataset("dataset_name")`.
|
|
307
303
|
|
|
304
|
+
Here is how to retrieve a saved dataset and iterate over the objects:
|
|
305
|
+
|
|
308
306
|
.. code:: py
|
|
309
307
|
|
|
310
308
|
chain = DataChain.from_dataset("response")
|
|
311
309
|
|
|
312
|
-
# Iterating one-by-one: out
|
|
310
|
+
# Iterating one-by-one: support out-of-memory workflow
|
|
313
311
|
for file, response in chain.limit(5).collect("file", "response"):
|
|
314
|
-
#
|
|
312
|
+
# verify the collected Python objects
|
|
315
313
|
assert isinstance(response, ChatCompletionResponse)
|
|
316
314
|
|
|
317
315
|
status = response.choices[0].message.content[:7]
|
|
@@ -332,9 +330,8 @@ Output:
|
|
|
332
330
|
Vectorized analytics over Python objects
|
|
333
331
|
========================================
|
|
334
332
|
|
|
335
|
-
Some operations can
|
|
336
|
-
|
|
337
|
-
Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
|
|
333
|
+
Some operations can run inside the DB without deserialization.
|
|
334
|
+
For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
|
|
338
335
|
|
|
339
336
|
.. code:: py
|
|
340
337
|
|
|
@@ -406,6 +403,7 @@ Community and Support
|
|
|
406
403
|
.. github-only
|
|
407
404
|
.. _Contributor Guide: CONTRIBUTING.rst
|
|
408
405
|
.. _Pydantic: https://github.com/pydantic/pydantic
|
|
406
|
+
.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
|
|
409
407
|
.. _SQLite: https://www.sqlite.org/
|
|
410
408
|
.. _Getting Started: https://datachain.dvc.ai/
|
|
411
409
|
.. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
|
|
@@ -19,20 +19,18 @@ AI 🔗 DataChain
|
|
|
19
19
|
DataChain is an open-source Python library for processing and curating unstructured
|
|
20
20
|
data at scale.
|
|
21
21
|
|
|
22
|
-
🤖 AI-Driven Data Curation: Use local ML models
|
|
22
|
+
🤖 AI-Driven Data Curation: Use local ML models or LLM API calls to enrich your data.
|
|
23
23
|
|
|
24
|
-
🚀 GenAI Dataset scale: Handle
|
|
24
|
+
🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
|
|
25
25
|
|
|
26
|
-
🐍 Python-friendly: Use strictly
|
|
26
|
+
🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
downloads, and out-of-memory computing. It excels at optimizing batch operations.
|
|
31
|
-
While most GenAI tools focus on online applications and realtime, DataChain is designed
|
|
32
|
-
for offline data processing, data curation and ETL.
|
|
29
|
+
DataChain supports parallel processing, parallel data
|
|
30
|
+
downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
|
|
33
31
|
|
|
34
|
-
The typical use cases
|
|
35
|
-
and validation.
|
|
32
|
+
The typical use cases include Computer Vision data curation, LLM analytics,
|
|
33
|
+
and validation of multimodal AI applications.
|
|
36
34
|
|
|
37
35
|
|
|
38
36
|
.. code:: console
|
|
@@ -44,25 +42,25 @@ and validation.
|
|
|
44
42
|
Quick Start
|
|
45
43
|
-----------
|
|
46
44
|
|
|
47
|
-
|
|
48
|
-
|
|
45
|
+
Data curation with a local model
|
|
46
|
+
=================================
|
|
49
47
|
|
|
50
48
|
We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
|
|
51
|
-
- 50 files total in
|
|
52
|
-
These dialogs involve users looking for better wireless plans
|
|
53
|
-
Our goal is to identify successful dialogs.
|
|
49
|
+
- 50 files total in this example.
|
|
50
|
+
These dialogs involve users chatting with a bot while looking for better wireless plans.
|
|
51
|
+
Our goal is to identify the successful dialogs.
|
|
54
52
|
|
|
55
|
-
The data used in the examples is publicly available.
|
|
53
|
+
The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
|
|
56
54
|
|
|
57
|
-
First, we'll
|
|
55
|
+
First, we'll show batch inference with a simple sentiment model using the `transformers` library:
|
|
58
56
|
|
|
59
57
|
.. code:: shell
|
|
60
58
|
|
|
61
59
|
pip install transformers
|
|
62
60
|
|
|
63
|
-
The code below downloads files the cloud, applies function
|
|
64
|
-
|
|
65
|
-
are copied to local directory
|
|
61
|
+
The code below downloads files from the cloud, and applies a user-defined function
|
|
62
|
+
to each one of them. All files with a positive sentiment
|
|
63
|
+
detected are then copied to the local directory.
|
|
66
64
|
|
|
67
65
|
.. code:: py
|
|
68
66
|
|
|
@@ -85,7 +83,7 @@ are copied to local directory `output/`.
|
|
|
85
83
|
)
|
|
86
84
|
|
|
87
85
|
positive_chain = chain.filter(Column("is_positive") == True)
|
|
88
|
-
positive_chain.export_files("./
|
|
86
|
+
positive_chain.export_files("./output")
|
|
89
87
|
|
|
90
88
|
print(f"{positive_chain.count()} files were exported")
|
|
91
89
|
|
|
@@ -101,11 +99,11 @@ are copied to local directory `output/`.
|
|
|
101
99
|
13
|
|
102
100
|
|
|
103
101
|
|
|
104
|
-
LLM judging
|
|
105
|
-
|
|
102
|
+
LLM judging chatbots
|
|
103
|
+
=============================
|
|
106
104
|
|
|
107
|
-
|
|
108
|
-
we
|
|
105
|
+
LLMs can work as efficient universal classifiers. In the example below,
|
|
106
|
+
we employ a free API from Mistral to judge the chatbot performance. Please get a free
|
|
109
107
|
Mistral API key at https://console.mistral.ai
|
|
110
108
|
|
|
111
109
|
.. code:: shell
|
|
@@ -113,9 +111,7 @@ Mistral API key at https://console.mistral.ai
|
|
|
113
111
|
$ pip install mistralai
|
|
114
112
|
$ export MISTRAL_API_KEY=_your_key_
|
|
115
113
|
|
|
116
|
-
|
|
117
|
-
Note, only 4 threads were used in this example `parallel=4` due to a limitation of
|
|
118
|
-
the free LLM service.
|
|
114
|
+
DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
|
|
119
115
|
|
|
120
116
|
.. code:: py
|
|
121
117
|
|
|
@@ -147,7 +143,7 @@ the free LLM service.
|
|
|
147
143
|
print(f"{successful_chain.count()} files were exported")
|
|
148
144
|
|
|
149
145
|
|
|
150
|
-
With the
|
|
146
|
+
With the instruction above, the Mistral model considers 31 of the 50 files to contain successful dialogs:
|
|
151
147
|
|
|
152
148
|
.. code:: shell
|
|
153
149
|
|
|
@@ -161,11 +157,11 @@ With the current prompt, we found 31 files considered successful dialogs:
|
|
|
161
157
|
Serializing Python-objects
|
|
162
158
|
==========================
|
|
163
159
|
|
|
164
|
-
LLM responses contain valuable information for analytics
|
|
165
|
-
model
|
|
160
|
+
LLM responses may contain valuable information for analytics – such as the number of tokens used, or the
|
|
161
|
+
model performance parameters.
|
|
166
162
|
|
|
167
|
-
Instead of extracting this information from the Mistral data structure (class
|
|
168
|
-
`ChatCompletionResponse`),
|
|
163
|
+
Instead of extracting this information from the Mistral response data structure (class
|
|
164
|
+
`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
|
|
169
165
|
|
|
170
166
|
|
|
171
167
|
.. code:: py
|
|
@@ -213,21 +209,23 @@ Output:
|
|
|
213
209
|
64.0% dialogs were successful
|
|
214
210
|
|
|
215
211
|
|
|
216
|
-
|
|
212
|
+
Iterating over Python data structures
|
|
217
213
|
=============================================
|
|
218
214
|
|
|
219
|
-
In the previous examples,
|
|
220
|
-
(`SQLite`_ in
|
|
221
|
-
These datasets
|
|
215
|
+
In the previous examples, datasets were saved in the embedded database
|
|
216
|
+
(`SQLite`_ in folder `.datachain` of the working directory).
|
|
217
|
+
These datasets were automatically versioned, and can be accessed using
|
|
222
218
|
`DataChain.from_dataset("dataset_name")`.
|
|
223
219
|
|
|
220
|
+
Here is how to retrieve a saved dataset and iterate over the objects:
|
|
221
|
+
|
|
224
222
|
.. code:: py
|
|
225
223
|
|
|
226
224
|
chain = DataChain.from_dataset("response")
|
|
227
225
|
|
|
228
|
-
# Iterating one-by-one: out
|
|
226
|
+
# Iterating one-by-one: supports out-of-memory workflows
|
|
229
227
|
for file, response in chain.limit(5).collect("file", "response"):
|
|
230
|
-
#
|
|
228
|
+
# verify the collected Python objects
|
|
231
229
|
assert isinstance(response, ChatCompletionResponse)
|
|
232
230
|
|
|
233
231
|
status = response.choices[0].message.content[:7]
|
|
@@ -248,9 +246,8 @@ Output:
|
|
|
248
246
|
Vectorized analytics over Python objects
|
|
249
247
|
========================================
|
|
250
248
|
|
|
251
|
-
Some operations can
|
|
252
|
-
|
|
253
|
-
Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
|
|
249
|
+
Some operations can run inside the DB without deserialization.
|
|
250
|
+
For instance, let's calculate the total cost of using the LLM APIs, assuming that Mixtral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
|
|
254
251
|
|
|
255
252
|
.. code:: py
|
|
256
253
|
|
|
@@ -322,6 +319,7 @@ Community and Support
|
|
|
322
319
|
.. github-only
|
|
323
320
|
.. _Contributor Guide: CONTRIBUTING.rst
|
|
324
321
|
.. _Pydantic: https://github.com/pydantic/pydantic
|
|
322
|
+
.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
|
|
325
323
|
.. _SQLite: https://www.sqlite.org/
|
|
326
324
|
.. _Getting Started: https://datachain.dvc.ai/
|
|
327
325
|
.. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
|
|
@@ -58,8 +58,8 @@ def trim_text(text):
|
|
|
58
58
|
match = re.search(r'[A-Z][^.]*\.', text)
|
|
59
59
|
return match.group(0) if match else ''
|
|
60
60
|
|
|
61
|
-
images = chain.
|
|
62
|
-
captions = chain.
|
|
61
|
+
images = chain.collect("file")
|
|
62
|
+
captions = chain.collect("scene")
|
|
63
63
|
_ , axes = plt.subplots(1, len(captions), figsize=(15, 5))
|
|
64
64
|
|
|
65
65
|
for ax, img, caption in zip(axes, images, captions):
|