datachain 0.2.9__tar.gz → 0.2.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.2.9 → datachain-0.2.11}/.github/workflows/tests.yml +71 -1
- {datachain-0.2.9/src/datachain.egg-info → datachain-0.2.11}/PKG-INFO +14 -12
- {datachain-0.2.9 → datachain-0.2.11}/README.rst +6 -5
- {datachain-0.2.9 → datachain-0.2.11}/examples/json-csv-reader.py +4 -2
- datachain-0.2.11/examples/llm-claude-aggregate-query.py +57 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/llm-claude-simple-query.py +31 -14
- datachain-0.2.11/examples/llm-claude.py +39 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/multimodal/clip_fine_tuning.ipynb +114 -111
- {datachain-0.2.9 → datachain-0.2.11}/examples/openimage-detect.py +1 -1
- {datachain-0.2.9 → datachain-0.2.11}/examples/pose_detection.py +1 -2
- {datachain-0.2.9 → datachain-0.2.11}/examples/wds.py +3 -6
- {datachain-0.2.9 → datachain-0.2.11}/mkdocs.yml +0 -3
- {datachain-0.2.9 → datachain-0.2.11}/pyproject.toml +5 -4
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/__init__.py +17 -8
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/catalog/catalog.py +5 -5
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/cli.py +0 -2
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/schema.py +5 -5
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/sqlite.py +1 -1
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/warehouse.py +7 -7
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/arrow.py +25 -8
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/clip.py +6 -11
- datachain-0.2.11/src/datachain/lib/convert/flatten.py +67 -0
- datachain-0.2.11/src/datachain/lib/convert/type_converter.py +96 -0
- datachain-0.2.11/src/datachain/lib/convert/unflatten.py +69 -0
- datachain-0.2.11/src/datachain/lib/convert/values_to_tuples.py +85 -0
- datachain-0.2.11/src/datachain/lib/data_model.py +74 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/dc.py +225 -168
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/file.py +41 -41
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/gpt4_vision.py +1 -9
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/hf_image_to_text.py +9 -17
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/hf_pipeline.py +4 -12
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/image.py +2 -18
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/image_transform.py +0 -1
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/iptc_exif_xmp.py +8 -15
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/meta_formats.py +1 -5
- datachain-0.2.11/src/datachain/lib/model_store.py +77 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/pytorch.py +9 -21
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/signal_schema.py +139 -60
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/text.py +5 -16
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/udf.py +114 -30
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/udf_signature.py +5 -5
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/webdataset.py +3 -3
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/webdataset_laion.py +2 -3
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/node.py +4 -4
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/batch.py +1 -1
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/dataset.py +51 -178
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/dispatch.py +43 -30
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/udf.py +46 -26
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/remote/studio.py +1 -9
- datachain-0.2.11/src/datachain/torch/__init__.py +21 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/utils.py +39 -0
- {datachain-0.2.9 → datachain-0.2.11/src/datachain.egg-info}/PKG-INFO +14 -12
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain.egg-info/SOURCES.txt +10 -8
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain.egg-info/requires.txt +8 -7
- {datachain-0.2.9 → datachain-0.2.11}/tests/conftest.py +1 -1
- {datachain-0.2.9 → datachain-0.2.11}/tests/examples/test_wds_e2e.py +1 -1
- {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_catalog.py +2 -2
- {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_datachain.py +21 -3
- {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_dataset_query.py +40 -53
- {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_datasets.py +2 -2
- datachain-0.2.11/tests/func/test_feature_pickling.py +209 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_pull.py +3 -3
- {datachain-0.2.9 → datachain-0.2.11}/tests/scripts/feature_class.py +3 -2
- {datachain-0.2.9 → datachain-0.2.11}/tests/scripts/feature_class_parallel.py +5 -5
- datachain-0.2.11/tests/scripts/feature_class_parallel_data_model.py +28 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/test_query_e2e.py +55 -14
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_arrow.py +17 -3
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_datachain.py +230 -133
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_datachain_bootstrap.py +5 -5
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_datachain_merge.py +15 -15
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_feature.py +86 -152
- datachain-0.2.11/tests/unit/lib/test_feature_utils.py +109 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_image.py +1 -1
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_signal_schema.py +22 -27
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_udf_signature.py +6 -5
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_utils.py +5 -5
- datachain-0.2.11/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_dataset.py +3 -3
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_listing.py +2 -2
- datachain-0.2.11/tests/unit/test_module_exports.py +93 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_udf.py +14 -60
- datachain-0.2.9/docs/tutorials/cv_intro.md +0 -217
- datachain-0.2.9/docs/tutorials/udfs.md +0 -94
- datachain-0.2.9/examples/llm-claude-aggregate-query.py +0 -40
- datachain-0.2.9/examples/llm-claude.py +0 -21
- datachain-0.2.9/src/datachain/image/__init__.py +0 -3
- datachain-0.2.9/src/datachain/lib/cached_stream.py +0 -38
- datachain-0.2.9/src/datachain/lib/claude.py +0 -69
- datachain-0.2.9/src/datachain/lib/feature.py +0 -412
- datachain-0.2.9/src/datachain/lib/feature_registry.py +0 -51
- datachain-0.2.9/src/datachain/lib/feature_utils.py +0 -154
- datachain-0.2.9/tests/unit/lib/test_feature_utils.py +0 -142
- datachain-0.2.9/tests/unit/test_module_exports.py +0 -30
- {datachain-0.2.9 → datachain-0.2.11}/.cruft.json +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.gitattributes +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.github/codecov.yaml +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.github/dependabot.yml +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.github/workflows/release.yml +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.gitignore +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.pre-commit-config.yaml +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/.reuse/dep5 +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/CONTRIBUTING.rst +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/LICENSE +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/LICENSES/Apache-2.0.txt +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/LICENSES/BSD-3-Clause.txt +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/LICENSES/Python-2.0.txt +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/docs/assets/datachain.png +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/docs/index.md +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/docs/references/catalog.md +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/docs/references/datachain.md +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/blip2_image_desc_lib.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/clip.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/common_sql_functions.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/README.md +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/hf_pipeline.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/llava2_image_desc_lib.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/loader.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/README +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/distance_to_query.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/llm_chat.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/requirements.txt +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/single_query.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/text_loaders.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/openai_image_desc_lib.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/torch-loader.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/batching.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/image_transformation.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/parallel.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/simple.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/stateful.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/stateful_similarity.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/unstructured-text.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/wds_filtered.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/zalando/zalando_clip.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/zalando/zalando_dir_as_class.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/zalando/zalando_splits_and_classes_ds.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/examples/zalando/zalando_splits_and_classes_output.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/noxfile.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/setup.cfg +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/__main__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/asyn.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/cache.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/cli_utils.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/azure.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/gcs.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/local.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/s3.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/config.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/dataset.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/error.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.2.9/src/datachain/remote → datachain-0.2.11/src/datachain/lib/convert}/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/settings.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/unstructured.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/utils.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/listing.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/progress.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/py.typed +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/builtins.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/metrics.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/params.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/schema.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/session.py +0 -0
- {datachain-0.2.9/tests/benchmarks → datachain-0.2.11/src/datachain/remote}/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/types.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/utils.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/storage.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain/text/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/__init__.py +0 -0
- {datachain-0.2.9/tests/examples → datachain-0.2.11/tests/benchmarks}/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/data.py +0 -0
- {datachain-0.2.9/tests/func → datachain-0.2.11/tests/examples}/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/examples/wds_data.py +0 -0
- {datachain-0.2.9/tests/unit → datachain-0.2.11/tests/func}/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_client.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_ls.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_pytorch.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_query.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/scripts/name_len_normal.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/test_cli_e2e.py +0 -0
- {datachain-0.2.9/tests/unit/lib → datachain-0.2.11/tests/unit}/__init__.py +0 -0
- {datachain-0.2.9/tests/unit/sql → datachain-0.2.11/tests/unit/lib}/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.2.9/tests/unit/sql/sqlite → datachain-0.2.11/tests/unit/sql}/__init__.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_asyn.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_cache.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_catalog.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_client.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_metastore.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_query_params.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_serializer.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_session.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_storage.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_utils.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.2.9 → datachain-0.2.11}/tests/utils.py +0 -0
|
@@ -50,7 +50,7 @@ jobs:
|
|
|
50
50
|
- name: Lint code
|
|
51
51
|
run: nox -s lint
|
|
52
52
|
|
|
53
|
-
|
|
53
|
+
datachain:
|
|
54
54
|
timeout-minutes: 25
|
|
55
55
|
runs-on: ${{ matrix.os }}
|
|
56
56
|
strategy:
|
|
@@ -125,3 +125,73 @@ jobs:
|
|
|
125
125
|
|
|
126
126
|
- name: Build docs
|
|
127
127
|
run: nox -s docs
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
studio:
|
|
131
|
+
if: '!github.event.pull_request.head.repo.fork'
|
|
132
|
+
runs-on: ubuntu-latest-16-cores
|
|
133
|
+
strategy:
|
|
134
|
+
matrix:
|
|
135
|
+
pyv: ['3.12']
|
|
136
|
+
group: [1, 2, 3, 4, 5, 6]
|
|
137
|
+
services:
|
|
138
|
+
postgres:
|
|
139
|
+
image: postgres:16.3
|
|
140
|
+
ports:
|
|
141
|
+
- 5432:5432
|
|
142
|
+
env:
|
|
143
|
+
POSTGRES_USER: test
|
|
144
|
+
POSTGRES_DB: database
|
|
145
|
+
POSTGRES_HOST_AUTH_METHOD: trust
|
|
146
|
+
clickhouse:
|
|
147
|
+
image: clickhouse/clickhouse-server:24
|
|
148
|
+
ports:
|
|
149
|
+
- 8123:8123
|
|
150
|
+
- 9010:9000
|
|
151
|
+
env:
|
|
152
|
+
CLICKHOUSE_DB: studio_local_db
|
|
153
|
+
CLICKHOUSE_USER: studio_local
|
|
154
|
+
CLICKHOUSE_PASSWORD: ch123456789!
|
|
155
|
+
CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
|
|
156
|
+
redis:
|
|
157
|
+
image: redis:7.2.5
|
|
158
|
+
ports:
|
|
159
|
+
- 6379:6379
|
|
160
|
+
steps:
|
|
161
|
+
|
|
162
|
+
- name: Check out Studio
|
|
163
|
+
uses: actions/checkout@v4
|
|
164
|
+
with:
|
|
165
|
+
fetch-depth: 0
|
|
166
|
+
repository: iterative/studio
|
|
167
|
+
ref: develop
|
|
168
|
+
token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
|
|
169
|
+
|
|
170
|
+
- name: Check out repository
|
|
171
|
+
uses: actions/checkout@v4
|
|
172
|
+
with:
|
|
173
|
+
path: './backend/datachain'
|
|
174
|
+
fetch-depth: 0
|
|
175
|
+
|
|
176
|
+
- name: Set up Python ${{ matrix.pyv }}
|
|
177
|
+
uses: actions/setup-python@v5
|
|
178
|
+
with:
|
|
179
|
+
python-version: ${{ matrix.pyv }}
|
|
180
|
+
cache: 'pip'
|
|
181
|
+
|
|
182
|
+
- name: Install uv
|
|
183
|
+
run: |
|
|
184
|
+
python -m pip install --upgrade uv
|
|
185
|
+
uv --version
|
|
186
|
+
|
|
187
|
+
- name: Install dependencies
|
|
188
|
+
run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
|
|
189
|
+
|
|
190
|
+
- name: Run tests
|
|
191
|
+
# Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
|
|
192
|
+
run: >
|
|
193
|
+
pytest
|
|
194
|
+
--config-file=pyproject.toml -rsx
|
|
195
|
+
--splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
|
|
196
|
+
tests ../datachain/tests
|
|
197
|
+
working-directory: backend/datachain_server
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.11
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -35,10 +35,12 @@ Requires-Dist: shtab<2,>=1.3.4
|
|
|
35
35
|
Requires-Dist: sqlalchemy>=2
|
|
36
36
|
Requires-Dist: multiprocess==0.70.16
|
|
37
37
|
Requires-Dist: dill==0.3.8
|
|
38
|
+
Requires-Dist: cloudpickle
|
|
38
39
|
Requires-Dist: ujson>=5.9.0
|
|
39
40
|
Requires-Dist: pydantic<3,>=2
|
|
40
41
|
Requires-Dist: jmespath>=1.0
|
|
41
42
|
Requires-Dist: datamodel-code-generator>=0.25
|
|
43
|
+
Requires-Dist: Pillow<11,>=10.0.0
|
|
42
44
|
Provides-Extra: docs
|
|
43
45
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
44
46
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
@@ -46,11 +48,10 @@ Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
|
|
|
46
48
|
Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
|
|
47
49
|
Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
|
|
48
50
|
Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
|
|
49
|
-
Provides-Extra:
|
|
50
|
-
Requires-Dist:
|
|
51
|
-
Requires-Dist:
|
|
52
|
-
Requires-Dist:
|
|
53
|
-
Requires-Dist: transformers>=4.36.0; extra == "cv"
|
|
51
|
+
Provides-Extra: torch
|
|
52
|
+
Requires-Dist: torch>=2.1.0; extra == "torch"
|
|
53
|
+
Requires-Dist: torchvision; extra == "torch"
|
|
54
|
+
Requires-Dist: transformers>=4.36.0; extra == "torch"
|
|
54
55
|
Provides-Extra: remote
|
|
55
56
|
Requires-Dist: lz4; extra == "remote"
|
|
56
57
|
Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
|
|
@@ -58,7 +59,7 @@ Requires-Dist: requests>=2.22.0; extra == "remote"
|
|
|
58
59
|
Provides-Extra: vector
|
|
59
60
|
Requires-Dist: usearch; extra == "vector"
|
|
60
61
|
Provides-Extra: tests
|
|
61
|
-
Requires-Dist: datachain[
|
|
62
|
+
Requires-Dist: datachain[remote,torch,vector]; extra == "tests"
|
|
62
63
|
Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
63
64
|
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
64
65
|
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
@@ -89,11 +90,11 @@ Requires-Dist: types-ujson; extra == "dev"
|
|
|
89
90
|
.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
|
|
90
91
|
:target: https://pypi.org/project/datachain
|
|
91
92
|
:alt: Python Version
|
|
92
|
-
.. |Codecov| image:: https://codecov.io/gh/iterative/
|
|
93
|
-
:target: https://
|
|
93
|
+
.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
|
|
94
|
+
:target: https://codecov.io/gh/iterative/datachain
|
|
94
95
|
:alt: Codecov
|
|
95
|
-
.. |Tests| image:: https://github.com/iterative/
|
|
96
|
-
:target: https://github.com/iterative/
|
|
96
|
+
.. |Tests| image:: https://github.com/iterative/datachain/workflows/Tests/badge.svg
|
|
97
|
+
:target: https://github.com/iterative/datachain/actions?workflow=Tests
|
|
97
98
|
:alt: Tests
|
|
98
99
|
|
|
99
100
|
AI 🔗 DataChain
|
|
@@ -397,7 +398,8 @@ Chain results can be exported or passed directly to Pytorch dataloader. For exam
|
|
|
397
398
|
Tutorials
|
|
398
399
|
------------------
|
|
399
400
|
|
|
400
|
-
* `
|
|
401
|
+
* `Computer Vision <examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`__)
|
|
402
|
+
* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
|
|
401
403
|
|
|
402
404
|
Contributions
|
|
403
405
|
--------------------
|
|
@@ -6,11 +6,11 @@
|
|
|
6
6
|
.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
|
|
7
7
|
:target: https://pypi.org/project/datachain
|
|
8
8
|
:alt: Python Version
|
|
9
|
-
.. |Codecov| image:: https://codecov.io/gh/iterative/
|
|
10
|
-
:target: https://
|
|
9
|
+
.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
|
|
10
|
+
:target: https://codecov.io/gh/iterative/datachain
|
|
11
11
|
:alt: Codecov
|
|
12
|
-
.. |Tests| image:: https://github.com/iterative/
|
|
13
|
-
:target: https://github.com/iterative/
|
|
12
|
+
.. |Tests| image:: https://github.com/iterative/datachain/workflows/Tests/badge.svg
|
|
13
|
+
:target: https://github.com/iterative/datachain/actions?workflow=Tests
|
|
14
14
|
:alt: Tests
|
|
15
15
|
|
|
16
16
|
AI 🔗 DataChain
|
|
@@ -314,7 +314,8 @@ Chain results can be exported or passed directly to Pytorch dataloader. For exam
|
|
|
314
314
|
Tutorials
|
|
315
315
|
------------------
|
|
316
316
|
|
|
317
|
-
* `
|
|
317
|
+
* `Computer Vision <examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`__)
|
|
318
|
+
* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
|
|
318
319
|
|
|
319
320
|
Contributions
|
|
320
321
|
--------------------
|
|
@@ -103,7 +103,8 @@ def main():
|
|
|
103
103
|
print("========================================================================")
|
|
104
104
|
print("static CSV with header schema test parsing 3.5K objects")
|
|
105
105
|
print("========================================================================")
|
|
106
|
-
static_csv_ds = DataChain.from_csv(uri,
|
|
106
|
+
static_csv_ds = DataChain.from_csv(uri, output=ChatFeature, object_name="chat")
|
|
107
|
+
static_csv_ds.print_schema()
|
|
107
108
|
print(static_csv_ds.to_pandas())
|
|
108
109
|
|
|
109
110
|
uri = "gs://datachain-demo/laion-aesthetics-csv"
|
|
@@ -111,7 +112,8 @@ def main():
|
|
|
111
112
|
print("========================================================================")
|
|
112
113
|
print("dynamic CSV with header schema test parsing 3M objects")
|
|
113
114
|
print("========================================================================")
|
|
114
|
-
dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion"
|
|
115
|
+
dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion")
|
|
116
|
+
dynamic_csv_ds.print_schema()
|
|
115
117
|
print(dynamic_csv_ds.to_pandas())
|
|
116
118
|
|
|
117
119
|
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import anthropic
|
|
4
|
+
from anthropic.types import Message
|
|
5
|
+
|
|
6
|
+
from datachain import Column, DataChain
|
|
7
|
+
from datachain.sql.functions import path
|
|
8
|
+
|
|
9
|
+
DATA = "gs://dvcx-datalakes/chatbot-public"
|
|
10
|
+
MODEL = "claude-3-opus-20240229"
|
|
11
|
+
PROMPT = """Consider the following dialogues between the 'user' and the 'bot' separated\
|
|
12
|
+
by '===='. The 'user' is a human trying to find the best mobile plan. The 'bot' is a \
|
|
13
|
+
chatbot designed to query the user and offer the best solution. The dialog is \
|
|
14
|
+
successful if the 'bot' is able to gather the information and offer a plan, or inform \
|
|
15
|
+
the user that such plan does not exist. The dialog is not successful if the \
|
|
16
|
+
conversation ends early or the 'user' requests additional functions the 'bot' \
|
|
17
|
+
cannot perform. Read the dialogues and classify them into a fixed number of concise \
|
|
18
|
+
failure reasons covering most failure cases. Present output as JSON list of reason \
|
|
19
|
+
strings and nothing else.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
TEMPERATURE = 0.9
|
|
23
|
+
DEFAULT_OUTPUT_TOKENS = 1024
|
|
24
|
+
|
|
25
|
+
API_KEY = os.environ.get("ANTHROPIC_API_KEY")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
chain = (
|
|
29
|
+
DataChain.from_storage(DATA, type="text")
|
|
30
|
+
.filter(Column("file.name").glob("*.txt"))
|
|
31
|
+
.limit(5)
|
|
32
|
+
.settings(parallel=4, cache=True)
|
|
33
|
+
.agg(
|
|
34
|
+
dialogues=lambda file: ["\n=====\n".join(f.read() for f in file)],
|
|
35
|
+
output=str,
|
|
36
|
+
partition_by=path.file_ext(Column("name")),
|
|
37
|
+
)
|
|
38
|
+
.setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
|
|
39
|
+
.map(
|
|
40
|
+
claude=lambda client, dialogues: client.messages.create(
|
|
41
|
+
model=MODEL,
|
|
42
|
+
system=PROMPT,
|
|
43
|
+
messages=[
|
|
44
|
+
{"role": "user", "content": dialogues},
|
|
45
|
+
],
|
|
46
|
+
temperature=TEMPERATURE,
|
|
47
|
+
max_tokens=DEFAULT_OUTPUT_TOKENS,
|
|
48
|
+
),
|
|
49
|
+
output=Message,
|
|
50
|
+
)
|
|
51
|
+
.map(
|
|
52
|
+
res=lambda claude: claude.content[0].text if claude.content else [],
|
|
53
|
+
output=str,
|
|
54
|
+
)
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
chain.show()
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import os
|
|
2
3
|
|
|
3
|
-
import
|
|
4
|
+
import anthropic
|
|
5
|
+
from anthropic.types import Message
|
|
6
|
+
from pydantic import BaseModel
|
|
4
7
|
|
|
5
|
-
from datachain
|
|
6
|
-
from datachain.lib.dc import C, DataChain
|
|
7
|
-
from datachain.lib.feature import Feature
|
|
8
|
+
from datachain import Column, DataChain, File
|
|
8
9
|
|
|
9
|
-
|
|
10
|
+
DATA = "gs://dvcx-datalakes/chatbot-public"
|
|
10
11
|
MODEL = "claude-3-opus-20240229"
|
|
11
12
|
PROMPT = """Consider the dialogue between the 'user' and the 'bot'. \
|
|
12
13
|
The 'user' is a human trying to find the best mobile plan. \
|
|
@@ -20,19 +21,38 @@ if it is successful, and 'Failure' if not. After that, provide \
|
|
|
20
21
|
one-sentence explanation of the reasons for this rating. Use only \
|
|
21
22
|
JSON object as output with the keys 'status', and 'explanation'.
|
|
22
23
|
"""
|
|
24
|
+
TEMPERATURE = 0.9
|
|
25
|
+
DEFAULT_OUTPUT_TOKENS = 1024
|
|
23
26
|
|
|
27
|
+
API_KEY = os.environ.get("ANTHROPIC_API_KEY")
|
|
24
28
|
|
|
25
|
-
|
|
29
|
+
|
|
30
|
+
class Rating(BaseModel):
|
|
26
31
|
status: str = ""
|
|
27
32
|
explanation: str = ""
|
|
28
33
|
|
|
29
34
|
|
|
30
35
|
chain = (
|
|
31
|
-
DataChain.from_storage(
|
|
32
|
-
.filter(
|
|
33
|
-
.settings(parallel=3)
|
|
36
|
+
DataChain.from_storage(DATA, type="text")
|
|
37
|
+
.filter(Column("file.name").glob("*.txt"))
|
|
34
38
|
.limit(5)
|
|
35
|
-
.
|
|
39
|
+
.settings(parallel=4, cache=True)
|
|
40
|
+
.setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
|
|
41
|
+
.map(
|
|
42
|
+
claude=lambda client, file: client.messages.create(
|
|
43
|
+
model=MODEL,
|
|
44
|
+
system=PROMPT,
|
|
45
|
+
messages=[
|
|
46
|
+
{
|
|
47
|
+
"role": "user",
|
|
48
|
+
"content": file.read() if isinstance(file, File) else file,
|
|
49
|
+
},
|
|
50
|
+
],
|
|
51
|
+
temperature=TEMPERATURE,
|
|
52
|
+
max_tokens=DEFAULT_OUTPUT_TOKENS,
|
|
53
|
+
),
|
|
54
|
+
output=Message,
|
|
55
|
+
)
|
|
36
56
|
.map(
|
|
37
57
|
rating=lambda claude: Rating(
|
|
38
58
|
**(json.loads(claude.content[0].text) if claude.content else {})
|
|
@@ -41,7 +61,4 @@ chain = (
|
|
|
41
61
|
)
|
|
42
62
|
)
|
|
43
63
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
with pd.option_context("display.max_columns", None):
|
|
47
|
-
print(df)
|
|
64
|
+
chain.show()
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import anthropic
|
|
4
|
+
from anthropic.types import Message
|
|
5
|
+
|
|
6
|
+
from datachain import Column, DataChain, File
|
|
7
|
+
|
|
8
|
+
DATA = "gs://dvcx-datalakes/chatbot-public"
|
|
9
|
+
MODEL = "claude-3-opus-20240229"
|
|
10
|
+
PROMPT = """Summarise the dialog in a sentence"""
|
|
11
|
+
TEMPERATURE = 0.9
|
|
12
|
+
DEFAULT_OUTPUT_TOKENS = 1024
|
|
13
|
+
|
|
14
|
+
API_KEY = os.environ.get("ANTHROPIC_API_KEY")
|
|
15
|
+
|
|
16
|
+
chain = (
|
|
17
|
+
DataChain.from_storage(DATA, type="text")
|
|
18
|
+
.filter(Column("file.name").glob("*.txt"))
|
|
19
|
+
.limit(5)
|
|
20
|
+
.settings(parallel=4, cache=True)
|
|
21
|
+
.setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
|
|
22
|
+
.map(
|
|
23
|
+
claude=lambda client, file: client.messages.create(
|
|
24
|
+
model=MODEL,
|
|
25
|
+
system=PROMPT,
|
|
26
|
+
messages=[
|
|
27
|
+
{
|
|
28
|
+
"role": "user",
|
|
29
|
+
"content": file.read() if isinstance(file, File) else file,
|
|
30
|
+
},
|
|
31
|
+
],
|
|
32
|
+
temperature=TEMPERATURE,
|
|
33
|
+
max_tokens=DEFAULT_OUTPUT_TOKENS,
|
|
34
|
+
),
|
|
35
|
+
output=Message,
|
|
36
|
+
)
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
chain.show()
|