datachain 0.3.7__tar.gz → 0.3.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datachain-0.3.7 → datachain-0.3.9}/.github/workflows/tests.yml +1 -1
- {datachain-0.3.7 → datachain-0.3.9}/.pre-commit-config.yaml +1 -1
- {datachain-0.3.7/src/datachain.egg-info → datachain-0.3.9}/PKG-INFO +19 -15
- {datachain-0.3.7 → datachain-0.3.9}/README.rst +11 -12
- {datachain-0.3.7 → datachain-0.3.9}/examples/llm_and_nlp/unstructured-text.py +1 -1
- {datachain-0.3.7 → datachain-0.3.9}/examples/multimodal/wds_filtered.py +1 -3
- {datachain-0.3.7 → datachain-0.3.9}/pyproject.toml +10 -4
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/catalog/catalog.py +2 -92
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/cli.py +0 -37
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/arrow.py +5 -5
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/clip.py +14 -3
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/python_to_sql.py +9 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/data_model.py +10 -1
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/dc.py +135 -39
- datachain-0.3.9/src/datachain/lib/hf.py +166 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/image.py +9 -1
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/pytorch.py +1 -2
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/signal_schema.py +124 -20
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/text.py +4 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/udf.py +14 -20
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/webdataset.py +1 -1
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/dataset.py +24 -9
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/session.py +5 -3
- {datachain-0.3.7 → datachain-0.3.9/src/datachain.egg-info}/PKG-INFO +19 -15
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain.egg-info/SOURCES.txt +3 -4
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain.egg-info/requires.txt +8 -2
- {datachain-0.3.7 → datachain-0.3.9}/tests/examples/wds_data.py +11 -11
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_catalog.py +30 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_datasets.py +0 -127
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_feature_pickling.py +70 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_pytorch.py +17 -2
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/conftest.py +5 -2
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_arrow.py +3 -3
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_datachain.py +54 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_feature.py +3 -2
- datachain-0.3.9/tests/unit/lib/test_hf.py +132 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_signal_schema.py +92 -3
- datachain-0.3.7/examples/computer_vision/blip2_image_desc_lib.py +0 -100
- datachain-0.3.7/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -64
- datachain-0.3.7/examples/llm_and_nlp/llm-claude.py +0 -46
- {datachain-0.3.7 → datachain-0.3.9}/.cruft.json +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/.gitattributes +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/.github/codecov.yaml +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/.github/dependabot.yml +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/.github/workflows/release.yml +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/.gitignore +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/LICENSE +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/docs/assets/datachain.png +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/docs/index.md +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/docs/references/datachain.md +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/docs/references/datatype.md +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/docs/references/file.md +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/docs/references/index.md +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/docs/references/sql.md +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/docs/references/torch.md +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/docs/references/udf.md +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/udfs/stateful.py +0 -0
- /datachain-0.3.7/examples/llm_and_nlp/llm-claude-simple-query.py → /datachain-0.3.9/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/mkdocs.yml +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/noxfile.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/setup.cfg +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/__main__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/asyn.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/cache.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/local.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/config.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/dataset.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/error.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/job.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/file.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/listing.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/listing.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/node.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/progress.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/py.typed +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/builtins.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/params.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/schema.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/storage.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain/utils.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/conftest.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/data.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/examples/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_client.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_datachain.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_listing.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_ls.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_pull.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_query.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_client.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_session.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_udf.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.7 → datachain-0.3.9}/tests/utils.py +0 -0
{datachain-0.3.7/src/datachain.egg-info → datachain-0.3.9}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.7
+Version: 0.3.9
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -42,6 +42,7 @@ Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<11,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
+Requires-Dist: psutil
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -58,8 +59,11 @@ Requires-Dist: lz4; extra == "remote"
 Requires-Dist: requests>=2.22.0; extra == "remote"
 Provides-Extra: vector
 Requires-Dist: usearch; extra == "vector"
+Provides-Extra: hf
+Requires-Dist: numba>=0.60.0; extra == "hf"
+Requires-Dist: datasets[audio,vision]; extra == "hf"
 Provides-Extra: tests
-Requires-Dist: datachain[remote,torch,vector]; extra == "tests"
+Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
 Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
@@ -74,9 +78,10 @@ Requires-Dist: hypothesis; extra == "tests"
 Requires-Dist: open_clip_torch; extra == "tests"
 Requires-Dist: aiotools>=1.7.0; extra == "tests"
 Requires-Dist: requests-mock; extra == "tests"
+Requires-Dist: scipy; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.11.
+Requires-Dist: mypy==1.11.2; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
@@ -110,31 +115,30 @@ AI 🔗 DataChain
 
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
 
 Key Features
 ============
 
 📂 **Storage as a Source of Truth.**
-   - Process unstructured data without redundant copies
+   - Process unstructured data without redundant copies from S3, GCP, Azure, and local
      file systems.
-   - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-
+   - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+   - Unite files and metadata together into persistent, versioned, columnar datasets.
 
 🐍 **Python-friendly data pipelines.**
    - Operate on Python objects and object fields.
-   - Built-in parallelization and out-of-memory compute without
-     Spark jobs.
+   - Built-in parallelization and out-of-memory compute without SQL or Spark.
 
 🧠 **Data Enrichment and Processing.**
-   - Generate metadata
-   - Filter, join, and group by
-   - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+   - Generate metadata using local AI models and LLM APIs.
+   - Filter, join, and group by metadata. Search by vector embeddings.
+   - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
 
 🚀 **Efficiency.**
    - Parallelization, out-of-memory workloads and data caching.
   - Vectorized operations on Python object fields: sum, count, avg, etc.
-
+   - Optimized vector search.
 
 
 Quick Start
@@ -159,7 +163,7 @@ where each image has a matching JSON file like `cat.1009.json`:
     "inference": {"class": "dog", "confidence": 0.68}
 }
 
-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:
 
 
 .. code:: py
@@ -229,7 +233,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================
 
-LLMs can work as
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai
 
{datachain-0.3.7 → datachain-0.3.9}/README.rst

@@ -18,31 +18,30 @@ AI 🔗 DataChain
 
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
 
 Key Features
 ============
 
 📂 **Storage as a Source of Truth.**
-   - Process unstructured data without redundant copies
+   - Process unstructured data without redundant copies from S3, GCP, Azure, and local
      file systems.
-   - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-
+   - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+   - Unite files and metadata together into persistent, versioned, columnar datasets.
 
 🐍 **Python-friendly data pipelines.**
    - Operate on Python objects and object fields.
-   - Built-in parallelization and out-of-memory compute without
-     Spark jobs.
+   - Built-in parallelization and out-of-memory compute without SQL or Spark.
 
 🧠 **Data Enrichment and Processing.**
-   - Generate metadata
-   - Filter, join, and group by
-   - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+   - Generate metadata using local AI models and LLM APIs.
+   - Filter, join, and group by metadata. Search by vector embeddings.
+   - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
 
 🚀 **Efficiency.**
    - Parallelization, out-of-memory workloads and data caching.
   - Vectorized operations on Python object fields: sum, count, avg, etc.
-
+   - Optimized vector search.
 
 
 Quick Start
@@ -67,7 +66,7 @@ where each image has a matching JSON file like `cat.1009.json`:
     "inference": {"class": "dog", "confidence": 0.68}
 }
 
-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:
 
 
 .. code:: py
@@ -137,7 +136,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================
 
-LLMs can work as
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai
 
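Note: the renamed "high-confidence cat" example sits outside the changed lines, so the diff above does not show it. As a rough sketch of what such a filter looks like with the 0.3.x API (the join keys and the exact chain below are assumptions based on the JSON snippet quoted in the hunk, not the README's verbatim code):

from datachain import C, DataChain

# hypothetical reconstruction: pair images with their JSON sidecars via the
# shared filename stem, then keep only confident "cat" inferences
images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")

images_id = images.map(id=lambda file: file.path.split(".")[-2])
annotated = images_id.merge(meta, on="id", right_on="meta.id")  # assumed key names

likely_cats = annotated.filter(
    (C("meta.inference.class_") == "cat") & (C("meta.inference.confidence") > 0.9)
)
likely_cats.export_files("high-confidence-cats/", signal="file")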
{datachain-0.3.7 → datachain-0.3.9}/examples/multimodal/wds_filtered.py

@@ -1,13 +1,11 @@
 import datachain.error
 from datachain import C, DataChain
-from datachain.lib.model_store import ModelStore
 from datachain.lib.webdataset import process_webdataset
-from datachain.lib.webdataset_laion import LaionMeta, WDSLaion
+from datachain.lib.webdataset_laion import WDSLaion
 from datachain.sql import literal
 from datachain.sql.functions import array, greatest, least, string
 
 name = "wds"
-ModelStore.register(LaionMeta)
 try:
     wds = DataChain.from_dataset(name=name)
 except datachain.error.DatasetNotFoundError:
{datachain-0.3.7 → datachain-0.3.9}/pyproject.toml

@@ -44,7 +44,8 @@ dependencies = [
     "jmespath>=1.0",
     "datamodel-code-generator>=0.25",
     "Pillow>=10.0.0,<11",
-    "msgpack>=1.0.4,<2"
+    "msgpack>=1.0.4,<2",
+    "psutil"
 ]
 
 [project.optional-dependencies]
@@ -68,8 +69,12 @@ remote = [
 vector = [
     "usearch"
 ]
+hf = [
+    "numba>=0.60.0",
+    "datasets[audio,vision]"
+]
 tests = [
-    "datachain[torch,remote,vector]",
+    "datachain[torch,remote,vector,hf]",
     "pytest>=8,<9",
     "pytest-sugar>=0.9.6",
     "pytest-cov>=4.1.0",
@@ -83,11 +88,12 @@ tests = [
     "hypothesis",
     "open_clip_torch",
     "aiotools>=1.7.0",
-    "requests-mock"
+    "requests-mock",
+    "scipy"
 ]
 dev = [
     "datachain[docs,tests]",
-    "mypy==1.11.
+    "mypy==1.11.2",
     "types-python-dateutil",
     "types-pytz",
     "types-PyYAML",
{datachain-0.3.7 → datachain-0.3.9}/src/datachain/catalog/catalog.py

@@ -1540,87 +1540,6 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.update_dataset(dataset, **update_data)
 
-    def merge_datasets(
-        self,
-        src: DatasetRecord,
-        dst: DatasetRecord,
-        src_version: int,
-        dst_version: Optional[int] = None,
-    ) -> DatasetRecord:
-        """
-        Merges records from source to destination dataset.
-        It will create new version
-        of a dataset with records merged from old version and the source, unless
-        existing version is specified for destination in which case it must
-        be in non final status as datasets are immutable
-        """
-        if (
-            dst_version
-            and not dst.is_valid_next_version(dst_version)
-            and dst.get_version(dst_version).is_final_status()
-        ):
-            raise DatasetInvalidVersionError(
-                f"Version {dst_version} must be higher than the current latest one"
-            )
-
-        src_dep = self.get_dataset_dependencies(src.name, src_version)
-        dst_dep = self.get_dataset_dependencies(
-            dst.name,
-            dst.latest_version,  # type: ignore[arg-type]
-        )
-
-        if dst.has_version(dst_version):  # type: ignore[arg-type]
-            # case where we don't create new version, but append to the existing one
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version=dst_version,  # type: ignore[arg-type]
-            )
-            merged_schema = src.serialized_schema | dst.serialized_schema
-            self.update_dataset(dst, schema=merged_schema)
-            self.update_dataset_version_with_warehouse_info(
-                dst,
-                dst_version,  # type: ignore[arg-type]
-                schema=merged_schema,
-            )
-            for dep in src_dep:
-                if dep and dep not in dst_dep:
-                    self.metastore.add_dependency(
-                        dep,
-                        dst.name,
-                        dst_version,  # type: ignore[arg-type]
-                    )
-        else:
-            # case where we create new version of merged results
-            src_dr = self.warehouse.dataset_rows(src, src_version)
-            dst_dr = self.warehouse.dataset_rows(dst)
-
-            merge_result_columns = list(
-                {
-                    c.name: c for c in list(src_dr.table.c) + list(dst_dr.table.c)
-                }.values()
-            )
-
-            dst_version = dst_version or dst.next_version
-            dst = self.create_new_dataset_version(
-                dst,
-                dst_version,
-                columns=merge_result_columns,
-            )
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version,
-            )
-            self.update_dataset_version_with_warehouse_info(dst, dst_version)
-            for dep in set(src_dep + dst_dep):
-                if dep:
-                    self.metastore.add_dependency(dep, dst.name, dst_version)
-
-        return dst
-
     def get_file_signals(
         self, dataset_name: str, dataset_version: int, row: RowDict
     ) -> Optional[dict]:
@@ -1641,17 +1560,8 @@ class Catalog:
         version = self.get_dataset(dataset_name).get_version(dataset_version)
 
         file_signals_values = {}
-        file_schemas = {}
-        # TODO: To remove after we properly fix deserialization
-        for signal, type_name in version.feature_schema.items():
-            from datachain.lib.model_store import ModelStore
-
-            type_name_parsed, v = ModelStore.parse_name_version(type_name)
-            fr = ModelStore.get(type_name_parsed, v)
-            if fr and issubclass(fr, File):
-                file_schemas[signal] = type_name
 
-        schema = SignalSchema.deserialize(file_schemas)
+        schema = SignalSchema.deserialize(version.feature_schema)
         for file_signals in schema.get_signals(File):
             prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
             file_signals_values[file_signals] = {
@@ -1997,7 +1907,7 @@ class Catalog:
         """
         from datachain.query.dataset import ExecutionResult
 
-        feature_file = tempfile.NamedTemporaryFile(
+        feature_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
             dir=os.getcwd(), suffix=".py", delete=False
         )
         _, feature_module = os.path.split(feature_file.name)
{datachain-0.3.7 → datachain-0.3.9}/src/datachain/cli.py

@@ -336,36 +336,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Display size using powers of 1000 not 1024",
     )
 
-    parse_merge_datasets = subp.add_parser(
-        "merge-datasets", parents=[parent_parser], description="Merges datasets"
-    )
-    parse_merge_datasets.add_argument(
-        "--src",
-        action="store",
-        default=None,
-        help="Source dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst",
-        action="store",
-        default=None,
-        help="Destination dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--src-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Source dataset version",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Destination dataset version",
-    )
-
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents"
     )
@@ -996,13 +966,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
             new_name=args.new_name,
             labels=args.labels,
         )
-    elif args.command == "merge-datasets":
-        catalog.merge_datasets(
-            catalog.get_dataset(args.src),
-            catalog.get_dataset(args.dst),
-            args.src_version,
-            dst_version=args.dst_version,
-        )
     elif args.command == "ls":
         ls(
             args.sources,
{datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/arrow.py

@@ -95,7 +95,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = None
         if not column:
             column = f"c{default_column}"
             default_column += 1
-        dtype = _arrow_type_mapper(field.type)  # type: ignore[assignment]
+        dtype = arrow_type_mapper(field.type)  # type: ignore[assignment]
         if field.nullable:
             dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
@@ -103,7 +103,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = None
     return output
 
 
-def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
+def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
 
@@ -122,16 +122,16 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
         return str
     if pa.types.is_list(col_type):
-        return list[_arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
+        return list[arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
     if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
-        return _arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
+        return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
     raise TypeError(f"{col_type!r} datatypes not supported")
 
 
 def _nrows_file(file: File, nrows: int) -> str:
-    tf = NamedTemporaryFile(delete=False)
+    tf = NamedTemporaryFile(delete=False)  # noqa: SIM115
     with file.open(mode="r") as reader:
         with open(tf.name, "a") as writer:
             for row, line in enumerate(reader):
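Note: the rename from `_arrow_type_mapper` to `arrow_type_mapper` makes the pyarrow-to-Python type conversion part of the module's public surface. A minimal sketch of the mappings visible in this hunk (behavior assumed otherwise unchanged):

import pyarrow as pa

from datachain.lib.arrow import arrow_type_mapper

assert arrow_type_mapper(pa.string()) is str                    # strings -> str
assert arrow_type_mapper(pa.list_(pa.string())) == list[str]    # lists recurse on the value type
assert arrow_type_mapper(pa.struct({"x": pa.int64()})) is dict  # structs/maps -> dict
assert arrow_type_mapper(pa.dictionary(pa.int32(), pa.string())) is str  # dictionaries unwrap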
{datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/clip.py

@@ -1,5 +1,5 @@
 import inspect
-from typing import TYPE_CHECKING, Any, Callable, Literal, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 import torch
 from transformers.modeling_utils import PreTrainedModel
@@ -39,6 +39,7 @@ def clip_similarity_scores(
     tokenizer: Callable,
     prob: bool = False,
     image_to_text: bool = True,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> list[list[float]]:
     """
     Calculate CLIP similarity scores between one or more images and/or text.
@@ -52,6 +53,7 @@ def clip_similarity_scores(
     prob : Compute softmax probabilities.
     image_to_text : Whether to compute for image-to-text or text-to-image. Ignored
         if only one of images or text provided.
+    device : Device to use. Defaults is None - use model's device.
 
 
     Example:
@@ -130,17 +132,26 @@ def clip_similarity_scores(
     ```
     """
 
+    if device is None:
+        if hasattr(model, "device"):
+            device = model.device
+        else:
+            device = next(model.parameters()).device
+    else:
+        model = model.to(device)
     with torch.no_grad():
         if images is not None:
             encoder = _get_encoder(model, "image")
             image_features = convert_images(
-                images, transform=preprocess, encoder=encoder
+                images, transform=preprocess, encoder=encoder, device=device
             )
             image_features /= image_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]
 
         if text is not None:
             encoder = _get_encoder(model, "text")
-            text_features = convert_text(text, tokenizer, encoder=encoder)
+            text_features = convert_text(
+                text, tokenizer, encoder=encoder, device=device
+            )
             text_features /= text_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]
 
         if images is not None and text is not None:
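Note: the new `device` argument lets callers pin inference to a device instead of always inheriting the model's. A hedged sketch using the open_clip setup from the function's docstring (the leading parameters are outside this hunk, so their order is taken from that docstring example, not verified here):

import open_clip
from PIL import Image

from datachain.lib.clip import clip_similarity_scores

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")

images = [Image.open("cat.jpg")]  # hypothetical local file
# force CPU scoring regardless of where the model currently lives;
# omitting device keeps the pre-0.3.9 behavior of using the model's device
scores = clip_similarity_scores(
    images, ["a cat", "a dog"], model, preprocess, tokenizer, device="cpu"
)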
{datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/python_to_sql.py

@@ -73,6 +73,9 @@ def python_to_sql(typ):  # noqa: PLR0911
     if len(args) == 2 and (type(None) in args):
         return python_to_sql(args[0])
 
+    if _is_union_str_literal(orig, args):
+        return String
+
     if _is_json_inside_union(orig, args):
         return JSON
 
@@ -94,3 +97,9 @@ def _is_json_inside_union(orig, args) -> bool:
     if any(inspect.isclass(arg) and issubclass(arg, BaseModel) for arg in args):
         return True
     return False
+
+
+def _is_union_str_literal(orig, args) -> bool:
+    if orig != Union:
+        return False
+    return all(arg is str or get_origin(arg) in (Literal, LiteralEx) for arg in args)
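Note: with `_is_union_str_literal`, a union made up solely of `str` and `Literal` members (a common shape for constrained LLM outputs) now maps to a SQL `String` column instead of falling through to other handling. A small sketch, assuming `python_to_sql` returns the column type class itself as the hunk suggests:

from typing import Literal, Union

from datachain.lib.convert.python_to_sql import python_to_sql
from datachain.sql.types import String

Mood = Literal["happy", "neutral", "sad"]
assert python_to_sql(Union[str, Mood]) is String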
{datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/data_model.py

@@ -2,7 +2,7 @@ from collections.abc import Sequence
 from datetime import datetime
 from typing import ClassVar, Union, get_args, get_origin
 
-from pydantic import BaseModel
+from pydantic import BaseModel, create_model
 
 from datachain.lib.model_store import ModelStore
 
@@ -57,3 +57,12 @@ def is_chain_type(t: type) -> bool:
         return is_chain_type(args[0])
 
     return False
+
+
+def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
+    fields = {name: (anno, ...) for name, anno in data_dict.items()}
+    return create_model(
+        name,
+        __base__=(DataModel,),  # type: ignore[call-overload]
+        **fields,
+    )  # type: ignore[call-overload]
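Note: `dict_to_data_model` builds a `DataModel` subclass at runtime from a name-to-type mapping, useful when a signal schema is only known dynamically (the shadowing of `name` inside the comprehension is upstream code as shown). A minimal usage sketch, assuming it behaves like `pydantic.create_model` with required fields:

from datachain.lib.data_model import dict_to_data_model

# hypothetical ad-hoc model with two required fields
Pred = dict_to_data_model("Pred", {"label": str, "confidence": float})
p = Pred(label="cat", confidence=0.92)
assert p.label == "cat" and isinstance(p, Pred)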