datachain 0.3.4__tar.gz → 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- {datachain-0.3.4 → datachain-0.3.6}/.github/workflows/tests-studio.yml +1 -1
- {datachain-0.3.4 → datachain-0.3.6}/.github/workflows/tests.yml +11 -3
- {datachain-0.3.4 → datachain-0.3.6}/.pre-commit-config.yaml +1 -1
- {datachain-0.3.4/src/datachain.egg-info → datachain-0.3.6}/PKG-INFO +3 -4
- {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/torch-loader.py +1 -1
- {datachain-0.3.4 → datachain-0.3.6}/pyproject.toml +4 -5
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/catalog/catalog.py +15 -3
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/sqlite.py +1 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/flatten.py +0 -28
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/dc.py +49 -12
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/signal_schema.py +10 -5
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/dataset.py +42 -22
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/types.py +14 -8
- {datachain-0.3.4 → datachain-0.3.6/src/datachain.egg-info}/PKG-INFO +3 -4
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain.egg-info/requires.txt +2 -3
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_datachain.py +15 -1
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_datachain.py +97 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_signal_schema.py +43 -2
- {datachain-0.3.4 → datachain-0.3.6}/.cruft.json +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/.gitattributes +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/.github/codecov.yaml +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/.github/dependabot.yml +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/.github/workflows/release.yml +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/.gitignore +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/LICENSE +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/README.rst +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/docs/assets/datachain.png +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/docs/index.md +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/docs/references/datachain.md +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/docs/references/datatype.md +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/docs/references/file.md +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/docs/references/index.md +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/docs/references/sql.md +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/docs/references/torch.md +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/docs/references/udf.md +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/llm_and_nlp/llm-claude.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/mkdocs.yml +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/noxfile.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/setup.cfg +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/__main__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/asyn.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/cache.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/cli.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/local.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/config.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/dataset.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/error.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/job.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/file.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/listing.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/listing.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/node.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/progress.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/py.typed +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/builtins.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/params.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/schema.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/session.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/storage.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain/utils.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/conftest.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/data.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/examples/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_catalog.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_client.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_datasets.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_listing.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_ls.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_pull.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_query.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_client.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_session.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_udf.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.4 → datachain-0.3.6}/tests/utils.py +0 -0
{datachain-0.3.4 → datachain-0.3.6}/.github/workflows/tests.yml

@@ -62,9 +62,9 @@ jobs:
           pyv: '3.9'
         - os: macos-latest
           pyv: '3.12'
-        - os: windows-latest
+        - os: windows-latest
           pyv: '3.9'
-        - os: windows-latest
+        - os: windows-latest
           pyv: '3.12'
 
     steps:
@@ -116,9 +116,17 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest
+        os: [ubuntu-latest, macos-latest, windows-latest]
         pyv: ['3.9', '3.12']
         group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
+        exclude:
+          - {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
+          - {os: ubuntu-latest, pyv: '3.12', group: 'multimodal'}
+        include:
+          - {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
+          - {os: ubuntu-latest-4-cores, pyv: "3.12", group: multimodal}
+
+
     steps:
       - uses: actions/checkout@v4
 
{datachain-0.3.4/src/datachain.egg-info → datachain-0.3.6}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.4
+Version: 0.3.6
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -41,6 +41,7 @@ Requires-Dist: pydantic<3,>=2
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<11,>=10.0.0
+Requires-Dist: msgpack<2,>=1.0.4
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -54,7 +55,6 @@ Requires-Dist: torchvision; extra == "torch"
 Requires-Dist: transformers>=4.36.0; extra == "torch"
 Provides-Extra: remote
 Requires-Dist: lz4; extra == "remote"
-Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
 Requires-Dist: requests>=2.22.0; extra == "remote"
 Provides-Extra: vector
 Requires-Dist: usearch; extra == "vector"
@@ -87,9 +87,8 @@ Requires-Dist: numpy<2,>=1; extra == "examples"
 Requires-Dist: defusedxml; extra == "examples"
 Requires-Dist: accelerate; extra == "examples"
 Requires-Dist: unstructured[pdf]; extra == "examples"
-Requires-Dist: pdfplumber==0.11.
+Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
-Requires-Dist: nltk==3.8.1; extra == "examples"
 
 |PyPI| |Python Version| |Codecov| |Tests|
 
{datachain-0.3.4 → datachain-0.3.6}/pyproject.toml

@@ -43,7 +43,8 @@ dependencies = [
     "pydantic>=2,<3",
     "jmespath>=1.0",
     "datamodel-code-generator>=0.25",
-    "Pillow>=10.0.0,<11"
+    "Pillow>=10.0.0,<11",
+    "msgpack>=1.0.4,<2"
 ]
 
 [project.optional-dependencies]
@@ -62,7 +63,6 @@ torch = [
 ]
 remote = [
     "lz4",
-    "msgpack>=1.0.4,<2",
     "requests>=2.22.0"
 ]
 vector = [
@@ -99,9 +99,8 @@ examples = [
     "defusedxml",
     "accelerate",
     "unstructured[pdf]",
-    "pdfplumber==0.11.
-    "huggingface_hub[hf_transfer]"
-    "nltk==3.8.1"
+    "pdfplumber==0.11.4",
+    "huggingface_hub[hf_transfer]"
 ]
 
 [project.urls]
{datachain-0.3.4 → datachain-0.3.6}/src/datachain/catalog/catalog.py

@@ -120,13 +120,25 @@ def noop(_: str):
 
 @contextmanager
 def print_and_capture(
-    stream: "IO[str]", callback: Callable[[str], None] = noop
+    stream: "IO[bytes]|IO[str]", callback: Callable[[str], None] = noop
 ) -> "Iterator[list[str]]":
     lines: list[str] = []
     append = lines.append
 
     def loop() -> None:
-
+        buffer = b""
+        while byt := stream.read(1):  # Read one byte at a time
+            buffer += byt.encode("utf-8") if isinstance(byt, str) else byt
+
+            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
+                line = buffer.decode("utf-8")
+                print(line, end="")
+                callback(line)
+                append(line)
+                buffer = b""  # Clear buffer for next line
+
+        if buffer:  # Handle any remaining data in the buffer
+            line = buffer.decode("utf-8")
             print(line, end="")
             callback(line)
             append(line)
@@ -2128,7 +2140,7 @@ class Catalog:
                 stdout=subprocess.PIPE if capture_output else None,
                 stderr=subprocess.STDOUT if capture_output else None,
                 bufsize=1,
-                text=
+                text=False,
                 **kwargs,
             ) as proc:
                 os.close(w)
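The rewritten print_and_capture() switches the subprocess pipe to bytes (text=False in the Popen call above) and assembles lines manually, so carriage-return progress updates are flushed to the callback as well as newline-terminated output. A standalone sketch of the same byte-at-a-time technique, independent of datachain's internals (capture_lines and its arguments are illustrative names, not datachain's API):

import io

def capture_lines(stream: io.BufferedReader, callback) -> list[str]:
    # Read one byte at a time so "\r" progress updates surface as lines too.
    lines: list[str] = []
    buffer = b""
    while byt := stream.read(1):
        buffer += byt
        if byt in (b"\n", b"\r"):
            line = buffer.decode("utf-8")
            callback(line)
            lines.append(line)
            buffer = b""
    if buffer:  # flush whatever is left when the stream closes
        line = buffer.decode("utf-8")
        callback(line)
        lines.append(line)
    return lines

# Example (binary pipe, mirroring text=False above):
# import subprocess
# proc = subprocess.Popen(["echo", "hello"], stdout=subprocess.PIPE)
# captured = capture_lines(proc.stdout, print)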
{datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/sqlite.py

@@ -209,6 +209,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
             return cursor.executemany(self.compile(query).string, params)
         return self.db.executemany(self.compile(query).string, params)
 
+    @retry_sqlite_locks
    def execute_str(self, sql: str, parameters=None) -> sqlite3.Cursor:
         if parameters is None:
             return self.db.execute(sql)
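execute_str() is now wrapped with the existing retry_sqlite_locks decorator, so ad-hoc SQL statements also retry when SQLite reports a locked database. That decorator's implementation is not part of this diff; the following is only a hypothetical sketch of the general retry-on-lock pattern, not datachain's actual code:

import functools
import sqlite3
import time

def retry_on_locked_db(retries: int = 10, delay: float = 0.05):
    """Hypothetical retry decorator illustrating the pattern retry_sqlite_locks serves."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(retries):
                try:
                    return func(*args, **kwargs)
                except sqlite3.OperationalError as exc:
                    # Only retry lock contention; re-raise anything else or the last attempt.
                    if "database is locked" not in str(exc) or attempt == retries - 1:
                        raise
                    time.sleep(delay * (attempt + 1))  # simple linear backoff
        return wrapper
    return decorator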
{datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/flatten.py

@@ -1,34 +1,6 @@
-from datetime import datetime
-
 from pydantic import BaseModel
 
 from datachain.lib.model_store import ModelStore
-from datachain.sql.types import (
-    JSON,
-    Array,
-    Binary,
-    Boolean,
-    DateTime,
-    Float,
-    Int,
-    Int32,
-    Int64,
-    NullType,
-    String,
-)
-
-DATACHAIN_TO_TYPE = {
-    Int: int,
-    Int32: int,
-    Int64: int,
-    String: str,
-    Float: float,
-    Boolean: bool,
-    DateTime: datetime,
-    Binary: bytes,
-    Array(NullType): list,
-    JSON: dict,
-}
 
 
 def flatten(obj: BaseModel):
{datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/dc.py

@@ -839,6 +839,10 @@ class DataChain(DatasetQuery):
     def mutate(self, **kwargs) -> "Self":
         """Create new signals based on existing signals.
 
+        This method cannot modify existing columns. If you need to modify an
+        existing column, use a different name for the new column and then use
+        `select()` to choose which columns to keep.
+
         This method is vectorized and more efficient compared to map(), and it does not
         extract or download any data from the internal database. However, it can only
         utilize predefined built-in functions and their combinations.
@@ -859,7 +863,26 @@ class DataChain(DatasetQuery):
             dist=cosine_distance(embedding_text, embedding_image)
         )
         ```
+
+        This method can be also used to rename signals. If the Column("name") provided
+        as value for the new signal - the old column will be dropped. Otherwise a new
+        column is created.
+
+        Example:
+        ```py
+        dc.mutate(
+            newkey=Column("oldkey")
+        )
+        ```
         """
+        existing_columns = set(self.signals_schema.values.keys())
+        for col_name in kwargs:
+            if col_name in existing_columns:
+                raise DataChainColumnError(
+                    col_name,
+                    "Cannot modify existing column with mutate(). "
+                    "Use a different name for the new column.",
+                )
         for col_name, expr in kwargs.items():
             if not isinstance(expr, Column) and isinstance(expr.type, NullType):
                 raise DataChainColumnError(
@@ -1224,14 +1247,11 @@ class DataChain(DatasetQuery):
         """
         headers, max_length = self._effective_signals_schema.get_headers_with_length()
         if flatten or max_length < 2:
-            columns = []
-
-
-            return pd.DataFrame.from_records(self.to_records(), columns=columns)
+            columns = [".".join(filter(None, header)) for header in headers]
+        else:
+            columns = pd.MultiIndex.from_tuples(map(tuple, headers))
 
-        return pd.DataFrame(
-            self.results(), columns=pd.MultiIndex.from_tuples(map(tuple, headers))
-        )
+        return pd.DataFrame.from_records(self.results(), columns=columns)
 
     def show(
         self,
@@ -1524,6 +1544,7 @@ class DataChain(DatasetQuery):
         to_insert: Optional[Union[dict, list[dict]]],
         session: Optional[Session] = None,
         in_memory: bool = False,
+        schema: Optional[dict[str, DataType]] = None,
     ) -> "DataChain":
         """Create a DataChain from the provided records. This method can be used for
         programmatically generating a chain in contrast of reading data from storages
@@ -1532,10 +1553,10 @@ class DataChain(DatasetQuery):
         Parameters:
             to_insert : records (or a single record) to insert. Each record is
                 a dictionary of signals and theirs values.
+            schema : describes chain signals and their corresponding types
 
         Example:
         ```py
-        empty = DataChain.from_records()
         single_record = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD)
         ```
         """
@@ -1543,11 +1564,27 @@ class DataChain(DatasetQuery):
         catalog = session.catalog
 
         name = session.generate_temp_dataset_name()
-
-
-
+        signal_schema = None
+        columns: list[sqlalchemy.Column] = []
+
+        if schema:
+            signal_schema = SignalSchema(schema)
+            columns = signal_schema.db_signals(as_columns=True)  # type: ignore[assignment]
+        else:
+            columns = [
+                sqlalchemy.Column(name, typ)
+                for name, typ in File._datachain_column_types.items()
+            ]
+
+        dsr = catalog.create_dataset(
+            name,
+            columns=columns,
+            feature_schema=(
+                signal_schema.clone_without_sys_signals().serialize()
+                if signal_schema
+                else None
+            ),
         )
-        dsr = catalog.create_dataset(name, columns=columns)
 
         if isinstance(to_insert, dict):
             to_insert = [to_insert]
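Taken together, the dc.py changes tighten mutate() and extend from_records(). A hedged usage sketch of the new behavior (chain contents, column names, and the plain-Python schema values are illustrative; only the API shapes come from this diff, and Column may equally be the plain SQLAlchemy Column, as in the test further below):

from datachain.lib.dc import DataChain, DataChainColumnError
from datachain.query.schema import Column

dc = DataChain.from_values(ids=[1, 2, 3])

# 0.3.6 refuses to overwrite an existing signal in mutate() ...
try:
    dc = dc.mutate(ids=Column("ids") + 1)
except DataChainColumnError:
    pass  # "Cannot modify existing column with mutate(). Use a different name ..."

# ... while renaming via Column("oldkey") still works, per the new docstring.
renamed = dc.mutate(new_ids=Column("ids"))

# from_records() now accepts an explicit schema instead of always assuming
# the default File columns.
records = [{"name": "a", "size": 1}, {"name": "b", "size": 2}]
chain = DataChain.from_records(records, schema={"name": str, "size": int})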
{datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/signal_schema.py

@@ -2,6 +2,7 @@ import copy
 from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
 from datetime import datetime
+from inspect import isclass
 from typing import (
     TYPE_CHECKING,
     Annotated,
@@ -14,10 +15,10 @@ from typing import (
     get_origin,
 )
 
+import sqlalchemy as sa
 from pydantic import BaseModel, create_model
 from typing_extensions import Literal as LiteralEx
 
-from datachain.lib.convert.flatten import DATACHAIN_TO_TYPE
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.sql_to_python import sql_to_python
 from datachain.lib.convert.unflatten import unflatten_to_json_pos
@@ -26,6 +27,7 @@ from datachain.lib.file import File
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import DataChainParamsError
 from datachain.query.schema import DEFAULT_DELIMITER, Column
+from datachain.sql.types import SQLType
 
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -104,12 +106,15 @@ class SignalSchema:
     def from_column_types(col_types: dict[str, Any]) -> "SignalSchema":
         signals: dict[str, DataType] = {}
         for field, col_type in col_types.items():
-            if (
+            if isinstance(col_type, SQLType):
+                signals[field] = col_type.python_type
+            elif isclass(col_type) and issubclass(col_type, SQLType):
+                signals[field] = col_type().python_type
+            else:
                 raise SignalSchemaError(
                     f"signal schema cannot be obtained for column '{field}':"
-                    f" unsupported type '{
+                    f" unsupported type '{col_type}'"
                 )
-            signals[field] = py_type
         return SignalSchema(signals)
 
     def serialize(self) -> dict[str, str]:
@@ -232,7 +237,7 @@ class SignalSchema:
         signals = [
             DEFAULT_DELIMITER.join(path)
             if not as_columns
-            else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
+            else sa.Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
             for path, _type, has_subtree, _ in self.get_flat_tree()
             if not has_subtree
         ]
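SignalSchema.from_column_types() now derives Python types from SQLType's python_type property (accepting either an instance or a subclass) instead of the DATACHAIN_TO_TYPE table removed from flatten.py above. A brief hedged sketch of the resulting behavior:

from datachain.lib.signal_schema import SignalSchema
from datachain.sql.types import Int, String

# Both an SQLType instance and an SQLType subclass are accepted now;
# anything else still raises SignalSchemaError.
schema = SignalSchema.from_column_types({"name": String(), "size": Int})
# schema.values should map to the Python types, i.e. {"name": str, "size": int}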
{datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/dataset.py

@@ -878,17 +878,14 @@ class SQLUnion(Step):
         temp_tables.extend(self.query1.temp_table_names)
         q2 = self.query2.apply_steps().select().subquery()
         temp_tables.extend(self.query2.temp_table_names)
-
+
+        columns1, columns2 = _order_columns(q1.columns, q2.columns)
 
         def q(*columns):
             names = {c.name for c in columns}
             col1 = [c for c in columns1 if c.name in names]
             col2 = [c for c in columns2 if c.name in names]
-            res = (
-                sqlalchemy.select(*col1)
-                .select_from(q1)
-                .union_all(sqlalchemy.select(*col2).select_from(q2))
-            )
+            res = sqlalchemy.select(*col1).union_all(sqlalchemy.select(*col2))
 
             subquery = res.subquery()
             return sqlalchemy.select(*subquery.c).select_from(subquery)
@@ -1021,23 +1018,46 @@ class GroupBy(Step):
         return step_result(q, grouped_query.selected_columns)
 
 
-def
-
+def _validate_columns(
+    left_columns: Iterable[ColumnElement], right_columns: Iterable[ColumnElement]
+) -> set[str]:
+    left_names = {c.name for c in left_columns}
+    right_names = {c.name for c in right_columns}
+
+    if left_names == right_names:
+        return left_names
+
+    missing_right = left_names - right_names
+    missing_left = right_names - left_names
+
+    def _prepare_msg_part(missing_columns: set[str], side: str) -> str:
+        return f"{', '.join(sorted(missing_columns))} only present in {side}"
+
+    msg_parts = [
+        _prepare_msg_part(missing_columns, found_side)
+        for missing_columns, found_side in zip(
+            [
+                missing_right,
+                missing_left,
+            ],
+            ["left", "right"],
+        )
+        if missing_columns
+    ]
+    msg = f"Cannot perform union. {'. '.join(msg_parts)}"
+
+    raise ValueError(msg)
+
+
+def _order_columns(
+    left_columns: Iterable[ColumnElement], right_columns: Iterable[ColumnElement]
 ) -> list[list[ColumnElement]]:
-
-
-
-
-
-
-    for col_dict, out in zip(column_dicts, result):
-        if n in col_dict:
-            out.append(col_dict[n])
-        else:
-            # Cast the NULL to ensure all columns are aware of their type
-            # Label it to ensure it's aware of its name
-            out.append(sqlalchemy.cast(sqlalchemy.null(), col.type).label(n))
-    return result
+    column_order = _validate_columns(left_columns, right_columns)
+    column_dicts = [
+        {c.name: c for c in columns} for columns in [left_columns, right_columns]
+    ]
+
+    return [[d[n] for n in column_order] for d in column_dicts]
 
 
 @attrs.define
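The union step no longer pads missing columns with typed NULL casts; _validate_columns now requires identical column sets on both sides and raises a ValueError naming the columns present on only one side. A hedged sketch against the (private) helpers added above, using plain SQLAlchemy column clauses; the exact ordering of the returned columns is not guaranteed, only that both sides come back in the same order:

import sqlalchemy as sa

from datachain.query.dataset import _order_columns

left = [sa.column("id"), sa.column("name")]
right = [sa.column("name"), sa.column("id")]

# Same names on both sides: both column lists are returned in one consistent order.
columns1, columns2 = _order_columns(left, right)

# Mismatched sets now fail fast instead of being NULL-padded as in 0.3.4:
try:
    _order_columns(left, [sa.column("id"), sa.column("size")])
except ValueError as exc:
    print(exc)
    # Cannot perform union. name only present in left. size only present in right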
{datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/types.py

@@ -20,6 +20,8 @@ from typing import Any, Union
 import sqlalchemy as sa
 from sqlalchemy import TypeDecorator, types
 
+from datachain.lib.data_model import StandardType
+
 _registry: dict[str, "TypeConverter"] = {}
 registry = MappingProxyType(_registry)
 
@@ -91,6 +93,10 @@ class SQLType(TypeDecorator):
     impl: type[types.TypeEngine[Any]] = types.TypeEngine
     cache_ok = True
 
+    @property
+    def python_type(self) -> StandardType:
+        raise NotImplementedError
+
     def to_dict(self) -> dict[str, Any]:
         return {"type": self.__class__.__name__}
 
@@ -103,7 +109,7 @@ class String(SQLType):
     impl = types.String
 
     @property
-    def python_type(self):
+    def python_type(self) -> StandardType:
         return str
 
     def load_dialect_impl(self, dialect):
@@ -125,7 +131,7 @@ class Boolean(SQLType):
     impl = types.Boolean
 
     @property
-    def python_type(self):
+    def python_type(self) -> StandardType:
         return bool
 
     def load_dialect_impl(self, dialect):
@@ -147,7 +153,7 @@ class Int(SQLType):
     impl = types.INTEGER
 
     @property
-    def python_type(self):
+    def python_type(self) -> StandardType:
         return int
 
     def load_dialect_impl(self, dialect):
@@ -217,7 +223,7 @@ class Float(SQLType):
     impl = types.FLOAT
 
     @property
-    def python_type(self):
+    def python_type(self) -> StandardType:
         return float
 
     def load_dialect_impl(self, dialect):
@@ -271,7 +277,7 @@ class Array(SQLType):
     impl = types.ARRAY
 
     @property
-    def python_type(self):
+    def python_type(self) -> StandardType:
         return list
 
     def load_dialect_impl(self, dialect):
@@ -314,7 +320,7 @@ class JSON(SQLType):
     impl = types.JSON
 
     @property
-    def python_type(self):
+    def python_type(self) -> StandardType:
         return dict
 
     def load_dialect_impl(self, dialect):
@@ -336,7 +342,7 @@ class DateTime(SQLType):
     impl = types.DATETIME
 
     @property
-    def python_type(self):
+    def python_type(self) -> StandardType:
         return datetime
 
     def load_dialect_impl(self, dialect):
@@ -358,7 +364,7 @@ class Binary(SQLType):
     impl = types.BINARY
 
     @property
-    def python_type(self):
+    def python_type(self) -> StandardType:
         return bytes
 
     def load_dialect_impl(self, dialect):
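With python_type promoted to the SQLType base class (annotated with StandardType and raising NotImplementedError when not overridden), any column type can report its Python equivalent directly; this is what replaces the DATACHAIN_TO_TYPE lookup removed from lib/convert/flatten.py. A small illustration, assuming the concrete types construct without arguments as their SQLAlchemy counterparts do:

from datachain.sql.types import JSON, DateTime, Int, String

print(String().python_type)    # <class 'str'>
print(Int().python_type)       # <class 'int'>
print(JSON().python_type)      # <class 'dict'>
print(DateTime().python_type)  # <class 'datetime.datetime'>

# The base class now declares the property and raises NotImplementedError,
# so a new SQLType subclass that forgets to override python_type fails loudly.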
{datachain-0.3.4 → datachain-0.3.6/src/datachain.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.4
+Version: 0.3.6
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -41,6 +41,7 @@ Requires-Dist: pydantic<3,>=2
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<11,>=10.0.0
+Requires-Dist: msgpack<2,>=1.0.4
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -54,7 +55,6 @@ Requires-Dist: torchvision; extra == "torch"
 Requires-Dist: transformers>=4.36.0; extra == "torch"
 Provides-Extra: remote
 Requires-Dist: lz4; extra == "remote"
-Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
 Requires-Dist: requests>=2.22.0; extra == "remote"
 Provides-Extra: vector
 Requires-Dist: usearch; extra == "vector"
@@ -87,9 +87,8 @@ Requires-Dist: numpy<2,>=1; extra == "examples"
 Requires-Dist: defusedxml; extra == "examples"
 Requires-Dist: accelerate; extra == "examples"
 Requires-Dist: unstructured[pdf]; extra == "examples"
-Requires-Dist: pdfplumber==0.11.
+Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
-Requires-Dist: nltk==3.8.1; extra == "examples"
 
 |PyPI| |Python Version| |Codecov| |Tests|
 
{datachain-0.3.4 → datachain-0.3.6}/src/datachain.egg-info/requires.txt

@@ -22,6 +22,7 @@ pydantic<3,>=2
 jmespath>=1.0
 datamodel-code-generator>=0.25
 Pillow<11,>=10.0.0
+msgpack<2,>=1.0.4
 
 [:sys_platform == "win32"]
 numpy<2,>=1
@@ -48,13 +49,11 @@ numpy<2,>=1
 defusedxml
 accelerate
 unstructured[pdf]
-pdfplumber==0.11.
+pdfplumber==0.11.4
 huggingface_hub[hf_transfer]
-nltk==3.8.1
 
 [remote]
 lz4
-msgpack<2,>=1.0.4
 requests>=2.22.0
 
 [tests]
{datachain-0.3.4 → datachain-0.3.6}/tests/func/test_datachain.py

@@ -8,10 +8,11 @@ import pandas as pd
 import pytest
 import pytz
 from PIL import Image
+from sqlalchemy import Column
 
 from datachain.data_storage.sqlite import SQLiteWarehouse
 from datachain.dataset import DatasetStats
-from datachain.lib.dc import DataChain
+from datachain.lib.dc import DataChain, DataChainColumnError
 from datachain.lib.file import File, ImageFile
 from tests.utils import images_equal
 
@@ -314,3 +315,16 @@ def test_from_storage_check_rows(tmp_dir, test_session):
         location=None,
         vtype="",
     )
+
+
+def test_mutate_existing_column(catalog):
+    ds = DataChain.from_values(ids=[1, 2, 3])
+
+    with pytest.raises(DataChainColumnError) as excinfo:
+        ds.mutate(ids=Column("ids") + 1)
+
+    assert (
+        str(excinfo.value)
+        == "Error for column ids: Cannot modify existing column with mutate()."
+        " Use a different name for the new column."
+    )