datachain 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.2.0 → datachain-0.2.1}/.gitignore +0 -3
- {datachain-0.2.0/src/datachain.egg-info → datachain-0.2.1}/PKG-INFO +2 -2
- {datachain-0.2.0 → datachain-0.2.1}/pyproject.toml +1 -2
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/cli.py +8 -1
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/data_storage/schema.py +11 -5
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/data_storage/sqlite.py +3 -0
- datachain-0.2.1/src/datachain/lib/cached_stream.py +38 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/dc.py +6 -2
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/feature.py +5 -1
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/feature_registry.py +3 -2
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/file.py +10 -24
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/udf.py +7 -26
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/query/dataset.py +3 -9
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/sqlite/base.py +34 -2
- datachain-0.2.1/src/datachain/sql/sqlite/vector.py +23 -0
- {datachain-0.2.0 → datachain-0.2.1/src/datachain.egg-info}/PKG-INFO +2 -2
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain.egg-info/SOURCES.txt +0 -2
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain.egg-info/requires.txt +1 -1
- datachain-0.2.1/tests/func/test_datachain.py +40 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/func/test_dataset_query.py +8 -7
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_arrow.py +3 -4
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_file.py +11 -25
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_image.py +2 -4
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_reader.py +2 -4
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_text.py +6 -10
- datachain-0.2.1/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/utils.py +0 -14
- datachain-0.2.0/src/datachain/__init__.py +0 -4
- datachain-0.2.0/src/datachain/_version.py +0 -16
- datachain-0.2.0/src/datachain/lib/cached_stream.py +0 -120
- datachain-0.2.0/src/datachain/sql/sqlite/vector.py +0 -15
- datachain-0.2.0/tests/func/test_datachain.py +0 -13
- datachain-0.2.0/tests/unit/lib/test_cached_stream.py +0 -82
- {datachain-0.2.0 → datachain-0.2.1}/.cruft.json +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.gitattributes +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.github/codecov.yaml +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.github/dependabot.yml +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.github/workflows/docs.yml +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.github/workflows/release.yml +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.github/workflows/tests.yml +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.pre-commit-config.yaml +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/.reuse/dep5 +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/CONTRIBUTING.rst +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/LICENSE +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/LICENSES/Apache-2.0.txt +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/LICENSES/BSD-3-Clause.txt +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/LICENSES/Python-2.0.txt +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/README.rst +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/docs/assets/datachain.png +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/docs/index.md +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/docs/references/catalog.md +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/docs/references/datachain.md +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/docs/tutorials/cv_intro.md +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/docs/tutorials/udfs.md +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/blip2_image_desc_lib.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/clip.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/common_sql_functions.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/README.md +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/hf_pipeline.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/json-csv-reader.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/llava2_image_desc_lib.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/llm-claude-aggregate-query.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/llm-claude-simple-query.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/llm-claude.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/loader.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/neurips/README +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/neurips/distance_to_query.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/neurips/llm_chat.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/neurips/requirements.txt +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/neurips/single_query.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/neurips/text_loaders.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/notebooks/clip_fine_tuning.ipynb +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/openai_image_desc_lib.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/openimage-detect.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/pose_detection.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/torch-loader.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/udfs/batching.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/udfs/image_transformation.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/udfs/parallel.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/udfs/simple.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/udfs/stateful.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/udfs/stateful_similarity.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/unstructured-text.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/wds.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/wds_filtered.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/zalando/zalando_clip.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/zalando/zalando_dir_as_class.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/zalando/zalando_splits_and_classes_ds.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/examples/zalando/zalando_splits_and_classes_output.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/mkdocs.yml +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/noxfile.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/setup.cfg +0 -0
- {datachain-0.2.0/src/datachain/lib → datachain-0.2.1/src/datachain}/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/__main__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/asyn.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/cache.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/cli_utils.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/client/local.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/config.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/dataset.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/error.py +0 -0
- {datachain-0.2.0/src/datachain/remote → datachain-0.2.1/src/datachain/lib}/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/claude.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/feature_utils.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/gpt4_vision.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/hf_image_to_text.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/hf_pipeline.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/image_transform.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/iptc_exif_xmp.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/reader.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/unstructured.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/utils.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/listing.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/node.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/progress.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/py.typed +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/query/batch.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/query/builtins.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/query/params.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/query/schema.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/query/session.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/query/udf.py +0 -0
- {datachain-0.2.0/tests/benchmarks → datachain-0.2.1/src/datachain/remote}/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/storage.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain/utils.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/__init__.py +0 -0
- {datachain-0.2.0/tests/func → datachain-0.2.1/tests/benchmarks}/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/conftest.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/data.py +0 -0
- {datachain-0.2.0/tests/unit → datachain-0.2.1/tests/func}/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/func/test_client.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/func/test_datasets.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/func/test_ls.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/func/test_pull.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/func/test_query.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/scripts/name_len_normal.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.2.0/tests/unit/lib → datachain-0.2.1/tests/unit}/__init__.py +0 -0
- {datachain-0.2.0/tests/unit/sql → datachain-0.2.1/tests/unit/lib}/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.2.0/tests/unit/sql/sqlite → datachain-0.2.1/tests/unit/sql}/__init__.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_client.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_session.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_storage.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_udf.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.2.0 → datachain-0.2.1}/tests/unit/test_warehouse.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -56,7 +56,7 @@ Requires-Dist: lz4; extra == "remote"
|
|
|
56
56
|
Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
|
|
57
57
|
Requires-Dist: requests>=2.22.0; extra == "remote"
|
|
58
58
|
Provides-Extra: vector
|
|
59
|
-
Requires-Dist:
|
|
59
|
+
Requires-Dist: usearch; extra == "vector"
|
|
60
60
|
Provides-Extra: tests
|
|
61
61
|
Requires-Dist: datachain[cv,pandas,remote,vector]; extra == "tests"
|
|
62
62
|
Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
@@ -66,7 +66,7 @@ remote = [
|
|
|
66
66
|
"requests>=2.22.0"
|
|
67
67
|
]
|
|
68
68
|
vector = [
|
|
69
|
-
"
|
|
69
|
+
"usearch"
|
|
70
70
|
]
|
|
71
71
|
tests = [
|
|
72
72
|
"datachain[cv,pandas,remote,vector]",
|
|
@@ -107,7 +107,6 @@ where = ["src"]
|
|
|
107
107
|
namespaces = false
|
|
108
108
|
|
|
109
109
|
[tool.setuptools_scm]
|
|
110
|
-
write_to = "src/datachain/_version.py"
|
|
111
110
|
|
|
112
111
|
[tool.pytest.ini_options]
|
|
113
112
|
addopts = "-rfEs -m 'not benchmark'"
|
|
@@ -5,13 +5,14 @@ import sys
|
|
|
5
5
|
import traceback
|
|
6
6
|
from argparse import SUPPRESS, Action, ArgumentParser, ArgumentTypeError, Namespace
|
|
7
7
|
from collections.abc import Iterable, Iterator, Mapping, Sequence
|
|
8
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
8
9
|
from itertools import chain
|
|
9
10
|
from multiprocessing import freeze_support
|
|
10
11
|
from typing import TYPE_CHECKING, Optional, Union
|
|
11
12
|
|
|
12
13
|
import shtab
|
|
13
14
|
|
|
14
|
-
from datachain import
|
|
15
|
+
from datachain import utils
|
|
15
16
|
from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
|
|
16
17
|
from datachain.utils import DataChainDir
|
|
17
18
|
|
|
@@ -96,6 +97,12 @@ def add_show_args(parser: ArgumentParser) -> None:
|
|
|
96
97
|
|
|
97
98
|
|
|
98
99
|
def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
100
|
+
try:
|
|
101
|
+
__version__ = version("datachain")
|
|
102
|
+
except PackageNotFoundError:
|
|
103
|
+
# package is not installed
|
|
104
|
+
__version__ = "unknown"
|
|
105
|
+
|
|
99
106
|
parser = ArgumentParser(
|
|
100
107
|
description="DataChain: Wrangle unstructured AI data at scale", prog="datachain"
|
|
101
108
|
)
|
|
@@ -31,7 +31,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
|
|
|
31
31
|
"""
|
|
32
32
|
c_set: dict[str, sa.Column] = {}
|
|
33
33
|
for c in columns:
|
|
34
|
-
if ec := c_set.get(c.name, None):
|
|
34
|
+
if (ec := c_set.get(c.name, None)) is not None:
|
|
35
35
|
if str(ec.type) != str(c.type):
|
|
36
36
|
raise ValueError(
|
|
37
37
|
f"conflicting types for column {c.name}:{c.type!s} and {ec.type!s}"
|
|
@@ -171,8 +171,8 @@ class DataTable:
|
|
|
171
171
|
):
|
|
172
172
|
# copy columns, since re-using the same objects from another table
|
|
173
173
|
# may raise an error
|
|
174
|
-
columns = [cls.copy_column(c) for c in columns
|
|
175
|
-
columns =
|
|
174
|
+
columns = cls.sys_columns() + [cls.copy_column(c) for c in columns]
|
|
175
|
+
columns = dedup_columns(columns)
|
|
176
176
|
|
|
177
177
|
if metadata is None:
|
|
178
178
|
metadata = sa.MetaData()
|
|
@@ -230,11 +230,17 @@ class DataTable:
|
|
|
230
230
|
def delete(self):
|
|
231
231
|
return self.apply_conditions(self.table.delete())
|
|
232
232
|
|
|
233
|
+
@staticmethod
|
|
234
|
+
def sys_columns():
|
|
235
|
+
return [
|
|
236
|
+
sa.Column("id", Int, primary_key=True),
|
|
237
|
+
sa.Column("random", Int64, nullable=False, default=f.random()),
|
|
238
|
+
]
|
|
239
|
+
|
|
233
240
|
@classmethod
|
|
234
241
|
def file_columns(cls) -> list[sa.Column]:
|
|
235
242
|
return [
|
|
236
|
-
|
|
237
|
-
sa.Column("random", Int64, nullable=False),
|
|
243
|
+
*cls.sys_columns(),
|
|
238
244
|
sa.Column("vtype", String, nullable=False, index=True),
|
|
239
245
|
sa.Column("dir_type", Int, index=True),
|
|
240
246
|
sa.Column("parent", String, index=True),
|
|
@@ -33,6 +33,7 @@ from datachain.data_storage.schema import (
|
|
|
33
33
|
from datachain.dataset import DatasetRecord
|
|
34
34
|
from datachain.error import DataChainError
|
|
35
35
|
from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
|
|
36
|
+
from datachain.sql.sqlite.base import load_usearch_extension
|
|
36
37
|
from datachain.sql.types import SQLType
|
|
37
38
|
from datachain.storage import StorageURI
|
|
38
39
|
from datachain.utils import DataChainDir
|
|
@@ -114,6 +115,8 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
114
115
|
if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
|
|
115
116
|
db.set_trace_callback(print)
|
|
116
117
|
|
|
118
|
+
load_usearch_extension(db)
|
|
119
|
+
|
|
117
120
|
return cls(engine, MetaData(), db, db_file)
|
|
118
121
|
except RuntimeError:
|
|
119
122
|
raise DataChainError("Can't connect to SQLite DB") from None
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from contextlib import AbstractContextManager
|
|
3
|
+
|
|
4
|
+
from datachain.cache import UniqueId
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class AbstractCachedStream(AbstractContextManager, ABC):
|
|
8
|
+
def __init__(self, catalog, uid: UniqueId):
|
|
9
|
+
self.catalog = catalog
|
|
10
|
+
self.uid = uid
|
|
11
|
+
self.mode = "rb"
|
|
12
|
+
|
|
13
|
+
def set_mode(self, mode):
|
|
14
|
+
self.mode = mode
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PreCachedStream(AbstractCachedStream):
|
|
18
|
+
def __init__(self, catalog, uid: UniqueId):
|
|
19
|
+
super().__init__(catalog, uid)
|
|
20
|
+
self.client = self.catalog.get_client(self.uid.storage)
|
|
21
|
+
self.cached_file = None
|
|
22
|
+
|
|
23
|
+
def get_path_in_cache(self):
|
|
24
|
+
return self.catalog.cache.path_from_checksum(self.uid.get_hash())
|
|
25
|
+
|
|
26
|
+
def __enter__(self):
|
|
27
|
+
self.client.download(self.uid)
|
|
28
|
+
self.cached_file = open(self.get_path_in_cache(), self.mode)
|
|
29
|
+
return self.cached_file
|
|
30
|
+
|
|
31
|
+
def __exit__(self, *args):
|
|
32
|
+
self.cached_file.close()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class PreDownloadStream(PreCachedStream):
|
|
36
|
+
def __exit__(self, *args):
|
|
37
|
+
super().__exit__(*args)
|
|
38
|
+
self.catalog.cache.remove(self.uid)
|
|
@@ -39,6 +39,8 @@ if TYPE_CHECKING:
|
|
|
39
39
|
import pandas as pd
|
|
40
40
|
from typing_extensions import Self
|
|
41
41
|
|
|
42
|
+
from datachain.catalog import Catalog
|
|
43
|
+
|
|
42
44
|
C = Column
|
|
43
45
|
|
|
44
46
|
|
|
@@ -200,10 +202,12 @@ class DataChain(DatasetQuery):
|
|
|
200
202
|
def from_storage(
|
|
201
203
|
cls,
|
|
202
204
|
path,
|
|
205
|
+
*,
|
|
203
206
|
type: Literal["binary", "text", "image"] = "binary",
|
|
207
|
+
catalog: Optional["Catalog"] = None,
|
|
204
208
|
recursive: Optional[bool] = True,
|
|
205
209
|
anon: bool = False,
|
|
206
|
-
) -> "
|
|
210
|
+
) -> "Self":
|
|
207
211
|
"""Get data from a storage as a list of file with all file attributes. It
|
|
208
212
|
returns the chain itself as usual.
|
|
209
213
|
|
|
@@ -220,7 +224,7 @@ class DataChain(DatasetQuery):
|
|
|
220
224
|
```
|
|
221
225
|
"""
|
|
222
226
|
func = get_file(type)
|
|
223
|
-
return
|
|
227
|
+
return cls(path, catalog=catalog, recursive=recursive, anon=anon).map(file=func)
|
|
224
228
|
|
|
225
229
|
@classmethod
|
|
226
230
|
def from_dataset(cls, name: str, version: Optional[int] = None) -> "DataChain":
|
|
@@ -7,6 +7,7 @@ from datetime import datetime
|
|
|
7
7
|
from functools import lru_cache
|
|
8
8
|
from types import GenericAlias
|
|
9
9
|
from typing import (
|
|
10
|
+
TYPE_CHECKING,
|
|
10
11
|
Any,
|
|
11
12
|
ClassVar,
|
|
12
13
|
Literal,
|
|
@@ -39,6 +40,9 @@ from datachain.sql.types import (
|
|
|
39
40
|
String,
|
|
40
41
|
)
|
|
41
42
|
|
|
43
|
+
if TYPE_CHECKING:
|
|
44
|
+
from datachain.catalog import Catalog
|
|
45
|
+
|
|
42
46
|
FeatureStandardType = Union[
|
|
43
47
|
type[int],
|
|
44
48
|
type[str],
|
|
@@ -158,7 +162,7 @@ class Feature(BaseModel):
|
|
|
158
162
|
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
|
|
159
163
|
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
|
|
160
164
|
|
|
161
|
-
def _set_stream(self, catalog
|
|
165
|
+
def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
|
|
162
166
|
pass
|
|
163
167
|
|
|
164
168
|
@classmethod
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from typing import Any, ClassVar, Optional
|
|
2
3
|
|
|
3
|
-
|
|
4
|
+
logger = logging.getLogger(__name__)
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
class Registry:
|
|
@@ -16,7 +17,7 @@ class Registry:
|
|
|
16
17
|
version = fr._version # type: ignore[attr-defined]
|
|
17
18
|
if version in cls.reg[name]:
|
|
18
19
|
full_name = f"{name}@{version}"
|
|
19
|
-
logger.warning(
|
|
20
|
+
logger.warning("Feature %s is already registered", full_name)
|
|
20
21
|
cls.reg[name][version] = fr
|
|
21
22
|
|
|
22
23
|
@classmethod
|
|
@@ -2,11 +2,10 @@ import json
|
|
|
2
2
|
from abc import ABC, abstractmethod
|
|
3
3
|
from datetime import datetime
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Any, ClassVar, Literal, Optional, Union
|
|
5
|
+
from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
|
|
6
6
|
from urllib.parse import unquote, urlparse
|
|
7
7
|
from urllib.request import url2pathname
|
|
8
8
|
|
|
9
|
-
from fsspec import Callback
|
|
10
9
|
from fsspec.implementations.local import LocalFileSystem
|
|
11
10
|
from pydantic import Field, field_validator
|
|
12
11
|
|
|
@@ -18,6 +17,9 @@ from datachain.lib.utils import DataChainError
|
|
|
18
17
|
from datachain.sql.types import JSON, Int, String
|
|
19
18
|
from datachain.utils import TIME_ZERO
|
|
20
19
|
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from datachain.catalog import Catalog
|
|
22
|
+
|
|
21
23
|
|
|
22
24
|
class FileFeature(Feature):
|
|
23
25
|
_is_file = True
|
|
@@ -182,26 +184,17 @@ class File(FileFeature):
|
|
|
182
184
|
|
|
183
185
|
def open(self):
|
|
184
186
|
if self._stream is None:
|
|
185
|
-
|
|
186
|
-
raise FileError(self, "stream is not set")
|
|
187
|
-
self._stream = self._open_stream()
|
|
187
|
+
raise FileError(self, "stream is not set")
|
|
188
188
|
|
|
189
189
|
if self.location:
|
|
190
190
|
return VFileRegistry.resolve(self, self.location)
|
|
191
191
|
|
|
192
192
|
return self._stream
|
|
193
193
|
|
|
194
|
-
def _set_stream(
|
|
195
|
-
self
|
|
196
|
-
) -> None:
|
|
197
|
-
if self._catalog is None and catalog is None:
|
|
198
|
-
raise DataChainError(f"Cannot set file '{stream}' without catalog")
|
|
199
|
-
|
|
200
|
-
if catalog:
|
|
201
|
-
self._catalog = catalog
|
|
202
|
-
|
|
194
|
+
def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
|
|
195
|
+
self._catalog = catalog
|
|
203
196
|
stream_class = PreCachedStream if caching_enabled else PreDownloadStream
|
|
204
|
-
self._stream = stream_class(
|
|
197
|
+
self._stream = stream_class(self._catalog, self.get_uid())
|
|
205
198
|
self._caching_enabled = caching_enabled
|
|
206
199
|
|
|
207
200
|
def get_uid(self) -> UniqueId:
|
|
@@ -232,11 +225,6 @@ class File(FileFeature):
|
|
|
232
225
|
def get_uri(self):
|
|
233
226
|
return f"{self.source}/{self.get_full_name()}"
|
|
234
227
|
|
|
235
|
-
def _open_stream(self, cache: bool = False, cb: Optional[Callback] = None):
|
|
236
|
-
client = self._catalog.get_client(self.source)
|
|
237
|
-
uid = self.get_uid()
|
|
238
|
-
return client.open_object(uid, use_cache=cache, cb=cb)
|
|
239
|
-
|
|
240
228
|
def get_path(self) -> str:
|
|
241
229
|
path = unquote(self.get_uri())
|
|
242
230
|
fs = self.get_fs()
|
|
@@ -258,10 +246,8 @@ class TextFile(File):
|
|
|
258
246
|
super().__init__(**kwargs)
|
|
259
247
|
self._stream = None
|
|
260
248
|
|
|
261
|
-
def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
    """Attach the stream via the parent class, then switch it to mode ``"r"``.

    Args:
        catalog: forwarded unchanged to the parent ``_set_stream``.
        caching_enabled: forwarded unchanged to the parent ``_set_stream``.
    """
    super()._set_stream(catalog, caching_enabled)
    self._stream.set_mode("r")
|
|
266
252
|
|
|
267
253
|
|
|
@@ -6,10 +6,10 @@ from typing import TYPE_CHECKING, Callable, Optional
|
|
|
6
6
|
from datachain.lib.feature import Feature
|
|
7
7
|
from datachain.lib.signal_schema import SignalSchema
|
|
8
8
|
from datachain.lib.utils import DataChainError, DataChainParamsError
|
|
9
|
-
from datachain.query import
|
|
9
|
+
from datachain.query import udf
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
12
|
-
from
|
|
12
|
+
from datachain.query.udf import UDFWrapper
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
class UdfError(DataChainParamsError):
|
|
@@ -34,11 +34,6 @@ class UDFBase:
|
|
|
34
34
|
|
|
35
35
|
params_spec = params.to_udf_spec()
|
|
36
36
|
self.params_spec = list(params_spec.keys())
|
|
37
|
-
self._contains_stream = False
|
|
38
|
-
if params.contains_file():
|
|
39
|
-
self.params_spec.insert(0, Stream()) # type: ignore[arg-type]
|
|
40
|
-
self._contains_stream = True
|
|
41
|
-
|
|
42
37
|
self.output_spec = output.to_udf_spec()
|
|
43
38
|
|
|
44
39
|
self._catalog = None
|
|
@@ -122,18 +117,10 @@ class UDFBase:
|
|
|
122
117
|
rows = [rows]
|
|
123
118
|
objs = []
|
|
124
119
|
for row in rows:
|
|
125
|
-
if self._contains_stream:
|
|
126
|
-
stream, *row = row
|
|
127
|
-
else:
|
|
128
|
-
stream = None
|
|
129
|
-
|
|
130
120
|
obj_row = self.params.row_to_objs(row)
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
if isinstance(obj, Feature):
|
|
135
|
-
obj._set_stream(self._catalog, stream, True)
|
|
136
|
-
|
|
121
|
+
for obj in obj_row:
|
|
122
|
+
if isinstance(obj, Feature):
|
|
123
|
+
obj._set_stream(self._catalog, caching_enabled=True)
|
|
137
124
|
objs.append(obj_row)
|
|
138
125
|
return objs
|
|
139
126
|
|
|
@@ -150,13 +137,7 @@ class UDFBase:
|
|
|
150
137
|
output_map[name] = []
|
|
151
138
|
|
|
152
139
|
for flat_obj in group:
|
|
153
|
-
|
|
154
|
-
position = 1
|
|
155
|
-
stream = flat_obj[0]
|
|
156
|
-
else:
|
|
157
|
-
position = 0
|
|
158
|
-
stream = None
|
|
159
|
-
|
|
140
|
+
position = 0
|
|
160
141
|
for signal, (cls, length) in spec_map.items():
|
|
161
142
|
slice = flat_obj[position : position + length]
|
|
162
143
|
position += length
|
|
@@ -167,7 +148,7 @@ class UDFBase:
|
|
|
167
148
|
obj = slice[0]
|
|
168
149
|
|
|
169
150
|
if isinstance(obj, Feature):
|
|
170
|
-
obj._set_stream(self._catalog
|
|
151
|
+
obj._set_stream(self._catalog)
|
|
171
152
|
output_map[signal].append(obj)
|
|
172
153
|
|
|
173
154
|
return list(output_map.values())
|
|
@@ -1737,22 +1737,16 @@ class DatasetQuery:
|
|
|
1737
1737
|
|
|
1738
1738
|
# Exclude the id column and let the db create it to avoid unique
|
|
1739
1739
|
# constraint violations.
|
|
1740
|
-
cols = [col.name for col in dr.get_table().c if col.name != "id"]
|
|
1741
|
-
assert cols
|
|
1742
1740
|
q = query.exclude(("id",))
|
|
1743
|
-
|
|
1744
1741
|
if q._order_by_clauses:
|
|
1745
1742
|
# ensuring we have id sorted by order by clause if it exists in a query
|
|
1746
1743
|
q = q.add_columns(
|
|
1747
1744
|
f.row_number().over(order_by=q._order_by_clauses).label("id")
|
|
1748
1745
|
)
|
|
1749
|
-
cols.append("id")
|
|
1750
|
-
|
|
1751
|
-
self.catalog.warehouse.db.execute(
|
|
1752
|
-
sqlalchemy.insert(dr.get_table()).from_select(cols, q),
|
|
1753
|
-
**kwargs,
|
|
1754
|
-
)
|
|
1755
1746
|
|
|
1747
|
+
cols = tuple(c.name for c in q.columns)
|
|
1748
|
+
insert_q = sqlalchemy.insert(dr.get_table()).from_select(cols, q)
|
|
1749
|
+
self.catalog.warehouse.db.execute(insert_q, **kwargs)
|
|
1756
1750
|
self.catalog.metastore.update_dataset_status(
|
|
1757
1751
|
dataset, DatasetStatus.COMPLETE, version=version
|
|
1758
1752
|
)
|
|
@@ -71,8 +71,6 @@ def setup():
|
|
|
71
71
|
compiles(sql_path.name, "sqlite")(compile_path_name)
|
|
72
72
|
compiles(sql_path.file_stem, "sqlite")(compile_path_file_stem)
|
|
73
73
|
compiles(sql_path.file_ext, "sqlite")(compile_path_file_ext)
|
|
74
|
-
compiles(array.cosine_distance, "sqlite")(compile_cosine_distance)
|
|
75
|
-
compiles(array.euclidean_distance, "sqlite")(compile_euclidean_distance)
|
|
76
74
|
compiles(array.length, "sqlite")(compile_array_length)
|
|
77
75
|
compiles(string.length, "sqlite")(compile_string_length)
|
|
78
76
|
compiles(string.split, "sqlite")(compile_string_split)
|
|
@@ -81,6 +79,13 @@ def setup():
|
|
|
81
79
|
compiles(Values, "sqlite")(compile_values)
|
|
82
80
|
compiles(random.rand, "sqlite")(compile_rand)
|
|
83
81
|
|
|
82
|
+
if load_usearch_extension(sqlite3.connect(":memory:")):
|
|
83
|
+
compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext)
|
|
84
|
+
compiles(array.euclidean_distance, "sqlite")(compile_euclidean_distance_ext)
|
|
85
|
+
else:
|
|
86
|
+
compiles(array.cosine_distance, "sqlite")(compile_cosine_distance)
|
|
87
|
+
compiles(array.euclidean_distance, "sqlite")(compile_euclidean_distance)
|
|
88
|
+
|
|
84
89
|
register_user_defined_sql_functions()
|
|
85
90
|
setup_is_complete = True
|
|
86
91
|
|
|
@@ -246,11 +251,23 @@ def compile_path_file_ext(element, compiler, **kwargs):
|
|
|
246
251
|
return compiler.process(path_file_ext(*element.clauses.clauses), **kwargs)
|
|
247
252
|
|
|
248
253
|
|
|
254
|
+
def compile_cosine_distance_ext(element, compiler, **kwargs):
    """Render a cosine-distance expression as a ``distance_cosine_f32(...)`` call.

    Runs the ``"cosine_distance"`` compiler hook first, then wraps the
    compiled clauses of ``element`` in the SQL function name.
    """
    run_compiler_hook("cosine_distance")
    return f"distance_cosine_f32({compiler.process(element.clauses, **kwargs)})"
|
|
257
|
+
|
|
258
|
+
|
|
249
259
|
def compile_cosine_distance(element, compiler, **kwargs):
    """Render a cosine-distance expression as a ``cosine_distance(...)`` call.

    Runs the ``"cosine_distance"`` compiler hook first, then wraps the
    compiled clauses of ``element`` in the SQL function name.
    """
    run_compiler_hook("cosine_distance")
    return f"cosine_distance({compiler.process(element.clauses, **kwargs)})"
|
|
252
262
|
|
|
253
263
|
|
|
264
|
+
def compile_euclidean_distance_ext(element, compiler, **kwargs):
    """Render a Euclidean-distance expression via ``distance_sqeuclidean_f32``.

    Runs the ``"euclidean_distance"`` compiler hook first. The SQL function
    name indicates a squared distance, so the call is wrapped in ``sqrt``.
    """
    run_compiler_hook("euclidean_distance")
    return (
        f"sqrt(distance_sqeuclidean_f32({compiler.process(element.clauses, **kwargs)}))"
    )
|
|
269
|
+
|
|
270
|
+
|
|
254
271
|
def compile_euclidean_distance(element, compiler, **kwargs):
    """Render a Euclidean-distance expression as a ``euclidean_distance(...)`` call.

    Runs the ``"euclidean_distance"`` compiler hook first, then wraps the
    compiled clauses of ``element`` in the SQL function name.
    """
    run_compiler_hook("euclidean_distance")
    return f"euclidean_distance({compiler.process(element.clauses, **kwargs)})"
|
|
@@ -330,3 +347,18 @@ def compile_values(element, compiler, **kwargs):
|
|
|
330
347
|
|
|
331
348
|
def compile_rand(element, compiler, **kwargs):
    """Compile a rand expression by delegating to ``func.random()``.

    ``element`` itself is not inspected; the compiler processes
    ``func.random()`` with the given keyword arguments.
    """
    return compiler.process(func.random(), **kwargs)
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def load_usearch_extension(conn) -> bool:
    """Try to load the usearch SQLite extension into ``conn``.

    Args:
        conn: a connection object exposing ``enable_load_extension`` and
            ``load_extension``.

    Returns:
        True when the extension was loaded, False on any failure (usearch
        not installed, extension loading unavailable, load error).
    """
    try:
        # usearch is part of the vector optional dependencies
        # we use the extension's cosine and euclidean distance functions
        from usearch import sqlite_path

        conn.enable_load_extension(True)
        try:
            conn.load_extension(sqlite_path())
        finally:
            # Always switch extension loading back off, even when the load
            # itself fails, so the connection is not left able to load
            # arbitrary extensions.
            conn.enable_load_extension(False)
    except Exception:  # noqa: BLE001
        # Best-effort: callers fall back to other implementations on False.
        return False
    return True
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import math
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def euclidean_distance(a: str, b: str):
    """Return the Euclidean (L2) distance between two serialized vectors.

    Each argument is a string whose first and last characters are dropped
    before the rest is parsed as comma-separated numbers (presumably the
    input looks like ``"[1.0,2.0]"`` — confirm against the caller).
    """
    a_np = np.fromstring(a[1:-1], sep=",")
    b_np = np.fromstring(b[1:-1], sep=",")

    return np.linalg.norm(b_np - a_np)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def cosine_distance(a: str, b: str):
    """Return the cosine distance between two serialized vectors.

    Each argument is a string whose first and last characters are dropped
    before the rest is parsed as comma-separated numbers (presumably the
    input looks like ``"[1.0,2.0]"`` — confirm against the caller).

    The result is ``1 - cos(u, v)``, clamped to the range 0 to 2.
    """
    u = np.fromstring(a[1:-1], sep=",")
    v = np.fromstring(b[1:-1], sep=",")

    uv = np.inner(u, v)
    uu = np.inner(u, u)
    vv = np.inner(v, v)

    # NOTE(review): a zero-norm vector makes the denominator 0 — confirm the
    # intended result for that input.
    dist = 1.0 - uv / math.sqrt(uu * vv)

    # Clamp so rounding error cannot push the value outside [0, 2].
    return max(0, min(dist, 2.0))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -56,7 +56,7 @@ Requires-Dist: lz4; extra == "remote"
|
|
|
56
56
|
Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
|
|
57
57
|
Requires-Dist: requests>=2.22.0; extra == "remote"
|
|
58
58
|
Provides-Extra: vector
|
|
59
|
-
Requires-Dist:
|
|
59
|
+
Requires-Dist: usearch; extra == "vector"
|
|
60
60
|
Provides-Extra: tests
|
|
61
61
|
Requires-Dist: datachain[cv,pandas,remote,vector]; extra == "tests"
|
|
62
62
|
Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
@@ -84,7 +84,6 @@ examples/zalando/zalando_splits_and_classes_ds.py
|
|
|
84
84
|
examples/zalando/zalando_splits_and_classes_output.py
|
|
85
85
|
src/datachain/__init__.py
|
|
86
86
|
src/datachain/__main__.py
|
|
87
|
-
src/datachain/_version.py
|
|
88
87
|
src/datachain/asyn.py
|
|
89
88
|
src/datachain/cache.py
|
|
90
89
|
src/datachain/cli.py
|
|
@@ -233,7 +232,6 @@ tests/unit/test_utils.py
|
|
|
233
232
|
tests/unit/test_warehouse.py
|
|
234
233
|
tests/unit/lib/__init__.py
|
|
235
234
|
tests/unit/lib/test_arrow.py
|
|
236
|
-
tests/unit/lib/test_cached_stream.py
|
|
237
235
|
tests/unit/lib/test_datachain.py
|
|
238
236
|
tests/unit/lib/test_datachain_merge.py
|
|
239
237
|
tests/unit/lib/test_feature.py
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from datachain.lib.dc import DataChain
|
|
4
|
+
from datachain.lib.file import File
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@pytest.mark.parametrize("anon", [True, False])
def test_catalog_anon(catalog, anon):
    """Check the ``anon`` flag given to ``from_storage`` lands in the catalog client config."""
    chain = (
        DataChain.from_storage("gs://dvcx-datalakes/dogs-and-cats/", anon=anon)
        .limit(5)
        .save("test_catalog_anon")
    )
    # A missing "anon" key is treated as False.
    assert chain.catalog.client_config.get("anon", False) is anon
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_from_storage(cloud_test_catalog):
    """Check a chain built from the fixture's source URI counts 7 rows."""
    ctc = cloud_test_catalog
    dc = DataChain.from_storage(ctc.src_uri, catalog=ctc.catalog)
    assert dc.count() == 7
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_map_file(cloud_test_catalog):
    """Check a mapped function can open each ``File`` and read its contents."""
    ctc = cloud_test_catalog

    def new_signal(file: File) -> str:
        # Combine the file name with its decoded content into one signal.
        with file.open() as f:
            return file.name + " -> " + f.read().decode("utf-8")

    dc = DataChain.from_storage(ctc.src_uri, catalog=ctc.catalog).map(signal=new_signal)
    expected = {
        "description -> Cats and Dogs",
        "cat1 -> meow",
        "cat2 -> mrow",
        "dog1 -> woof",
        "dog2 -> arf",
        "dog3 -> bark",
        "dog4 -> ruff",
    }
    # Compared as sets, so row order does not matter.
    assert set(dc.collect_one("signal")) == expected
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import io
|
|
2
2
|
import json
|
|
3
|
+
import math
|
|
3
4
|
import os
|
|
4
5
|
import pickle
|
|
5
6
|
import random
|
|
@@ -56,7 +57,6 @@ from tests.utils import (
|
|
|
56
57
|
SIMPLE_DS_QUERY_RECORDS,
|
|
57
58
|
TARRED_TREE,
|
|
58
59
|
WEBFORMAT_TREE,
|
|
59
|
-
adjusted_float_diff,
|
|
60
60
|
assert_row_names,
|
|
61
61
|
create_tar_dataset,
|
|
62
62
|
dataset_dependency_asdict,
|
|
@@ -1140,6 +1140,8 @@ def test_udf_distributed_cancel(cloud_test_catalog_tmpfile, capfd, datachain_job
|
|
|
1140
1140
|
workers=2,
|
|
1141
1141
|
team_id=metastore.team_id,
|
|
1142
1142
|
created_at=datetime.now(timezone.utc),
|
|
1143
|
+
params="{}",
|
|
1144
|
+
metrics="{}",
|
|
1143
1145
|
),
|
|
1144
1146
|
)
|
|
1145
1147
|
|
|
@@ -2851,13 +2853,12 @@ def test_similarity_search(cloud_test_catalog):
|
|
|
2851
2853
|
("dogs", "dog3", 0.7695916496857775, 1.8344983482620636),
|
|
2852
2854
|
("dogs/others", "dog4", 0.9789704524691446, 2.0531542018152322),
|
|
2853
2855
|
]
|
|
2854
|
-
expected_diffs = [(p, n, 0.0, 0.0) for p, n, _, _ in expected]
|
|
2855
2856
|
|
|
2856
|
-
|
|
2857
|
-
(p2
|
|
2858
|
-
|
|
2859
|
-
|
|
2860
|
-
|
|
2857
|
+
for (p1, n1, c1, e1), (p2, n2, c2, e2) in zip(result, expected):
|
|
2858
|
+
assert p1.endswith(p2)
|
|
2859
|
+
assert n1 == n2
|
|
2860
|
+
assert math.isclose(c1, c2, abs_tol=1e-5)
|
|
2861
|
+
assert math.isclose(e1, e2, abs_tol=1e-5)
|
|
2861
2862
|
|
|
2862
2863
|
|
|
2863
2864
|
@pytest.mark.parametrize(
|
|
@@ -22,11 +22,10 @@ def test_arrow_generator(tmp_path, catalog):
|
|
|
22
22
|
pq_path = tmp_path / name
|
|
23
23
|
df.to_parquet(pq_path)
|
|
24
24
|
stream = File(name=name, parent=tmp_path.as_posix(), source="file:///")
|
|
25
|
-
|
|
26
|
-
stream._set_stream(catalog, fd, caching_enabled=False)
|
|
25
|
+
stream._set_stream(catalog, caching_enabled=False)
|
|
27
26
|
|
|
28
|
-
|
|
29
|
-
|
|
27
|
+
func = ArrowGenerator()
|
|
28
|
+
objs = list(func(stream))
|
|
30
29
|
|
|
31
30
|
assert len(objs) == len(ids)
|
|
32
31
|
for index, (o, id, text) in enumerate(zip(objs, ids, texts)):
|