datachain 0.2.16__tar.gz → 0.2.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of datachain might be problematic.
- {datachain-0.2.16/src/datachain.egg-info → datachain-0.2.17}/PKG-INFO +71 -12
- {datachain-0.2.16 → datachain-0.2.17}/README.rst +70 -11
- {datachain-0.2.16 → datachain-0.2.17}/examples/get_started/json-csv-reader.py +9 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/catalog/catalog.py +47 -44
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/db_engine.py +6 -2
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/id_generator.py +14 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/metastore.py +13 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/sqlite.py +45 -6
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/warehouse.py +13 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/arrow.py +22 -7
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/dc.py +29 -6
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/file.py +3 -3
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/signal_schema.py +33 -5
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/listing.py +22 -10
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/query/dataset.py +17 -20
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/query/session.py +19 -4
- {datachain-0.2.16 → datachain-0.2.17/src/datachain.egg-info}/PKG-INFO +71 -12
- {datachain-0.2.16 → datachain-0.2.17}/tests/conftest.py +50 -23
- {datachain-0.2.16 → datachain-0.2.17}/tests/func/test_datachain.py +25 -15
- {datachain-0.2.16 → datachain-0.2.17}/tests/func/test_dataset_query.py +43 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_arrow.py +0 -17
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_datachain.py +308 -157
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_datachain_merge.py +24 -20
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_feature_utils.py +4 -4
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_signal_schema.py +29 -2
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_catalog_loader.py +24 -30
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_data_storage.py +17 -17
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_database_engine.py +9 -11
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_id_generator.py +6 -8
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_metastore.py +7 -9
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_warehouse.py +7 -9
- {datachain-0.2.16 → datachain-0.2.17}/.cruft.json +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.gitattributes +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.github/codecov.yaml +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.github/dependabot.yml +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.github/workflows/release.yml +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.github/workflows/tests.yml +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.gitignore +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/.pre-commit-config.yaml +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/CONTRIBUTING.rst +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/LICENSE +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/docs/assets/datachain.png +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/docs/assets/flowchart.png +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/docs/index.md +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/docs/references/datachain.md +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/docs/references/datatype.md +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/docs/references/file.md +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/docs/references/index.md +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/docs/references/sql.md +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/docs/references/torch.md +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/docs/references/udf.md +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/README.md +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/get_started/json-metadata-tutorial.ipynb +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/llm/llm_chatbot_evaluation.ipynb +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/llm_and_nlp/llm-claude.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/multimodal/wds.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/mkdocs.yml +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/noxfile.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/pyproject.toml +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/setup.cfg +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/__main__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/asyn.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/cache.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/cli.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/cli_utils.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/client/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/client/azure.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/client/gcs.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/client/local.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/client/s3.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/config.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/dataset.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/error.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/job.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/clip.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/image.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/settings.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/text.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/udf.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/utils.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/node.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/progress.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/py.typed +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/query/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/query/batch.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/query/builtins.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/query/metrics.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/query/params.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/query/schema.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/query/udf.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/remote/studio.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/types.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/sql/utils.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/storage.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain/utils.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/data.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/examples/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/examples/wds_data.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/func/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/func/test_catalog.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/func/test_client.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/func/test_datasets.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/func/test_ls.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/func/test_pull.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/func/test_pytorch.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/func/test_query.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/scripts/feature_class.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/scripts/name_len_normal.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/test_cli_e2e.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/test_query_e2e.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_asyn.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_cache.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_catalog.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_client.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_dataset.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_listing.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_query_params.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_serializer.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_session.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_storage.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_udf.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/unit/test_utils.py +0 -0
- {datachain-0.2.16 → datachain-0.2.17}/tests/utils.py +0 -0
{datachain-0.2.16/src/datachain.egg-info → datachain-0.2.17}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.16
+Version: 0.2.17
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -100,28 +100,87 @@ Requires-Dist: types-requests; extra == "dev"
 AI 🔗 DataChain
 ----------------

-DataChain is
-data
+DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
+AI engineers build a metadata layer on top of unstructured files and analyze data using
+this layer.

-
+📂 **Raw Files Processing**
+Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
+Local), version and update datasets.

-
+🌟 **Metadata layer.**
+Build a metadata layer on top of files using structured sources like CSV, Parquet,
+and JSON files.

-
+⭐ **Metadata enrichment.**
+Enhance the metadata layer with outputs from local ML model inferences and LLM calls.

+🛠️ **Data Transformation.**
+Transform metadata using traditional methods like filtering, grouping, joining, and
+others.

-
-
-
-The typical use cases include Computer Vision data curation, LLM analytics,
-and validation of multimodal AI applications.
+🐍 **User-friendly interface.**
+Operate efficiently with familiar Python objects and object fields, eliminating the
+need for SQL.


 .. code:: console

     $ pip install datachain

-
+
+Data Structures
+===============
+
+DataChain introduces expressive data structures tailored for AI-specific workload:
+
+- **Dataset:** Preserves the file-references and meta-information. Takes care of Python
+  object serialization, dataset versioning and difference. Operations on dataset:
+
+  - **Transformations:** traditional data-frame or SQL operations such as filtering,
+    grouping, joining.
+  - **Enrichments:** mapping, aggregating and generating using customer’s Python
+    code. This is needed to work with ML inference and LLM calls.
+
+- **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
+  mode - only when needed.
+
+DataChain name comes from these major data structures: dataset and chaining.
+
+
+What’s new in DataChain?
+========================
+
+The project combines multiple ideas from different areas in order to simplify AI
+use-cases and at the same time to fit it into traditional data infrastructure.
+
+- **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
+  native language for AI. It’s powered by `Pydantic`_ data models.
+- **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
+  group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
+  needed for distributed computations.
+- **Resuming data processing** (in development). Introduces idempotent operations,
+  allowing data processing to resume from the last successful process file/record/batch
+  if it fails due to issues like failed LLM calls, ML inference or file download.
+
+Additional relatively new ideas:
+
+- **Functional style data processing.** Using a functional/chaining approach to data
+  processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
+- **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
+  and implements data versioning, extending ideas from DVC (developed by the same team).
+
+
+What DataChain is NOT?
+======================
+
+- **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
+  `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
+  version.
+- **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
+  it delegates heavy data transformations to underlying data warehouses and focuses on
+  AI specific data enrichments and orchestrating all the pieces together.
+

 Quick Start
 -----------
{datachain-0.2.16 → datachain-0.2.17}/README.rst

@@ -16,28 +16,87 @@
 AI 🔗 DataChain
 ----------------

-DataChain is
-data
+DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
+AI engineers build a metadata layer on top of unstructured files and analyze data using
+this layer.

-
+📂 **Raw Files Processing**
+Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
+Local), version and update datasets.

-
+🌟 **Metadata layer.**
+Build a metadata layer on top of files using structured sources like CSV, Parquet,
+and JSON files.

-
+⭐ **Metadata enrichment.**
+Enhance the metadata layer with outputs from local ML model inferences and LLM calls.

+🛠️ **Data Transformation.**
+Transform metadata using traditional methods like filtering, grouping, joining, and
+others.

-
-
-
-The typical use cases include Computer Vision data curation, LLM analytics,
-and validation of multimodal AI applications.
+🐍 **User-friendly interface.**
+Operate efficiently with familiar Python objects and object fields, eliminating the
+need for SQL.


 .. code:: console

     $ pip install datachain

-
+
+Data Structures
+===============
+
+DataChain introduces expressive data structures tailored for AI-specific workload:
+
+- **Dataset:** Preserves the file-references and meta-information. Takes care of Python
+  object serialization, dataset versioning and difference. Operations on dataset:
+
+  - **Transformations:** traditional data-frame or SQL operations such as filtering,
+    grouping, joining.
+  - **Enrichments:** mapping, aggregating and generating using customer’s Python
+    code. This is needed to work with ML inference and LLM calls.
+
+- **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
+  mode - only when needed.
+
+DataChain name comes from these major data structures: dataset and chaining.
+
+
+What’s new in DataChain?
+========================
+
+The project combines multiple ideas from different areas in order to simplify AI
+use-cases and at the same time to fit it into traditional data infrastructure.
+
+- **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
+  native language for AI. It’s powered by `Pydantic`_ data models.
+- **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
+  group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
+  needed for distributed computations.
+- **Resuming data processing** (in development). Introduces idempotent operations,
+  allowing data processing to resume from the last successful process file/record/batch
+  if it fails due to issues like failed LLM calls, ML inference or file download.
+
+Additional relatively new ideas:
+
+- **Functional style data processing.** Using a functional/chaining approach to data
+  processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
+- **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
+  and implements data versioning, extending ideas from DVC (developed by the same team).
+
+
+What DataChain is NOT?
+======================
+
+- **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
+  `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
+  version.
+- **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
+  it delegates heavy data transformations to underlying data warehouses and focuses on
+  AI specific data enrichments and orchestrating all the pieces together.
+

 Quick Start
 -----------
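
The rewritten README above describes datasets, lazy chains, and transformations in prose only. As a concrete illustration, a minimal sketch of that chaining style, using only calls that appear elsewhere in this diff (DataChain.from_csv with object_name/nrows, print_schema, show; the import path follows this package's example scripts):

from datachain.lib.dc import DataChain

# Chains are lazy: nothing is parsed until a terminal call such as show().
laion = DataChain.from_csv(
    "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv",
    object_name="laion",  # group the CSV columns under this object name
    nrows=3,              # cap parsing at the first 3 rows (reworked in this release)
)
laion.print_schema()  # inspect the inferred metadata layer
laion.show()          # executing the chain happens here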
{datachain-0.2.16 → datachain-0.2.17}/examples/get_started/json-csv-reader.py

@@ -89,6 +89,15 @@ def main():
     static_csv_ds.print_schema()
     static_csv_ds.show()

+    uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
+    print()
+    print("========================================================================")
+    print("dynamic CSV with header schema test parsing 3/3M objects")
+    print("========================================================================")
+    dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3)
+    dynamic_csv_ds.print_schema()
+    dynamic_csv_ds.show()
+

 if __name__ == "__main__":
     main()
{datachain-0.2.16 → datachain-0.2.17}/src/datachain/catalog/catalog.py

@@ -236,36 +236,36 @@ class DatasetRowsFetcher(NodesThreadPool):
         import lz4.frame
         import pandas as pd

-        metastore
-
-
-
-        urls = list(urls)
-        while urls:
-            for url in urls:
-                if self.should_check_for_status():
-                    self.check_for_status()
-
-                r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
-                if r.status_code == 404:
-                    time.sleep(PULL_DATASET_SLEEP_INTERVAL)
-                    # moving to the next url
-                    continue
+        # metastore and warehouse are not thread safe
+        with self.metastore.clone() as metastore, self.warehouse.clone() as warehouse:
+            dataset = metastore.get_dataset(self.dataset_name)

-
+            urls = list(urls)
+            while urls:
+                for url in urls:
+                    if self.should_check_for_status():
+                        self.check_for_status()

-
+                    r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
+                    if r.status_code == 404:
+                        time.sleep(PULL_DATASET_SLEEP_INTERVAL)
+                        # moving to the next url
+                        continue

-
+                    r.raise_for_status()

-
-                df = df.drop("sys__id", axis=1)
+                    df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))

-
-
-
-
-
+                    self.fix_columns(df)
+
+                    # id will be autogenerated in DB
+                    df = df.drop("sys__id", axis=1)
+
+                    inserted = warehouse.insert_dataset_rows(
+                        df, dataset, self.dataset_version
+                    )
+                    self.increase_counter(inserted)  # type: ignore [arg-type]
+                    urls.remove(url)


 @dataclass
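
The hunk above is the core resource-management change of this release: each worker thread now clones the metastore and warehouse inside a with block, so every clone's connections are closed deterministically when the task finishes. A self-contained sketch of the same pattern with plain sqlite3 (illustrative names, not DataChain's API):

import sqlite3
import threading

class Store:
    """Per-thread clones, because SQLite connections are not thread safe."""

    def __init__(self, db_file: str):
        self.db_file = db_file
        self.conn = sqlite3.connect(db_file)

    def clone(self) -> "Store":
        return Store(self.db_file)  # fresh connection for the calling thread

    def __enter__(self) -> "Store":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.conn.close()  # closes only this clone's connection

def worker(store: Store) -> None:
    # Mirrors DatasetRowsFetcher.do_task: clone inside the thread, close on exit.
    with store.clone() as local:
        local.conn.execute("SELECT 1").fetchone()

threads = [threading.Thread(target=worker, args=(Store(":memory:"),)) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()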
@@ -720,7 +720,6 @@ class Catalog:
             client.uri, posixpath.join(prefix, "")
         )
         source_metastore = self.metastore.clone(client.uri)
-        source_warehouse = self.warehouse.clone()

         columns = [
             Column("vtype", String),
@@ -1835,25 +1834,29 @@ class Catalog:
         if signed_urls:
             shuffle(signed_urls)

-
-                self.metastore.clone(),
-                self.warehouse.clone(),
-
-
-
-
-
-
-
-
-                    signed_urls,
-                    math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
-                ),
-                dataset_save_progress_bar,
+            with (
+                self.metastore.clone() as metastore,
+                self.warehouse.clone() as warehouse,
+            ):
+                rows_fetcher = DatasetRowsFetcher(
+                    metastore,
+                    warehouse,
+                    remote_config,
+                    dataset.name,
+                    version,
+                    schema,
                 )
-
-
-
+                try:
+                    rows_fetcher.run(
+                        batched(
+                            signed_urls,
+                            math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
+                        ),
+                        dataset_save_progress_bar,
+                    )
+                except:
+                    self.remove_dataset(dataset.name, version)
+                    raise

         dataset = self.metastore.update_dataset_status(
             dataset,
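
The @@ -1835 hunk pairs the new with block with a bare except that removes the partially pulled dataset version before re-raising, so a failed pull cannot leave a half-populated version behind. The shape of that pattern, sketched with placeholder callables (not the catalog's actual signatures):

def pull_version(create, fetch, remove):
    """create/fetch/remove stand in for the catalog calls in the hunk above."""
    version = create()
    try:
        fetch(version)  # may raise on network errors, bad parquet, etc.
    except BaseException:
        remove(version)  # undo the partial state first...
        raise            # ...then let the original error propagate
    return version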
{datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/db_engine.py

@@ -4,7 +4,6 @@ from collections.abc import Iterator
 from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union

 import sqlalchemy as sa
-from attrs import frozen
 from sqlalchemy.sql import FROM_LINTING
 from sqlalchemy.sql.roles import DDLRole

@@ -23,13 +22,18 @@ logger = logging.getLogger("datachain")
 SELECT_BATCH_SIZE = 100_000  # number of rows to fetch at a time


-@frozen
 class DatabaseEngine(ABC, Serializable):
     dialect: ClassVar["Dialect"]

     engine: "Engine"
     metadata: "MetaData"

+    def __enter__(self) -> "DatabaseEngine":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
     @abstractmethod
     def clone(self) -> "DatabaseEngine":
         """Clones DatabaseEngine implementation."""
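
Dropping @frozen is what enables the rest of this file's changes: the engine now carries mutable connection state (see the is_closed flag added in sqlite.py below), which an attrs-frozen class would reject on assignment, and __enter__/__exit__ give every engine clone deterministic cleanup. A generic, runnable sketch of the protocol, independent of DataChain:

class ManagedResource:
    """Any close()-able object gains `with` support from this pair of methods."""

    def __init__(self) -> None:
        self.closed = False

    def close(self) -> None:
        self.closed = True

    def __enter__(self) -> "ManagedResource":
        return self  # the name bound by `with ... as r` is the object itself

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()  # runs on normal exit and when the body raises

with ManagedResource() as r:
    assert not r.closed
assert r.closed  # cleanup ran when the block ended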
{datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/id_generator.py

@@ -33,6 +33,16 @@ class AbstractIDGenerator(ABC, Serializable):
     def cleanup_for_tests(self):
         """Cleanup for tests."""

+    def close(self) -> None:
+        """Closes any active database connections."""
+
+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some ID Generator implementations may handle this
+        differently.
+        """
+        self.close()
+
     @abstractmethod
     def init_id(self, uri: str) -> None:
         """Initializes the ID generator for the given URI with zero last_id."""
@@ -83,6 +93,10 @@ class AbstractDBIDGenerator(AbstractIDGenerator):
     def clone(self) -> "AbstractDBIDGenerator":
         """Clones AbstractIDGenerator implementation."""

+    def close(self) -> None:
+        """Closes any active database connections."""
+        self.db.close()
+
     @property
     def db(self) -> "DatabaseEngine":
         return self._db
{datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/metastore.py

@@ -78,6 +78,13 @@ class AbstractMetastore(ABC, Serializable):
         self.uri = uri
         self.partial_id: Optional[int] = partial_id

+    def __enter__(self) -> "AbstractMetastore":
+        """Returns self upon entering context manager."""
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Default behavior is to do nothing, as connections may be shared."""
+
     @abstractmethod
     def clone(
         self,
@@ -97,6 +104,12 @@ class AbstractMetastore(ABC, Serializable):
     def close(self) -> None:
         """Closes any active database or HTTP connections."""

+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some Metastore implementations may handle this
+        differently."""
+        self.close()
+
     def cleanup_tables(self, temp_table_names: list[str]) -> None:
         """Cleanup temp tables."""

{datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/sqlite.py

@@ -15,7 +15,6 @@ from typing import (
 )

 import sqlalchemy
-from attrs import frozen
 from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
@@ -40,6 +39,7 @@ from datachain.utils import DataChainDir

 if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
+    from sqlalchemy.engine.base import Engine
     from sqlalchemy.schema import SchemaItem
     from sqlalchemy.sql.elements import ColumnClause, ColumnElement, TextClause
     from sqlalchemy.sql.selectable import Select
@@ -52,6 +52,8 @@ RETRY_START_SEC = 0.01
 RETRY_MAX_TIMES = 10
 RETRY_FACTOR = 2

+DETECT_TYPES = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
+
 Column = Union[str, "ColumnClause[Any]", "TextClause"]

 datachain.sql.sqlite.setup()
@@ -80,26 +82,41 @@ def retry_sqlite_locks(func):
     return wrapper


-@frozen
 class SQLiteDatabaseEngine(DatabaseEngine):
     dialect = sqlite_dialect

     db: sqlite3.Connection
     db_file: Optional[str]
+    is_closed: bool
+
+    def __init__(
+        self,
+        engine: "Engine",
+        metadata: "MetaData",
+        db: sqlite3.Connection,
+        db_file: Optional[str] = None,
+    ):
+        self.engine = engine
+        self.metadata = metadata
+        self.db = db
+        self.db_file = db_file
+        self.is_closed = False

     @classmethod
     def from_db_file(cls, db_file: Optional[str] = None) -> "SQLiteDatabaseEngine":
-
+        return cls(*cls._connect(db_file=db_file))

+    @staticmethod
+    def _connect(db_file: Optional[str] = None):
         try:
             if db_file == ":memory:":
                 # Enable multithreaded usage of the same in-memory db
                 db = sqlite3.connect(
-                    "file::memory:?cache=shared", uri=True, detect_types=
+                    "file::memory:?cache=shared", uri=True, detect_types=DETECT_TYPES
                 )
             else:
                 db = sqlite3.connect(
-                    db_file or DataChainDir.find().db, detect_types=
+                    db_file or DataChainDir.find().db, detect_types=DETECT_TYPES
                 )
             create_user_defined_sql_functions(db)
             engine = sqlalchemy.create_engine(
@@ -118,7 +135,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):

             load_usearch_extension(db)

-            return
+            return engine, MetaData(), db, db_file
         except RuntimeError:
             raise DataChainError("Can't connect to SQLite DB") from None

@@ -138,6 +155,16 @@ class SQLiteDatabaseEngine(DatabaseEngine):
             {},
         )

+    def _reconnect(self) -> None:
+        if not self.is_closed:
+            raise RuntimeError("Cannot reconnect on still-open DB!")
+        engine, metadata, db, db_file = self._connect(db_file=self.db_file)
+        self.engine = engine
+        self.metadata = metadata
+        self.db = db
+        self.db_file = db_file
+        self.is_closed = False
+
     @retry_sqlite_locks
     def execute(
         self,
@@ -145,6 +172,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         cursor: Optional[sqlite3.Cursor] = None,
         conn=None,
     ) -> sqlite3.Cursor:
+        if self.is_closed:
+            # Reconnect in case of being closed previously.
+            self._reconnect()
         if cursor is not None:
             result = cursor.execute(*self.compile_to_args(query))
         elif conn is not None:
@@ -179,6 +209,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):

     def close(self) -> None:
         self.db.close()
+        self.is_closed = True

     @contextmanager
     def transaction(self):
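
Taken together, close() flipping is_closed and execute() calling _reconnect() give the SQLite engine a reopen-on-use lifecycle: a Session may close the connection on exit, and any later query transparently reconnects instead of failing on a closed handle. A stripped-down, runnable sketch of that lifecycle (plain sqlite3, illustrative names):

import sqlite3

class ReconnectingEngine:
    def __init__(self, db_file: str = ":memory:"):
        self.db_file = db_file
        self.db = sqlite3.connect(db_file)
        self.is_closed = False

    def close(self) -> None:
        self.db.close()
        self.is_closed = True

    def _reconnect(self) -> None:
        if not self.is_closed:
            raise RuntimeError("Cannot reconnect on still-open DB!")
        self.db = sqlite3.connect(self.db_file)
        self.is_closed = False

    def execute(self, sql: str):
        if self.is_closed:
            self._reconnect()  # reopen lazily instead of raising
        return self.db.execute(sql)

eng = ReconnectingEngine()
eng.close()
eng.execute("SELECT 1")  # reconnects transparently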
@@ -359,6 +390,10 @@ class SQLiteMetastore(AbstractDBMetastore):

         self._init_tables()

+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Close connection upon exit from context manager."""
+        self.close()
+
     def clone(
         self,
         uri: StorageURI = StorageURI(""),
@@ -521,6 +556,10 @@ class SQLiteWarehouse(AbstractWarehouse):

         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)

+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Close connection upon exit from context manager."""
+        self.close()
+
     def clone(self, use_new_connection: bool = False) -> "SQLiteWarehouse":
         return SQLiteWarehouse(self.id_generator.clone(), db=self.db.clone())

{datachain-0.2.16 → datachain-0.2.17}/src/datachain/data_storage/warehouse.py

@@ -70,6 +70,13 @@ class AbstractWarehouse(ABC, Serializable):
     def __init__(self, id_generator: "AbstractIDGenerator"):
         self.id_generator = id_generator

+    def __enter__(self) -> "AbstractWarehouse":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        # Default behavior is to do nothing, as connections may be shared.
+        pass
+
     def cleanup_for_tests(self):
         """Cleanup for tests."""

@@ -158,6 +165,12 @@ class AbstractWarehouse(ABC, Serializable):
         """Closes any active database connections."""
         self.db.close()

+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some Warehouse implementations may handle this
+        differently."""
+        self.close()
+
     #
     # Query Tables
     #
{datachain-0.2.16 → datachain-0.2.17}/src/datachain/lib/arrow.py

@@ -1,5 +1,6 @@
 import re
 from collections.abc import Sequence
+from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Optional

 import pyarrow as pa
@@ -43,13 +44,17 @@ class ArrowGenerator(Generator):
         self.kwargs = kwargs

     def process(self, file: File):
-
-
-
-
+        if self.nrows:
+            path = _nrows_file(file, self.nrows)
+            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+        else:
+            path = file.get_path()
+            ds = dataset(
+                path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
+            )
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
-            for record_batch in ds.to_batches(
+            for record_batch in ds.to_batches():
                 for record in record_batch.to_pylist():
                     vals = list(record.values())
                     if self.output_schema:
@@ -60,8 +65,6 @@ class ArrowGenerator(Generator):
                 else:
                     yield vals
                 index += 1
-                if self.nrows and index >= self.nrows:
-                    return
             pbar.update(len(record_batch))


@@ -125,3 +128,15 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     if isinstance(col_type, pa.lib.DictionaryType):
         return _arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
     raise TypeError(f"{col_type!r} datatypes not supported")
+
+
+def _nrows_file(file: File, nrows: int) -> str:
+    tf = NamedTemporaryFile(delete=False)
+    with file.open(mode="r") as reader:
+        with open(tf.name, "a") as writer:
+            for row, line in enumerate(reader):
+                if row >= nrows:
+                    break
+                writer.write(line)
+                writer.write("\n")
+    return tf.name