datachain 0.2.15__tar.gz → 0.2.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.2.15/src/datachain.egg-info → datachain-0.2.17}/PKG-INFO +71 -12
- {datachain-0.2.15 → datachain-0.2.17}/README.rst +70 -11
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/iptc_exif_xmp_lib.py +2 -1
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/openimage-detect.py +1 -1
- {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/json-csv-reader.py +6 -7
- {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/torch-loader.py +1 -1
- {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/wds.py +20 -11
- {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/wds_filtered.py +1 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/catalog/catalog.py +52 -51
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/cli.py +1 -1
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/db_engine.py +6 -2
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/id_generator.py +14 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/metastore.py +15 -2
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/sqlite.py +45 -6
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/warehouse.py +17 -6
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/arrow.py +22 -7
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/dc.py +37 -26
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/file.py +3 -3
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/signal_schema.py +37 -6
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/listing.py +22 -10
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/dataset.py +17 -17
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/session.py +19 -4
- {datachain-0.2.15 → datachain-0.2.17/src/datachain.egg-info}/PKG-INFO +71 -12
- {datachain-0.2.15 → datachain-0.2.17}/tests/conftest.py +50 -23
- {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_catalog.py +1 -1
- {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_datachain.py +25 -15
- {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_dataset_query.py +43 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_arrow.py +0 -17
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_datachain.py +372 -156
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_datachain_merge.py +24 -20
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_feature_utils.py +4 -4
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_signal_schema.py +29 -2
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_catalog_loader.py +24 -30
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_data_storage.py +17 -17
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_database_engine.py +9 -11
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_id_generator.py +6 -8
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_metastore.py +7 -9
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_warehouse.py +7 -9
- {datachain-0.2.15 → datachain-0.2.17}/.cruft.json +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.gitattributes +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.github/codecov.yaml +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.github/dependabot.yml +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.github/workflows/release.yml +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.github/workflows/tests.yml +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.gitignore +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/.pre-commit-config.yaml +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/CONTRIBUTING.rst +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/LICENSE +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/docs/assets/datachain.png +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/docs/assets/flowchart.png +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/docs/index.md +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/docs/references/datachain.md +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/docs/references/datatype.md +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/docs/references/file.md +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/docs/references/index.md +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/docs/references/sql.md +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/docs/references/torch.md +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/docs/references/udf.md +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/README.md +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/json-metadata-tutorial.ipynb +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/llm/llm_chatbot_evaluation.ipynb +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/llm_and_nlp/llm-claude.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/mkdocs.yml +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/noxfile.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/pyproject.toml +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/setup.cfg +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/__main__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/asyn.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/cache.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/cli_utils.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/azure.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/gcs.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/local.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/s3.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/config.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/dataset.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/error.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/job.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/clip.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/image.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/settings.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/text.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/udf.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/utils.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/node.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/progress.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/py.typed +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/batch.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/builtins.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/metrics.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/params.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/schema.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/udf.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/remote/studio.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/types.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/utils.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/storage.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain/utils.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/data.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/examples/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/examples/wds_data.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/func/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_client.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_datasets.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_ls.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_pull.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_pytorch.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_query.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/scripts/feature_class.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/scripts/name_len_normal.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/test_cli_e2e.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/test_query_e2e.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_asyn.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_cache.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_catalog.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_client.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_dataset.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_listing.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_query_params.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_serializer.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_session.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_storage.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_udf.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_utils.py +0 -0
- {datachain-0.2.15 → datachain-0.2.17}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.2.15
|
|
3
|
+
Version: 0.2.17
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -100,28 +100,87 @@ Requires-Dist: types-requests; extra == "dev"
|
|
|
100
100
|
AI 🔗 DataChain
|
|
101
101
|
----------------
|
|
102
102
|
|
|
103
|
-
DataChain is
|
|
104
|
-
data
|
|
103
|
+
DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
|
|
104
|
+
AI engineers build a metadata layer on top of unstructured files and analyze data using
|
|
105
|
+
this layer.
|
|
105
106
|
|
|
106
|
-
|
|
107
|
+
📂 **Raw Files Processing**
|
|
108
|
+
Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
|
|
109
|
+
Local), version and update datasets.
|
|
107
110
|
|
|
108
|
-
|
|
111
|
+
🌟 **Metadata layer.**
|
|
112
|
+
Build a metadata layer on top of files using structured sources like CSV, Parquet,
|
|
113
|
+
and JSON files.
|
|
109
114
|
|
|
110
|
-
|
|
115
|
+
⭐ **Metadata enrichment.**
|
|
116
|
+
Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
|
|
111
117
|
|
|
118
|
+
🛠️ **Data Transformation.**
|
|
119
|
+
Transform metadata using traditional methods like filtering, grouping, joining, and
|
|
120
|
+
others.
|
|
112
121
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
The typical use cases include Computer Vision data curation, LLM analytics,
|
|
117
|
-
and validation of multimodal AI applications.
|
|
122
|
+
🐍 **User-friendly interface.**
|
|
123
|
+
Operate efficiently with familiar Python objects and object fields, eliminating the
|
|
124
|
+
need for SQL.
|
|
118
125
|
|
|
119
126
|
|
|
120
127
|
.. code:: console
|
|
121
128
|
|
|
122
129
|
$ pip install datachain
|
|
123
130
|
|
|
124
|
-
|
|
131
|
+
|
|
132
|
+
Data Structures
|
|
133
|
+
===============
|
|
134
|
+
|
|
135
|
+
DataChain introduces expressive data structures tailored for AI-specific workload:
|
|
136
|
+
|
|
137
|
+
- **Dataset:** Preserves the file-references and meta-information. Takes care of Python
|
|
138
|
+
object serialization, dataset versioning and difference. Operations on dataset:
|
|
139
|
+
|
|
140
|
+
- **Transformations:** traditional data-frame or SQL operations such as filtering,
|
|
141
|
+
grouping, joining.
|
|
142
|
+
- **Enrichments:** mapping, aggregating and generating using customer’s Python
|
|
143
|
+
code. This is needed to work with ML inference and LLM calls.
|
|
144
|
+
|
|
145
|
+
- **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
|
|
146
|
+
mode - only when needed.
|
|
147
|
+
|
|
148
|
+
DataChain name comes from these major data structures: dataset and chaining.
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
What’s new in DataChain?
|
|
152
|
+
========================
|
|
153
|
+
|
|
154
|
+
The project combines multiple ideas from different areas in order to simplify AI
|
|
155
|
+
use-cases and at the same time to fit it into traditional data infrastructure.
|
|
156
|
+
|
|
157
|
+
- **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
|
|
158
|
+
native language for AI. It’s powered by `Pydantic`_ data models.
|
|
159
|
+
- **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
|
|
160
|
+
group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
|
|
161
|
+
needed for distributed computations.
|
|
162
|
+
- **Resuming data processing** (in development). Introduces idempotent operations,
|
|
163
|
+
allowing data processing to resume from the last successful process file/record/batch
|
|
164
|
+
if it fails due to issues like failed LLM calls, ML inference or file download.
|
|
165
|
+
|
|
166
|
+
Additional relatively new ideas:
|
|
167
|
+
|
|
168
|
+
- **Functional style data processing.** Using a functional/chaining approach to data
|
|
169
|
+
processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
|
|
170
|
+
- **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
|
|
171
|
+
and implements data versioning, extending ideas from DVC (developed by the same team).
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
What DataChain is NOT?
|
|
175
|
+
======================
|
|
176
|
+
|
|
177
|
+
- **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
|
|
178
|
+
`SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
|
|
179
|
+
version.
|
|
180
|
+
- **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
|
|
181
|
+
it delegates heavy data transformations to underlying data warehouses and focuses on
|
|
182
|
+
AI specific data enrichments and orchestrating all the pieces together.
|
|
183
|
+
|
|
125
184
|
|
|
126
185
|
Quick Start
|
|
127
186
|
-----------
|
|
@@ -16,28 +16,87 @@
|
|
|
16
16
|
AI 🔗 DataChain
|
|
17
17
|
----------------
|
|
18
18
|
|
|
19
|
-
DataChain is
|
|
20
|
-
data
|
|
19
|
+
DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
|
|
20
|
+
AI engineers build a metadata layer on top of unstructured files and analyze data using
|
|
21
|
+
this layer.
|
|
21
22
|
|
|
22
|
-
|
|
23
|
+
📂 **Raw Files Processing**
|
|
24
|
+
Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
|
|
25
|
+
Local), version and update datasets.
|
|
23
26
|
|
|
24
|
-
|
|
27
|
+
🌟 **Metadata layer.**
|
|
28
|
+
Build a metadata layer on top of files using structured sources like CSV, Parquet,
|
|
29
|
+
and JSON files.
|
|
25
30
|
|
|
26
|
-
|
|
31
|
+
⭐ **Metadata enrichment.**
|
|
32
|
+
Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
|
|
27
33
|
|
|
34
|
+
🛠️ **Data Transformation.**
|
|
35
|
+
Transform metadata using traditional methods like filtering, grouping, joining, and
|
|
36
|
+
others.
|
|
28
37
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
The typical use cases include Computer Vision data curation, LLM analytics,
|
|
33
|
-
and validation of multimodal AI applications.
|
|
38
|
+
🐍 **User-friendly interface.**
|
|
39
|
+
Operate efficiently with familiar Python objects and object fields, eliminating the
|
|
40
|
+
need for SQL.
|
|
34
41
|
|
|
35
42
|
|
|
36
43
|
.. code:: console
|
|
37
44
|
|
|
38
45
|
$ pip install datachain
|
|
39
46
|
|
|
40
|
-
|
|
47
|
+
|
|
48
|
+
Data Structures
|
|
49
|
+
===============
|
|
50
|
+
|
|
51
|
+
DataChain introduces expressive data structures tailored for AI-specific workload:
|
|
52
|
+
|
|
53
|
+
- **Dataset:** Preserves the file-references and meta-information. Takes care of Python
|
|
54
|
+
object serialization, dataset versioning and difference. Operations on dataset:
|
|
55
|
+
|
|
56
|
+
- **Transformations:** traditional data-frame or SQL operations such as filtering,
|
|
57
|
+
grouping, joining.
|
|
58
|
+
- **Enrichments:** mapping, aggregating and generating using customer’s Python
|
|
59
|
+
code. This is needed to work with ML inference and LLM calls.
|
|
60
|
+
|
|
61
|
+
- **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
|
|
62
|
+
mode - only when needed.
|
|
63
|
+
|
|
64
|
+
DataChain name comes from these major data structures: dataset and chaining.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
What’s new in DataChain?
|
|
68
|
+
========================
|
|
69
|
+
|
|
70
|
+
The project combines multiple ideas from different areas in order to simplify AI
|
|
71
|
+
use-cases and at the same time to fit it into traditional data infrastructure.
|
|
72
|
+
|
|
73
|
+
- **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
|
|
74
|
+
native language for AI. It’s powered by `Pydantic`_ data models.
|
|
75
|
+
- **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
|
|
76
|
+
group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
|
|
77
|
+
needed for distributed computations.
|
|
78
|
+
- **Resuming data processing** (in development). Introduces idempotent operations,
|
|
79
|
+
allowing data processing to resume from the last successful process file/record/batch
|
|
80
|
+
if it fails due to issues like failed LLM calls, ML inference or file download.
|
|
81
|
+
|
|
82
|
+
Additional relatively new ideas:
|
|
83
|
+
|
|
84
|
+
- **Functional style data processing.** Using a functional/chaining approach to data
|
|
85
|
+
processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
|
|
86
|
+
- **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
|
|
87
|
+
and implements data versioning, extending ideas from DVC (developed by the same team).
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
What DataChain is NOT?
|
|
91
|
+
======================
|
|
92
|
+
|
|
93
|
+
- **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
|
|
94
|
+
`SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
|
|
95
|
+
version.
|
|
96
|
+
- **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
|
|
97
|
+
it delegates heavy data transformations to underlying data warehouses and focuses on
|
|
98
|
+
AI specific data enrichments and orchestrating all the pieces together.
|
|
99
|
+
|
|
41
100
|
|
|
42
101
|
Quick Start
|
|
43
102
|
-----------
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
# pip install defusedxml
|
|
1
2
|
import json
|
|
2
3
|
|
|
3
4
|
from PIL import (
|
|
@@ -63,7 +64,7 @@ if __name__ == "__main__":
|
|
|
63
64
|
DataChain.from_storage(source, type="image")
|
|
64
65
|
.settings(parallel=-1)
|
|
65
66
|
.filter(C("file.name").glob("*.jpg"))
|
|
66
|
-
.limit(
|
|
67
|
+
.limit(5000)
|
|
67
68
|
.map(
|
|
68
69
|
image_description,
|
|
69
70
|
params=["file"],
|
|
@@ -36,7 +36,7 @@ def main():
|
|
|
36
36
|
print("========================================================================")
|
|
37
37
|
uri = "gs://datachain-demo/jsonl/object.jsonl"
|
|
38
38
|
jsonl_ds = DataChain.from_json(uri, meta_type="jsonl", print_schema=True)
|
|
39
|
-
|
|
39
|
+
jsonl_ds.show()
|
|
40
40
|
|
|
41
41
|
print()
|
|
42
42
|
print("========================================================================")
|
|
@@ -49,8 +49,7 @@ def main():
|
|
|
49
49
|
json_pairs_ds = DataChain.from_json(
|
|
50
50
|
uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage"
|
|
51
51
|
)
|
|
52
|
-
|
|
53
|
-
# print(list(json_pairs_ds.collect())[0])
|
|
52
|
+
json_pairs_ds.show()
|
|
54
53
|
|
|
55
54
|
uri = "gs://datachain-demo/coco2017/annotations_captions/"
|
|
56
55
|
|
|
@@ -72,7 +71,7 @@ def main():
|
|
|
72
71
|
static_json_ds = DataChain.from_json(
|
|
73
72
|
uri, jmespath="licenses", spec=LicenseFeature, nrows=3
|
|
74
73
|
)
|
|
75
|
-
|
|
74
|
+
static_json_ds.show()
|
|
76
75
|
|
|
77
76
|
print()
|
|
78
77
|
print("========================================================================")
|
|
@@ -88,16 +87,16 @@ def main():
|
|
|
88
87
|
print("========================================================================")
|
|
89
88
|
static_csv_ds = DataChain.from_csv(uri, output=ChatDialog, object_name="chat")
|
|
90
89
|
static_csv_ds.print_schema()
|
|
91
|
-
|
|
90
|
+
static_csv_ds.show()
|
|
92
91
|
|
|
93
|
-
uri = "gs://datachain-demo/laion-aesthetics-csv"
|
|
92
|
+
uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
|
|
94
93
|
print()
|
|
95
94
|
print("========================================================================")
|
|
96
95
|
print("dynamic CSV with header schema test parsing 3/3M objects")
|
|
97
96
|
print("========================================================================")
|
|
98
97
|
dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3)
|
|
99
98
|
dynamic_csv_ds.print_schema()
|
|
100
|
-
|
|
99
|
+
dynamic_csv_ds.show()
|
|
101
100
|
|
|
102
101
|
|
|
103
102
|
if __name__ == "__main__":
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
|
|
3
1
|
from datachain import C, DataChain
|
|
4
2
|
from datachain.lib.webdataset import process_webdataset
|
|
5
3
|
from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
|
|
@@ -9,25 +7,36 @@ wds = (
|
|
|
9
7
|
.filter(C("file.name").glob("00000000.tar"))
|
|
10
8
|
.settings(cache=True)
|
|
11
9
|
.gen(laion=process_webdataset(spec=WDSLaion), params="file")
|
|
10
|
+
.save() # materialize chain to avoid downloading data multiple times
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
meta_pq = (
|
|
14
|
+
DataChain.from_parquet("gs://datachain-demo/datacomp-small/metadata/0020f*.parquet")
|
|
15
|
+
.filter(
|
|
16
|
+
C("uid").in_(values[0] for values in wds.select("laion.json.uid").collect())
|
|
17
|
+
)
|
|
18
|
+
.map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
|
|
19
|
+
.save()
|
|
12
20
|
)
|
|
13
21
|
|
|
14
22
|
meta_emd = (
|
|
15
|
-
DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata")
|
|
16
|
-
.filter(C("file.name").glob("0020f*.npz"))
|
|
23
|
+
DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata/0020f*.npz")
|
|
17
24
|
.gen(emd=process_laion_meta)
|
|
25
|
+
.filter(
|
|
26
|
+
C("emd.index").in_(
|
|
27
|
+
values[0] for values in meta_pq.select("source.index").collect()
|
|
28
|
+
)
|
|
29
|
+
)
|
|
18
30
|
.map(stem=lambda file: file.get_file_stem(), params=["emd.file"], output=str)
|
|
19
31
|
)
|
|
20
32
|
|
|
21
|
-
meta_pq = DataChain.from_parquet(
|
|
22
|
-
"gs://datachain-demo/datacomp-small/metadata/0020f*.parquet"
|
|
23
|
-
).map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
|
|
24
33
|
|
|
25
34
|
meta = meta_emd.merge(
|
|
26
|
-
meta_pq,
|
|
35
|
+
meta_pq,
|
|
36
|
+
on=["stem", "emd.index"],
|
|
37
|
+
right_on=["stem", "source.index"],
|
|
27
38
|
)
|
|
28
39
|
|
|
29
40
|
res = wds.merge(meta, on="laion.json.uid", right_on="uid")
|
|
30
41
|
|
|
31
|
-
|
|
32
|
-
with pd.option_context("display.max_columns", None):
|
|
33
|
-
print(df)
|
|
42
|
+
res.show(3)
|
|
@@ -236,36 +236,36 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
236
236
|
import lz4.frame
|
|
237
237
|
import pandas as pd
|
|
238
238
|
|
|
239
|
-
metastore
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
urls = list(urls)
|
|
244
|
-
while urls:
|
|
245
|
-
for url in urls:
|
|
246
|
-
if self.should_check_for_status():
|
|
247
|
-
self.check_for_status()
|
|
248
|
-
|
|
249
|
-
r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
|
|
250
|
-
if r.status_code == 404:
|
|
251
|
-
time.sleep(PULL_DATASET_SLEEP_INTERVAL)
|
|
252
|
-
# moving to the next url
|
|
253
|
-
continue
|
|
239
|
+
# metastore and warehouse are not thread safe
|
|
240
|
+
with self.metastore.clone() as metastore, self.warehouse.clone() as warehouse:
|
|
241
|
+
dataset = metastore.get_dataset(self.dataset_name)
|
|
254
242
|
|
|
255
|
-
|
|
243
|
+
urls = list(urls)
|
|
244
|
+
while urls:
|
|
245
|
+
for url in urls:
|
|
246
|
+
if self.should_check_for_status():
|
|
247
|
+
self.check_for_status()
|
|
256
248
|
|
|
257
|
-
|
|
249
|
+
r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
|
|
250
|
+
if r.status_code == 404:
|
|
251
|
+
time.sleep(PULL_DATASET_SLEEP_INTERVAL)
|
|
252
|
+
# moving to the next url
|
|
253
|
+
continue
|
|
258
254
|
|
|
259
|
-
|
|
255
|
+
r.raise_for_status()
|
|
260
256
|
|
|
261
|
-
|
|
262
|
-
df = df.drop("sys__id", axis=1)
|
|
257
|
+
df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
|
|
263
258
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
259
|
+
self.fix_columns(df)
|
|
260
|
+
|
|
261
|
+
# id will be autogenerated in DB
|
|
262
|
+
df = df.drop("sys__id", axis=1)
|
|
263
|
+
|
|
264
|
+
inserted = warehouse.insert_dataset_rows(
|
|
265
|
+
df, dataset, self.dataset_version
|
|
266
|
+
)
|
|
267
|
+
self.increase_counter(inserted) # type: ignore [arg-type]
|
|
268
|
+
urls.remove(url)
|
|
269
269
|
|
|
270
270
|
|
|
271
271
|
@dataclass
|
|
@@ -720,7 +720,6 @@ class Catalog:
|
|
|
720
720
|
client.uri, posixpath.join(prefix, "")
|
|
721
721
|
)
|
|
722
722
|
source_metastore = self.metastore.clone(client.uri)
|
|
723
|
-
source_warehouse = self.warehouse.clone()
|
|
724
723
|
|
|
725
724
|
columns = [
|
|
726
725
|
Column("vtype", String),
|
|
@@ -1217,16 +1216,14 @@ class Catalog:
|
|
|
1217
1216
|
def get_temp_table_names(self) -> list[str]:
|
|
1218
1217
|
return self.warehouse.get_temp_table_names()
|
|
1219
1218
|
|
|
1220
|
-
def
|
|
1219
|
+
def cleanup_tables(self, names: Iterable[str]) -> None:
|
|
1221
1220
|
"""
|
|
1222
|
-
Drop tables
|
|
1221
|
+
Drop tables passed.
|
|
1223
1222
|
|
|
1224
|
-
This should be implemented
|
|
1225
|
-
|
|
1226
|
-
needed. When running the same `DatasetQuery` multiple times we
|
|
1227
|
-
may use the same temporary table names.
|
|
1223
|
+
This should be implemented to ensure that the provided tables
|
|
1224
|
+
are cleaned up as soon as they are no longer needed.
|
|
1228
1225
|
"""
|
|
1229
|
-
self.warehouse.
|
|
1226
|
+
self.warehouse.cleanup_tables(names)
|
|
1230
1227
|
self.id_generator.delete_uris(names)
|
|
1231
1228
|
|
|
1232
1229
|
def create_dataset_from_sources(
|
|
@@ -1837,25 +1834,29 @@ class Catalog:
|
|
|
1837
1834
|
if signed_urls:
|
|
1838
1835
|
shuffle(signed_urls)
|
|
1839
1836
|
|
|
1840
|
-
|
|
1841
|
-
self.metastore.clone(),
|
|
1842
|
-
self.warehouse.clone(),
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
|
|
1848
|
-
|
|
1849
|
-
|
|
1850
|
-
|
|
1851
|
-
signed_urls,
|
|
1852
|
-
math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
|
|
1853
|
-
),
|
|
1854
|
-
dataset_save_progress_bar,
|
|
1837
|
+
with (
|
|
1838
|
+
self.metastore.clone() as metastore,
|
|
1839
|
+
self.warehouse.clone() as warehouse,
|
|
1840
|
+
):
|
|
1841
|
+
rows_fetcher = DatasetRowsFetcher(
|
|
1842
|
+
metastore,
|
|
1843
|
+
warehouse,
|
|
1844
|
+
remote_config,
|
|
1845
|
+
dataset.name,
|
|
1846
|
+
version,
|
|
1847
|
+
schema,
|
|
1855
1848
|
)
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1849
|
+
try:
|
|
1850
|
+
rows_fetcher.run(
|
|
1851
|
+
batched(
|
|
1852
|
+
signed_urls,
|
|
1853
|
+
math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
|
|
1854
|
+
),
|
|
1855
|
+
dataset_save_progress_bar,
|
|
1856
|
+
)
|
|
1857
|
+
except:
|
|
1858
|
+
self.remove_dataset(dataset.name, version)
|
|
1859
|
+
raise
|
|
1859
1860
|
|
|
1860
1861
|
dataset = self.metastore.update_dataset_status(
|
|
1861
1862
|
dataset,
|
|
@@ -910,7 +910,7 @@ def garbage_collect(catalog: "Catalog"):
|
|
|
910
910
|
print("Nothing to clean up.")
|
|
911
911
|
else:
|
|
912
912
|
print(f"Garbage collecting {len(temp_tables)} tables.")
|
|
913
|
-
catalog.
|
|
913
|
+
catalog.cleanup_tables(temp_tables)
|
|
914
914
|
|
|
915
915
|
|
|
916
916
|
def completion(shell: str) -> str:
|
|
@@ -4,7 +4,6 @@ from collections.abc import Iterator
|
|
|
4
4
|
from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
|
|
5
5
|
|
|
6
6
|
import sqlalchemy as sa
|
|
7
|
-
from attrs import frozen
|
|
8
7
|
from sqlalchemy.sql import FROM_LINTING
|
|
9
8
|
from sqlalchemy.sql.roles import DDLRole
|
|
10
9
|
|
|
@@ -23,13 +22,18 @@ logger = logging.getLogger("datachain")
|
|
|
23
22
|
SELECT_BATCH_SIZE = 100_000 # number of rows to fetch at a time
|
|
24
23
|
|
|
25
24
|
|
|
26
|
-
@frozen
|
|
27
25
|
class DatabaseEngine(ABC, Serializable):
|
|
28
26
|
dialect: ClassVar["Dialect"]
|
|
29
27
|
|
|
30
28
|
engine: "Engine"
|
|
31
29
|
metadata: "MetaData"
|
|
32
30
|
|
|
31
|
+
def __enter__(self) -> "DatabaseEngine":
|
|
32
|
+
return self
|
|
33
|
+
|
|
34
|
+
def __exit__(self, exc_type, exc_value, traceback) -> None:
|
|
35
|
+
self.close()
|
|
36
|
+
|
|
33
37
|
@abstractmethod
|
|
34
38
|
def clone(self) -> "DatabaseEngine":
|
|
35
39
|
"""Clones DatabaseEngine implementation."""
|
|
@@ -33,6 +33,16 @@ class AbstractIDGenerator(ABC, Serializable):
|
|
|
33
33
|
def cleanup_for_tests(self):
|
|
34
34
|
"""Cleanup for tests."""
|
|
35
35
|
|
|
36
|
+
def close(self) -> None:
|
|
37
|
+
"""Closes any active database connections."""
|
|
38
|
+
|
|
39
|
+
def close_on_exit(self) -> None:
|
|
40
|
+
"""Closes any active database or HTTP connections, called on Session exit or
|
|
41
|
+
for test cleanup only, as some ID Generator implementations may handle this
|
|
42
|
+
differently.
|
|
43
|
+
"""
|
|
44
|
+
self.close()
|
|
45
|
+
|
|
36
46
|
@abstractmethod
|
|
37
47
|
def init_id(self, uri: str) -> None:
|
|
38
48
|
"""Initializes the ID generator for the given URI with zero last_id."""
|
|
@@ -83,6 +93,10 @@ class AbstractDBIDGenerator(AbstractIDGenerator):
|
|
|
83
93
|
def clone(self) -> "AbstractDBIDGenerator":
|
|
84
94
|
"""Clones AbstractIDGenerator implementation."""
|
|
85
95
|
|
|
96
|
+
def close(self) -> None:
|
|
97
|
+
"""Closes any active database connections."""
|
|
98
|
+
self.db.close()
|
|
99
|
+
|
|
86
100
|
@property
|
|
87
101
|
def db(self) -> "DatabaseEngine":
|
|
88
102
|
return self._db
|
|
@@ -78,6 +78,13 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
78
78
|
self.uri = uri
|
|
79
79
|
self.partial_id: Optional[int] = partial_id
|
|
80
80
|
|
|
81
|
+
def __enter__(self) -> "AbstractMetastore":
|
|
82
|
+
"""Returns self upon entering context manager."""
|
|
83
|
+
return self
|
|
84
|
+
|
|
85
|
+
def __exit__(self, exc_type, exc_value, traceback) -> None:
|
|
86
|
+
"""Default behavior is to do nothing, as connections may be shared."""
|
|
87
|
+
|
|
81
88
|
@abstractmethod
|
|
82
89
|
def clone(
|
|
83
90
|
self,
|
|
@@ -97,7 +104,13 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
97
104
|
def close(self) -> None:
|
|
98
105
|
"""Closes any active database or HTTP connections."""
|
|
99
106
|
|
|
100
|
-
def
|
|
107
|
+
def close_on_exit(self) -> None:
|
|
108
|
+
"""Closes any active database or HTTP connections, called on Session exit or
|
|
109
|
+
for test cleanup only, as some Metastore implementations may handle this
|
|
110
|
+
differently."""
|
|
111
|
+
self.close()
|
|
112
|
+
|
|
113
|
+
def cleanup_tables(self, temp_table_names: list[str]) -> None:
|
|
101
114
|
"""Cleanup temp tables."""
|
|
102
115
|
|
|
103
116
|
def cleanup_for_tests(self) -> None:
|
|
@@ -457,7 +470,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
457
470
|
"""Closes any active database connections."""
|
|
458
471
|
self.db.close()
|
|
459
472
|
|
|
460
|
-
def
|
|
473
|
+
def cleanup_tables(self, temp_table_names: list[str]) -> None:
|
|
461
474
|
"""Cleanup temp tables."""
|
|
462
475
|
self.id_generator.delete_uris(temp_table_names)
|
|
463
476
|
|