datachain 0.3.2__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.3.2 → datachain-0.3.3}/.github/workflows/benchmarks.yml +9 -6
- datachain-0.3.3/.github/workflows/tests-studio.yml +103 -0
- {datachain-0.3.2 → datachain-0.3.3}/.github/workflows/tests.yml +0 -92
- {datachain-0.3.2/src/datachain.egg-info → datachain-0.3.3}/PKG-INFO +2 -2
- {datachain-0.3.2 → datachain-0.3.3}/examples/computer_vision/openimage-detect.py +1 -1
- {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/common_sql_functions.py +2 -2
- {datachain-0.3.2 → datachain-0.3.3}/pyproject.toml +1 -1
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/asyn.py +20 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/catalog/catalog.py +2 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/catalog/loader.py +75 -50
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/azure.py +13 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/gcs.py +12 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/local.py +11 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/s3.py +12 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/sqlite.py +55 -14
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/warehouse.py +17 -3
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/arrow.py +1 -1
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/values_to_tuples.py +14 -8
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/data_model.py +1 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/dc.py +25 -6
- datachain-0.3.3/src/datachain/lib/listing.py +111 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/dataset.py +22 -12
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/session.py +9 -2
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/sqlite/base.py +30 -4
- {datachain-0.3.2 → datachain-0.3.3/src/datachain.egg-info}/PKG-INFO +2 -2
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain.egg-info/SOURCES.txt +8 -22
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.3.2 → datachain-0.3.3}/tests/benchmarks/conftest.py +6 -0
- datachain-0.3.3/tests/benchmarks/datasets/.dvc/.gitignore +3 -0
- datachain-0.3.3/tests/benchmarks/datasets/.dvc/config +4 -0
- datachain-0.3.3/tests/benchmarks/datasets/.gitignore +1 -0
- datachain-0.3.3/tests/benchmarks/datasets/laion-tiny.npz.dvc +5 -0
- datachain-0.3.3/tests/benchmarks/test_datachain.py +22 -0
- datachain-0.3.3/tests/func/test_listing.py +34 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_datachain.py +169 -42
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_datachain_merge.py +35 -2
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_path.py +2 -1
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_asyn.py +29 -1
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_catalog_loader.py +41 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_database_engine.py +21 -1
- datachain-0.3.2/examples/computer_vision/fashion_product_images/.gitignore +0 -5
- datachain-0.3.2/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -2211
- datachain-0.3.2/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -4103
- datachain-0.3.2/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -1081
- datachain-0.3.2/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -754
- datachain-0.3.2/examples/computer_vision/fashion_product_images/README.md +0 -60
- datachain-0.3.2/examples/computer_vision/fashion_product_images/requirements.txt +0 -6
- datachain-0.3.2/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -47
- datachain-0.3.2/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -47
- datachain-0.3.2/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -36
- datachain-0.3.2/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -44
- datachain-0.3.2/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -52
- datachain-0.3.2/examples/computer_vision/fashion_product_images/src/clustering.py +0 -41
- datachain-0.3.2/examples/computer_vision/fashion_product_images/src/train.py +0 -143
- datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
- datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
- datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
- datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
- datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
- datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
- datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
- datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/.cruft.json +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/.gitattributes +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/.github/codecov.yaml +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/.github/dependabot.yml +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/.github/workflows/release.yml +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/.gitignore +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/.pre-commit-config.yaml +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/LICENSE +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/README.rst +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/docs/assets/datachain.png +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/docs/index.md +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/docs/references/datachain.md +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/docs/references/datatype.md +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/docs/references/file.md +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/docs/references/index.md +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/docs/references/sql.md +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/docs/references/torch.md +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/docs/references/udf.md +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/llm_and_nlp/llm-claude.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/mkdocs.yml +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/noxfile.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/setup.cfg +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/__main__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/cache.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/cli.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/config.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/dataset.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/error.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/job.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/file.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/listing.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/node.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/progress.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/py.typed +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/builtins.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/params.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/schema.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/storage.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain/utils.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/conftest.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/data.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/examples/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/func/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_catalog.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_client.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_datachain.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_datasets.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_ls.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_pull.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_query.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_client.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_session.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_udf.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.2 → datachain-0.3.3}/tests/utils.py +0 -0
|
@@ -5,23 +5,24 @@ on:
|
|
|
5
5
|
- cron: '0 0 * * *'
|
|
6
6
|
pull_request:
|
|
7
7
|
types: [opened, reopened, labeled, synchronize]
|
|
8
|
-
workflow_dispatch:
|
|
8
|
+
workflow_dispatch:
|
|
9
9
|
|
|
10
10
|
env:
|
|
11
11
|
FORCE_COLOR: "1"
|
|
12
12
|
|
|
13
|
+
concurrency:
|
|
14
|
+
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
|
15
|
+
cancel-in-progress: true
|
|
16
|
+
|
|
13
17
|
jobs:
|
|
14
18
|
run:
|
|
15
|
-
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') }}
|
|
16
19
|
runs-on: ubuntu-latest
|
|
17
|
-
|
|
18
20
|
steps:
|
|
19
21
|
- uses: actions/checkout@v4
|
|
20
|
-
|
|
21
|
-
- name: Set up Python 3.10
|
|
22
|
+
- name: Set up Python 3.12
|
|
22
23
|
uses: actions/setup-python@v5
|
|
23
24
|
with:
|
|
24
|
-
python-version: '3.10'
|
|
25
|
+
python-version: '3.12'
|
|
25
26
|
cache: 'pip'
|
|
26
27
|
|
|
27
28
|
- name: Upgrade nox and uv
|
|
@@ -30,5 +31,7 @@ jobs:
|
|
|
30
31
|
nox --version
|
|
31
32
|
uv --version
|
|
32
33
|
|
|
34
|
+
- run: uv pip install dvc[gs] --system
|
|
35
|
+
- run: dvc --cd tests/benchmarks/datasets pull
|
|
33
36
|
- name: Run benchmarks
|
|
34
37
|
run: nox -s bench
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
name: Studio Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
env:
|
|
10
|
+
FORCE_COLOR: "1"
|
|
11
|
+
BRANCH: ${{ github.head_ref || github.ref_name }}
|
|
12
|
+
|
|
13
|
+
concurrency:
|
|
14
|
+
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
|
15
|
+
cancel-in-progress: true
|
|
16
|
+
|
|
17
|
+
jobs:
|
|
18
|
+
studio:
|
|
19
|
+
if: '!github.event.pull_request.head.repo.fork'
|
|
20
|
+
runs-on: ubuntu-latest-16-cores
|
|
21
|
+
strategy:
|
|
22
|
+
matrix:
|
|
23
|
+
pyv: ['3.12']
|
|
24
|
+
group: [1, 2, 3, 4, 5, 6]
|
|
25
|
+
services:
|
|
26
|
+
postgres:
|
|
27
|
+
image: postgres:16.3
|
|
28
|
+
ports:
|
|
29
|
+
- 5432:5432
|
|
30
|
+
env:
|
|
31
|
+
POSTGRES_USER: test
|
|
32
|
+
POSTGRES_DB: database
|
|
33
|
+
POSTGRES_HOST_AUTH_METHOD: trust
|
|
34
|
+
clickhouse:
|
|
35
|
+
image: clickhouse/clickhouse-server:24
|
|
36
|
+
ports:
|
|
37
|
+
- 8123:8123
|
|
38
|
+
- 9010:9000
|
|
39
|
+
env:
|
|
40
|
+
CLICKHOUSE_DB: studio_local_db
|
|
41
|
+
CLICKHOUSE_USER: studio_local
|
|
42
|
+
CLICKHOUSE_PASSWORD: ch123456789!
|
|
43
|
+
CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
|
|
44
|
+
redis:
|
|
45
|
+
image: redis:7.2.5
|
|
46
|
+
ports:
|
|
47
|
+
- 6379:6379
|
|
48
|
+
steps:
|
|
49
|
+
- name: Studio branch name
|
|
50
|
+
env:
|
|
51
|
+
BRANCH: ${{ env.BRANCH }}
|
|
52
|
+
STUDIO_READ_ACCESS_TOKEN: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
|
|
53
|
+
run: |
|
|
54
|
+
echo "DataChain branch: $BRANCH"
|
|
55
|
+
if [[ "$BRANCH" == "main" ]]
|
|
56
|
+
then
|
|
57
|
+
STUDIO_BRANCH=develop
|
|
58
|
+
elif git ls-remote --heads https://"$STUDIO_READ_ACCESS_TOKEN"@github.com/iterative/studio.git "$BRANCH" | grep -F "$BRANCH" 2>&1>/dev/null
|
|
59
|
+
then
|
|
60
|
+
STUDIO_BRANCH="$BRANCH"
|
|
61
|
+
else
|
|
62
|
+
STUDIO_BRANCH=develop
|
|
63
|
+
fi
|
|
64
|
+
echo "STUDIO_BRANCH=$STUDIO_BRANCH" >> $GITHUB_ENV
|
|
65
|
+
echo "Studio branch: $STUDIO_BRANCH"
|
|
66
|
+
|
|
67
|
+
- name: Check out Studio
|
|
68
|
+
uses: actions/checkout@v4
|
|
69
|
+
with:
|
|
70
|
+
fetch-depth: 0
|
|
71
|
+
repository: iterative/studio
|
|
72
|
+
ref: ${{ env.STUDIO_BRANCH }}
|
|
73
|
+
token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
|
|
74
|
+
|
|
75
|
+
- name: Check out repository
|
|
76
|
+
uses: actions/checkout@v4
|
|
77
|
+
with:
|
|
78
|
+
path: './backend/datachain'
|
|
79
|
+
fetch-depth: 0
|
|
80
|
+
|
|
81
|
+
- name: Set up Python ${{ matrix.pyv }}
|
|
82
|
+
uses: actions/setup-python@v5
|
|
83
|
+
with:
|
|
84
|
+
python-version: ${{ matrix.pyv }}
|
|
85
|
+
cache: 'pip'
|
|
86
|
+
|
|
87
|
+
- name: Install uv
|
|
88
|
+
run: |
|
|
89
|
+
python -m pip install --upgrade uv
|
|
90
|
+
uv --version
|
|
91
|
+
|
|
92
|
+
- name: Install dependencies
|
|
93
|
+
run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
|
|
94
|
+
|
|
95
|
+
- name: Run tests
|
|
96
|
+
# Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
|
|
97
|
+
run: >
|
|
98
|
+
pytest
|
|
99
|
+
--config-file=pyproject.toml -rs
|
|
100
|
+
--splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
|
|
101
|
+
-m 'not benchmark'
|
|
102
|
+
tests ../datachain/tests
|
|
103
|
+
working-directory: backend/datachain_server
|
|
@@ -8,7 +8,6 @@ on:
|
|
|
8
8
|
|
|
9
9
|
env:
|
|
10
10
|
FORCE_COLOR: "1"
|
|
11
|
-
BRANCH: ${{ github.head_ref || github.ref_name }}
|
|
12
11
|
|
|
13
12
|
concurrency:
|
|
14
13
|
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
|
@@ -18,7 +17,6 @@ jobs:
|
|
|
18
17
|
lint:
|
|
19
18
|
runs-on: ubuntu-latest
|
|
20
19
|
steps:
|
|
21
|
-
|
|
22
20
|
- name: Check out the repository
|
|
23
21
|
uses: actions/checkout@v4
|
|
24
22
|
with:
|
|
@@ -112,95 +110,6 @@ jobs:
|
|
|
112
110
|
- name: Build docs
|
|
113
111
|
run: nox -s docs
|
|
114
112
|
|
|
115
|
-
|
|
116
|
-
studio:
|
|
117
|
-
if: '!github.event.pull_request.head.repo.fork'
|
|
118
|
-
runs-on: ubuntu-latest-16-cores
|
|
119
|
-
strategy:
|
|
120
|
-
matrix:
|
|
121
|
-
pyv: ['3.12']
|
|
122
|
-
group: [1, 2, 3, 4, 5, 6]
|
|
123
|
-
services:
|
|
124
|
-
postgres:
|
|
125
|
-
image: postgres:16.3
|
|
126
|
-
ports:
|
|
127
|
-
- 5432:5432
|
|
128
|
-
env:
|
|
129
|
-
POSTGRES_USER: test
|
|
130
|
-
POSTGRES_DB: database
|
|
131
|
-
POSTGRES_HOST_AUTH_METHOD: trust
|
|
132
|
-
clickhouse:
|
|
133
|
-
image: clickhouse/clickhouse-server:24
|
|
134
|
-
ports:
|
|
135
|
-
- 8123:8123
|
|
136
|
-
- 9010:9000
|
|
137
|
-
env:
|
|
138
|
-
CLICKHOUSE_DB: studio_local_db
|
|
139
|
-
CLICKHOUSE_USER: studio_local
|
|
140
|
-
CLICKHOUSE_PASSWORD: ch123456789!
|
|
141
|
-
CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
|
|
142
|
-
redis:
|
|
143
|
-
image: redis:7.2.5
|
|
144
|
-
ports:
|
|
145
|
-
- 6379:6379
|
|
146
|
-
steps:
|
|
147
|
-
|
|
148
|
-
- name: Studio branch name
|
|
149
|
-
env:
|
|
150
|
-
BRANCH: ${{ env.BRANCH }}
|
|
151
|
-
STUDIO_READ_ACCESS_TOKEN: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
|
|
152
|
-
run: |
|
|
153
|
-
echo "DataChain branch: $BRANCH"
|
|
154
|
-
if [[ "$BRANCH" == "main" ]]
|
|
155
|
-
then
|
|
156
|
-
STUDIO_BRANCH=develop
|
|
157
|
-
elif git ls-remote --heads https://"$STUDIO_READ_ACCESS_TOKEN"@github.com/iterative/studio.git "$BRANCH" | grep -F "$BRANCH" 2>&1>/dev/null
|
|
158
|
-
then
|
|
159
|
-
STUDIO_BRANCH="$BRANCH"
|
|
160
|
-
else
|
|
161
|
-
STUDIO_BRANCH=develop
|
|
162
|
-
fi
|
|
163
|
-
echo "STUDIO_BRANCH=$STUDIO_BRANCH" >> $GITHUB_ENV
|
|
164
|
-
echo "Studio branch: $STUDIO_BRANCH"
|
|
165
|
-
|
|
166
|
-
- name: Check out Studio
|
|
167
|
-
uses: actions/checkout@v4
|
|
168
|
-
with:
|
|
169
|
-
fetch-depth: 0
|
|
170
|
-
repository: iterative/studio
|
|
171
|
-
ref: ${{ env.STUDIO_BRANCH }}
|
|
172
|
-
token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
|
|
173
|
-
|
|
174
|
-
- name: Check out repository
|
|
175
|
-
uses: actions/checkout@v4
|
|
176
|
-
with:
|
|
177
|
-
path: './backend/datachain'
|
|
178
|
-
fetch-depth: 0
|
|
179
|
-
|
|
180
|
-
- name: Set up Python ${{ matrix.pyv }}
|
|
181
|
-
uses: actions/setup-python@v5
|
|
182
|
-
with:
|
|
183
|
-
python-version: ${{ matrix.pyv }}
|
|
184
|
-
cache: 'pip'
|
|
185
|
-
|
|
186
|
-
- name: Install uv
|
|
187
|
-
run: |
|
|
188
|
-
python -m pip install --upgrade uv
|
|
189
|
-
uv --version
|
|
190
|
-
|
|
191
|
-
- name: Install dependencies
|
|
192
|
-
run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
|
|
193
|
-
|
|
194
|
-
- name: Run tests
|
|
195
|
-
# Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
|
|
196
|
-
run: >
|
|
197
|
-
pytest
|
|
198
|
-
--config-file=pyproject.toml -rs
|
|
199
|
-
--splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
|
|
200
|
-
tests ../datachain/tests
|
|
201
|
-
working-directory: backend/datachain_server
|
|
202
|
-
|
|
203
|
-
|
|
204
113
|
examples:
|
|
205
114
|
runs-on: ${{ matrix.os }}
|
|
206
115
|
timeout-minutes: 60
|
|
@@ -211,7 +120,6 @@ jobs:
|
|
|
211
120
|
pyv: ['3.9', '3.12']
|
|
212
121
|
group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
|
|
213
122
|
steps:
|
|
214
|
-
|
|
215
123
|
- uses: actions/checkout@v4
|
|
216
124
|
|
|
217
125
|
- name: Set up Python ${{ matrix.pyv }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.3.2
|
|
3
|
+
Version: 0.3.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -76,7 +76,7 @@ Requires-Dist: aiotools>=1.7.0; extra == "tests"
|
|
|
76
76
|
Requires-Dist: requests-mock; extra == "tests"
|
|
77
77
|
Provides-Extra: dev
|
|
78
78
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
79
|
-
Requires-Dist: mypy==1.
|
|
79
|
+
Requires-Dist: mypy==1.11.1; extra == "dev"
|
|
80
80
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
81
81
|
Requires-Dist: types-pytz; extra == "dev"
|
|
82
82
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -54,7 +54,7 @@ source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
|
|
|
54
54
|
.filter(C("file.path").glob("*.jpg") | C("file.path").glob("*.json"))
|
|
55
55
|
.agg(
|
|
56
56
|
openimage_detect,
|
|
57
|
-
partition_by=path.file_stem(
|
|
57
|
+
partition_by=path.file_stem(C("file.path")),
|
|
58
58
|
params=["file"],
|
|
59
59
|
output={"file": File, "bbox": BBox},
|
|
60
60
|
)
|
|
@@ -26,8 +26,8 @@ dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
|
|
|
26
26
|
|
|
27
27
|
(
|
|
28
28
|
dc.mutate(
|
|
29
|
-
stem=path.file_stem(
|
|
30
|
-
ext=path.file_ext(
|
|
29
|
+
stem=path.file_stem(C("file.path")),
|
|
30
|
+
ext=path.file_ext(C("file.path")),
|
|
31
31
|
)
|
|
32
32
|
.select("file.path", "stem", "ext")
|
|
33
33
|
.show(5)
|
|
@@ -224,3 +224,23 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
|
|
|
224
224
|
async def _break_iteration(self) -> None:
|
|
225
225
|
self.heap = []
|
|
226
226
|
self._push_result(self._next_yield, None)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def iter_over_async(ait, loop):
|
|
230
|
+
"""Wrap an asynchronous iterator into a synchronous one"""
|
|
231
|
+
ait = ait.__aiter__()
|
|
232
|
+
|
|
233
|
+
# helper async fn that just gets the next element from the async iterator
|
|
234
|
+
async def get_next():
|
|
235
|
+
try:
|
|
236
|
+
obj = await ait.__anext__()
|
|
237
|
+
return False, obj
|
|
238
|
+
except StopAsyncIteration:
|
|
239
|
+
return True, None
|
|
240
|
+
|
|
241
|
+
# actual sync iterator
|
|
242
|
+
while True:
|
|
243
|
+
done, obj = asyncio.run_coroutine_threadsafe(get_next(), loop).result()
|
|
244
|
+
if done:
|
|
245
|
+
break
|
|
246
|
+
yield obj
|
|
@@ -577,6 +577,7 @@ class Catalog:
|
|
|
577
577
|
warehouse_ready_callback: Optional[
|
|
578
578
|
Callable[["AbstractWarehouse"], None]
|
|
579
579
|
] = None,
|
|
580
|
+
in_memory: bool = False,
|
|
580
581
|
):
|
|
581
582
|
datachain_dir = DataChainDir(cache=cache_dir, tmp=tmp_dir)
|
|
582
583
|
datachain_dir.init()
|
|
@@ -590,6 +591,7 @@ class Catalog:
|
|
|
590
591
|
"tmp_dir": tmp_dir,
|
|
591
592
|
}
|
|
592
593
|
self._warehouse_ready_callback = warehouse_ready_callback
|
|
594
|
+
self.in_memory = in_memory
|
|
593
595
|
|
|
594
596
|
@cached_property
|
|
595
597
|
def warehouse(self) -> "AbstractWarehouse":
|
|
@@ -28,8 +28,10 @@ WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
|
|
|
28
28
|
DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
|
|
29
29
|
DISTRIBUTED_ARG_PREFIX = "DATACHAIN_DISTRIBUTED_ARG_"
|
|
30
30
|
|
|
31
|
+
IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
|
|
31
32
|
|
|
32
|
-
|
|
33
|
+
|
|
34
|
+
def get_id_generator(in_memory: bool = False) -> "AbstractIDGenerator":
|
|
33
35
|
id_generator_serialized = os.environ.get(ID_GENERATOR_SERIALIZED)
|
|
34
36
|
if id_generator_serialized:
|
|
35
37
|
id_generator_obj = deserialize(id_generator_serialized)
|
|
@@ -43,25 +45,31 @@ def get_id_generator() -> "AbstractIDGenerator":
|
|
|
43
45
|
id_generator_import_path = os.environ.get(ID_GENERATOR_IMPORT_PATH)
|
|
44
46
|
id_generator_arg_envs = get_envs_by_prefix(ID_GENERATOR_ARG_PREFIX)
|
|
45
47
|
# Convert env variable names to keyword argument names by lowercasing them
|
|
46
|
-
id_generator_args
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
48
|
+
id_generator_args: dict[str, Any] = {
|
|
49
|
+
k.lower(): v for k, v in id_generator_arg_envs.items()
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
if not id_generator_import_path:
|
|
53
|
+
id_generator_args["in_memory"] = in_memory
|
|
54
|
+
return SQLiteIDGenerator(**id_generator_args)
|
|
55
|
+
if in_memory:
|
|
56
|
+
raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
|
|
57
|
+
# ID generator paths are specified as (for example):
|
|
58
|
+
# datachain.data_storage.SQLiteIDGenerator
|
|
59
|
+
if "." not in id_generator_import_path:
|
|
60
|
+
raise RuntimeError(
|
|
61
|
+
f"Invalid {ID_GENERATOR_IMPORT_PATH} import path:"
|
|
62
|
+
f"{id_generator_import_path}"
|
|
63
|
+
)
|
|
64
|
+
module_name, _, class_name = id_generator_import_path.rpartition(".")
|
|
65
|
+
id_generator = import_module(module_name)
|
|
66
|
+
id_generator_class = getattr(id_generator, class_name)
|
|
61
67
|
return id_generator_class(**id_generator_args)
|
|
62
68
|
|
|
63
69
|
|
|
64
|
-
def get_metastore(
|
|
70
|
+
def get_metastore(
|
|
71
|
+
id_generator: Optional["AbstractIDGenerator"], in_memory: bool = False
|
|
72
|
+
) -> "AbstractMetastore":
|
|
65
73
|
metastore_serialized = os.environ.get(METASTORE_SERIALIZED)
|
|
66
74
|
if metastore_serialized:
|
|
67
75
|
metastore_obj = deserialize(metastore_serialized)
|
|
@@ -78,24 +86,32 @@ def get_metastore(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractMet
|
|
|
78
86
|
metastore_import_path = os.environ.get(METASTORE_IMPORT_PATH)
|
|
79
87
|
metastore_arg_envs = get_envs_by_prefix(METASTORE_ARG_PREFIX)
|
|
80
88
|
# Convert env variable names to keyword argument names by lowercasing them
|
|
81
|
-
metastore_args
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
if
|
|
87
|
-
raise
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
89
|
+
metastore_args: dict[str, Any] = {
|
|
90
|
+
k.lower(): v for k, v in metastore_arg_envs.items()
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
if not metastore_import_path:
|
|
94
|
+
if not isinstance(id_generator, SQLiteIDGenerator):
|
|
95
|
+
raise ValueError("SQLiteMetastore can only be used with SQLiteIDGenerator")
|
|
96
|
+
metastore_args["in_memory"] = in_memory
|
|
97
|
+
return SQLiteMetastore(id_generator, **metastore_args)
|
|
98
|
+
if in_memory:
|
|
99
|
+
raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
|
|
100
|
+
# Metastore paths are specified as (for example):
|
|
101
|
+
# datachain.data_storage.SQLiteMetastore
|
|
102
|
+
if "." not in metastore_import_path:
|
|
103
|
+
raise RuntimeError(
|
|
104
|
+
f"Invalid {METASTORE_IMPORT_PATH} import path: {metastore_import_path}"
|
|
105
|
+
)
|
|
106
|
+
module_name, _, class_name = metastore_import_path.rpartition(".")
|
|
107
|
+
metastore = import_module(module_name)
|
|
108
|
+
metastore_class = getattr(metastore, class_name)
|
|
95
109
|
return metastore_class(id_generator, **metastore_args)
|
|
96
110
|
|
|
97
111
|
|
|
98
|
-
def get_warehouse(
|
|
112
|
+
def get_warehouse(
|
|
113
|
+
id_generator: Optional["AbstractIDGenerator"], in_memory: bool = False
|
|
114
|
+
) -> "AbstractWarehouse":
|
|
99
115
|
warehouse_serialized = os.environ.get(WAREHOUSE_SERIALIZED)
|
|
100
116
|
if warehouse_serialized:
|
|
101
117
|
warehouse_obj = deserialize(warehouse_serialized)
|
|
@@ -112,20 +128,26 @@ def get_warehouse(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractWar
|
|
|
112
128
|
warehouse_import_path = os.environ.get(WAREHOUSE_IMPORT_PATH)
|
|
113
129
|
warehouse_arg_envs = get_envs_by_prefix(WAREHOUSE_ARG_PREFIX)
|
|
114
130
|
# Convert env variable names to keyword argument names by lowercasing them
|
|
115
|
-
warehouse_args
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
if
|
|
121
|
-
raise
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
131
|
+
warehouse_args: dict[str, Any] = {
|
|
132
|
+
k.lower(): v for k, v in warehouse_arg_envs.items()
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
if not warehouse_import_path:
|
|
136
|
+
if not isinstance(id_generator, SQLiteIDGenerator):
|
|
137
|
+
raise ValueError("SQLiteWarehouse can only be used with SQLiteIDGenerator")
|
|
138
|
+
warehouse_args["in_memory"] = in_memory
|
|
139
|
+
return SQLiteWarehouse(id_generator, **warehouse_args)
|
|
140
|
+
if in_memory:
|
|
141
|
+
raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
|
|
142
|
+
# Warehouse paths are specified as (for example):
|
|
143
|
+
# datachain.data_storage.SQLiteWarehouse
|
|
144
|
+
if "." not in warehouse_import_path:
|
|
145
|
+
raise RuntimeError(
|
|
146
|
+
f"Invalid {WAREHOUSE_IMPORT_PATH} import path: {warehouse_import_path}"
|
|
147
|
+
)
|
|
148
|
+
module_name, _, class_name = warehouse_import_path.rpartition(".")
|
|
149
|
+
warehouse = import_module(module_name)
|
|
150
|
+
warehouse_class = getattr(warehouse, class_name)
|
|
129
151
|
return warehouse_class(id_generator, **warehouse_args)
|
|
130
152
|
|
|
131
153
|
|
|
@@ -152,7 +174,9 @@ def get_distributed_class(**kwargs):
|
|
|
152
174
|
return distributed_class(**distributed_args | kwargs)
|
|
153
175
|
|
|
154
176
|
|
|
155
|
-
def get_catalog(
|
|
177
|
+
def get_catalog(
|
|
178
|
+
client_config: Optional[dict[str, Any]] = None, in_memory: bool = False
|
|
179
|
+
) -> Catalog:
|
|
156
180
|
"""
|
|
157
181
|
Function that creates Catalog instance with appropriate metastore
|
|
158
182
|
and warehouse classes. Metastore class can be provided with env variable
|
|
@@ -164,10 +188,11 @@ def get_catalog(client_config: Optional[dict[str, Any]] = None) -> Catalog:
|
|
|
164
188
|
and name of variable after, e.g. if it accepts team_id as kwargs
|
|
165
189
|
we can provide DATACHAIN_METASTORE_ARG_TEAM_ID=12345 env variable.
|
|
166
190
|
"""
|
|
167
|
-
id_generator = get_id_generator()
|
|
191
|
+
id_generator = get_id_generator(in_memory=in_memory)
|
|
168
192
|
return Catalog(
|
|
169
193
|
id_generator=id_generator,
|
|
170
|
-
metastore=get_metastore(id_generator),
|
|
171
|
-
warehouse=get_warehouse(id_generator),
|
|
194
|
+
metastore=get_metastore(id_generator, in_memory=in_memory),
|
|
195
|
+
warehouse=get_warehouse(id_generator, in_memory=in_memory),
|
|
172
196
|
client_config=client_config,
|
|
197
|
+
in_memory=in_memory,
|
|
173
198
|
)
|
|
@@ -3,6 +3,7 @@ from typing import Any
|
|
|
3
3
|
from adlfs import AzureBlobFileSystem
|
|
4
4
|
from tqdm import tqdm
|
|
5
5
|
|
|
6
|
+
from datachain.lib.file import File
|
|
6
7
|
from datachain.node import Entry
|
|
7
8
|
|
|
8
9
|
from .fsspec import DELIMITER, Client, ResultQueue
|
|
@@ -24,6 +25,18 @@ class AzureClient(Client):
|
|
|
24
25
|
size=v.get("size", ""),
|
|
25
26
|
)
|
|
26
27
|
|
|
28
|
+
def info_to_file(self, v: dict[str, Any], path: str) -> File:
    """Build a ``File`` from an Azure object-info dictionary.

    Args:
        v: Info dictionary for one object; ``last_modified`` is required,
            while ``version_id``, ``etag``, ``is_current_version`` and
            ``size`` are optional.
        path: Path stored on the resulting ``File``.

    Returns:
        A ``File`` whose ``source`` is this client's ``uri``.

    Raises:
        KeyError: If ``v`` has no ``last_modified`` entry.
    """
    version_id = v.get("version_id")
    source = self.uri
    # The etag is stored without its surrounding double quotes.
    etag = v.get("etag", "").strip('"')
    version = version_id or ""
    # An object without a version id is always treated as the latest one.
    if version_id is None:
        latest = True
    else:
        latest = bool(v.get("is_current_version"))
    modified = v["last_modified"]
    size = v.get("size", "")
    return File(
        source=source,
        path=path,
        etag=etag,
        version=version,
        is_latest=latest,
        last_modified=modified,
        size=size,
    )
|
|
39
|
+
|
|
27
40
|
async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
|
|
28
41
|
prefix = start_prefix
|
|
29
42
|
if prefix:
|
|
@@ -9,6 +9,7 @@ from dateutil.parser import isoparse
|
|
|
9
9
|
from gcsfs import GCSFileSystem
|
|
10
10
|
from tqdm import tqdm
|
|
11
11
|
|
|
12
|
+
from datachain.lib.file import File
|
|
12
13
|
from datachain.node import Entry
|
|
13
14
|
|
|
14
15
|
from .fsspec import DELIMITER, Client, ResultQueue
|
|
@@ -120,3 +121,14 @@ class GCSClient(Client):
|
|
|
120
121
|
last_modified=self.parse_timestamp(v["updated"]),
|
|
121
122
|
size=v.get("size", ""),
|
|
122
123
|
)
|
|
124
|
+
|
|
125
|
+
def info_to_file(self, v: dict[str, Any], path: str) -> File:
    """Build a ``File`` from a GCS object-info dictionary.

    Args:
        v: Info dictionary for one object; ``updated`` is required, while
            ``etag``, ``generation``, ``timeDeleted`` and ``size`` are
            optional.
        path: Path stored on the resulting ``File``.

    Returns:
        A ``File`` whose ``source`` is this client's ``uri``.

    Raises:
        KeyError: If ``v`` has no ``updated`` entry.
    """
    fields = {
        "source": self.uri,
        "path": path,
        "etag": v.get("etag", ""),
        # The object "generation" is used as the version.
        "version": v.get("generation", ""),
        # Latest unless the info carries a truthy "timeDeleted" value.
        "is_latest": not v.get("timeDeleted"),
        "last_modified": self.parse_timestamp(v["updated"]),
        "size": v.get("size", ""),
    }
    return File(**fields)
|
|
@@ -7,6 +7,7 @@ from urllib.parse import urlparse
|
|
|
7
7
|
|
|
8
8
|
from fsspec.implementations.local import LocalFileSystem
|
|
9
9
|
|
|
10
|
+
from datachain.lib.file import File
|
|
10
11
|
from datachain.node import Entry
|
|
11
12
|
from datachain.storage import StorageURI
|
|
12
13
|
|
|
@@ -144,6 +145,16 @@ class FileClient(Client):
|
|
|
144
145
|
size=v.get("size", ""),
|
|
145
146
|
)
|
|
146
147
|
|
|
148
|
+
def info_to_file(self, v: dict[str, Any], path: str) -> File:
    """Build a ``File`` from a local-filesystem info dictionary.

    Args:
        v: Info dictionary for one file; ``mtime`` is required and ``size``
            is optional.  Assumes ``mtime`` is a float POSIX timestamp --
            TODO confirm against the filesystem's ``info`` output.
        path: Path stored on the resulting ``File``.

    Returns:
        A ``File`` whose ``source`` is this client's ``uri`` and which is
        always marked as the latest version.

    Raises:
        KeyError: If ``v`` has no ``mtime`` entry.
    """
    source = self.uri
    size = v.get("size", "")
    mtime = v["mtime"]
    fields = {
        "source": source,
        "path": path,
        "size": size,
        # The hex form of the modification time is used as the etag.
        "etag": mtime.hex(),
        "is_latest": True,
        # The modification time is interpreted as a UTC timestamp.
        "last_modified": datetime.fromtimestamp(mtime, timezone.utc),
    }
    return File(**fields)
|
|
157
|
+
|
|
147
158
|
def fetch_nodes(
|
|
148
159
|
self,
|
|
149
160
|
nodes,
|
|
@@ -5,6 +5,7 @@ from botocore.exceptions import NoCredentialsError
|
|
|
5
5
|
from s3fs import S3FileSystem
|
|
6
6
|
from tqdm import tqdm
|
|
7
7
|
|
|
8
|
+
from datachain.lib.file import File
|
|
8
9
|
from datachain.node import Entry
|
|
9
10
|
|
|
10
11
|
from .fsspec import DELIMITER, Client, ResultQueue
|
|
@@ -167,3 +168,14 @@ class ClientS3(Client):
|
|
|
167
168
|
owner_name=v.get("Owner", {}).get("DisplayName", ""),
|
|
168
169
|
owner_id=v.get("Owner", {}).get("ID", ""),
|
|
169
170
|
)
|
|
171
|
+
|
|
172
|
+
def info_to_file(self, v: dict[str, Any], path: str) -> File:
    """Build a ``File`` from an S3 object-info dictionary.

    Args:
        v: Info dictionary for one object; ``size`` is required, while
            ``VersionId``, ``ETag``, ``IsLatest`` and ``LastModified`` are
            optional.
        path: Path stored on the resulting ``File``.

    Returns:
        A ``File`` whose ``source`` is this client's ``uri``.

    Raises:
        KeyError: If ``v`` has no ``size`` entry.
    """
    source = self.uri
    size = v["size"]
    # The raw version id is passed through the class's own cleaner.
    version = ClientS3.clean_s3_version(v.get("VersionId", ""))
    # The ETag is stored without its surrounding double quotes.
    etag = v.get("ETag", "").strip('"')
    # Entries without an explicit flag are treated as the latest version.
    latest = v.get("IsLatest", True)
    modified = v.get("LastModified", "")
    return File(
        source=source,
        path=path,
        size=size,
        version=version,
        etag=etag,
        is_latest=latest,
        last_modified=modified,
    )
|