datachain 0.14.5__tar.gz → 0.16.0__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.14.5 → datachain-0.16.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.14.5/src/datachain.egg-info → datachain-0.16.0}/PKG-INFO +1 -1
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/datachain.md +4 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/multimodal/wds_filtered.py +1 -1
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/__init__.py +4 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/catalog/catalog.py +19 -9
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/catalog/loader.py +11 -7
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/__init__.py +1 -1
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/datasets.py +3 -3
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/show.py +2 -2
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/parser/__init__.py +2 -2
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/metastore.py +5 -5
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/dataset.py +8 -8
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/values_to_tuples.py +23 -14
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dataset_info.py +18 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/__init__.py +4 -1
- datachain-0.16.0/src/datachain/lib/dc/database.py +151 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/datachain.py +19 -8
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/datasets.py +52 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/pandas.py +8 -1
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/records.py +12 -14
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/signal_schema.py +10 -1
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/udf.py +2 -1
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/dataset.py +12 -14
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/dispatch.py +7 -2
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/schema.py +4 -1
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/remote/studio.py +2 -2
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/studio.py +2 -2
- {datachain-0.14.5 → datachain-0.16.0/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain.egg-info/SOURCES.txt +2 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/conftest.py +11 -8
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_catalog.py +3 -3
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_datachain.py +32 -8
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_dataset_query.py +0 -60
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_datasets.py +7 -7
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_hidden_field.py +1 -1
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_pull.py +10 -8
- datachain-0.16.0/tests/func/test_read_database.py +175 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/test_cli_studio.py +4 -4
- {datachain-0.14.5 → datachain-0.16.0}/tests/test_import_time.py +1 -1
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_datachain.py +118 -2
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_feature_utils.py +0 -5
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_catalog_loader.py +21 -10
- {datachain-0.14.5 → datachain-0.16.0}/.cruft.json +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.gitattributes +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.github/codecov.yaml +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.github/dependabot.yml +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.github/workflows/release.yml +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/.gitignore +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/LICENSE +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/README.rst +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/contributing.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/examples.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/index.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/overrides/main.html +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/quick-start.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/func.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/index.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/remotes.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/toolkit.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/torch.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/references/udf.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/docs/tutorials.md +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/mkdocs.yml +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/noxfile.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/pyproject.toml +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/setup.cfg +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/__main__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/asyn.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cache.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/local.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/config.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/error.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/array.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/base.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/func.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/path.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/random.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/string.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/window.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/job.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/file.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/listing.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/node.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/progress.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/py.typed +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/params.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/session.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/data.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/examples/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_client.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_file.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_hf.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_image.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_listing.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_ls.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_query.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_session.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_video.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/test_atomicity.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/test_telemetry.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_client.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_config.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_func.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_query.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_session.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.14.5 → datachain-0.16.0}/tests/utils.py +0 -0
|
@@ -31,6 +31,10 @@ for examples of how to create a chain.
|
|
|
31
31
|
|
|
32
32
|
::: datachain.lib.dc.values.read_values
|
|
33
33
|
|
|
34
|
+
::: datachain.lib.dc.database.read_database
|
|
35
|
+
|
|
36
|
+
::: datachain.lib.dc.database.ConnectionType
|
|
37
|
+
|
|
34
38
|
::: datachain.lib.dc.DataChain
|
|
35
39
|
|
|
36
40
|
::: datachain.lib.utils.DataChainError
|
|
@@ -5,8 +5,10 @@ from datachain.lib.dc import (
|
|
|
5
5
|
DataChain,
|
|
6
6
|
Sys,
|
|
7
7
|
datasets,
|
|
8
|
+
delete_dataset,
|
|
8
9
|
listings,
|
|
9
10
|
read_csv,
|
|
11
|
+
read_database,
|
|
10
12
|
read_dataset,
|
|
11
13
|
read_hf,
|
|
12
14
|
read_json,
|
|
@@ -61,11 +63,13 @@ __all__ = [
|
|
|
61
63
|
"VideoFragment",
|
|
62
64
|
"VideoFrame",
|
|
63
65
|
"datasets",
|
|
66
|
+
"delete_dataset",
|
|
64
67
|
"is_chain_type",
|
|
65
68
|
"listings",
|
|
66
69
|
"metrics",
|
|
67
70
|
"param",
|
|
68
71
|
"read_csv",
|
|
72
|
+
"read_database",
|
|
69
73
|
"read_dataset",
|
|
70
74
|
"read_hf",
|
|
71
75
|
"read_json",
|
|
@@ -776,7 +776,7 @@ class Catalog:
|
|
|
776
776
|
listing: Optional[bool] = False,
|
|
777
777
|
uuid: Optional[str] = None,
|
|
778
778
|
description: Optional[str] = None,
|
|
779
|
-
|
|
779
|
+
attrs: Optional[list[str]] = None,
|
|
780
780
|
) -> "DatasetRecord":
|
|
781
781
|
"""
|
|
782
782
|
Creates new dataset of a specific version.
|
|
@@ -794,16 +794,16 @@ class Catalog:
|
|
|
794
794
|
dataset = self.get_dataset(name)
|
|
795
795
|
default_version = dataset.next_version
|
|
796
796
|
|
|
797
|
-
if (description or
|
|
798
|
-
dataset.description != description or dataset.
|
|
797
|
+
if (description or attrs) and (
|
|
798
|
+
dataset.description != description or dataset.attrs != attrs
|
|
799
799
|
):
|
|
800
800
|
description = description or dataset.description
|
|
801
|
-
|
|
801
|
+
attrs = attrs or dataset.attrs
|
|
802
802
|
|
|
803
803
|
self.update_dataset(
|
|
804
804
|
dataset,
|
|
805
805
|
description=description,
|
|
806
|
-
|
|
806
|
+
attrs=attrs,
|
|
807
807
|
)
|
|
808
808
|
|
|
809
809
|
except DatasetNotFoundError:
|
|
@@ -817,7 +817,7 @@ class Catalog:
|
|
|
817
817
|
schema=schema,
|
|
818
818
|
ignore_if_exists=True,
|
|
819
819
|
description=description,
|
|
820
|
-
|
|
820
|
+
attrs=attrs,
|
|
821
821
|
)
|
|
822
822
|
|
|
823
823
|
version = version or default_version
|
|
@@ -1299,7 +1299,17 @@ class Catalog:
|
|
|
1299
1299
|
name: str,
|
|
1300
1300
|
version: Optional[int] = None,
|
|
1301
1301
|
force: Optional[bool] = False,
|
|
1302
|
+
studio: Optional[bool] = False,
|
|
1302
1303
|
):
|
|
1304
|
+
from datachain.remote.studio import StudioClient
|
|
1305
|
+
|
|
1306
|
+
if studio:
|
|
1307
|
+
client = StudioClient()
|
|
1308
|
+
response = client.rm_dataset(name, version=version, force=force)
|
|
1309
|
+
if not response.ok:
|
|
1310
|
+
raise DataChainError(response.message)
|
|
1311
|
+
return
|
|
1312
|
+
|
|
1303
1313
|
dataset = self.get_dataset(name)
|
|
1304
1314
|
if not version and not force:
|
|
1305
1315
|
raise ValueError(f"Missing dataset version from input for dataset {name}")
|
|
@@ -1324,15 +1334,15 @@ class Catalog:
|
|
|
1324
1334
|
name: str,
|
|
1325
1335
|
new_name: Optional[str] = None,
|
|
1326
1336
|
description: Optional[str] = None,
|
|
1327
|
-
|
|
1337
|
+
attrs: Optional[list[str]] = None,
|
|
1328
1338
|
) -> DatasetRecord:
|
|
1329
1339
|
update_data = {}
|
|
1330
1340
|
if new_name:
|
|
1331
1341
|
update_data["name"] = new_name
|
|
1332
1342
|
if description is not None:
|
|
1333
1343
|
update_data["description"] = description
|
|
1334
|
-
if
|
|
1335
|
-
update_data["
|
|
1344
|
+
if attrs is not None:
|
|
1345
|
+
update_data["attrs"] = attrs # type: ignore[assignment]
|
|
1336
1346
|
|
|
1337
1347
|
dataset = self.get_dataset(name)
|
|
1338
1348
|
return self.update_dataset(dataset, **update_data)
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import os
|
|
2
|
+
import sys
|
|
2
3
|
from importlib import import_module
|
|
3
4
|
from typing import TYPE_CHECKING, Any, Optional
|
|
4
5
|
|
|
@@ -15,6 +16,7 @@ METASTORE_ARG_PREFIX = "DATACHAIN_METASTORE_ARG_"
|
|
|
15
16
|
WAREHOUSE_SERIALIZED = "DATACHAIN__WAREHOUSE"
|
|
16
17
|
WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
|
|
17
18
|
WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
|
|
19
|
+
DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
|
|
18
20
|
DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
|
|
19
21
|
|
|
20
22
|
IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
|
|
@@ -100,19 +102,21 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
|
|
|
100
102
|
return warehouse_class(**warehouse_args)
|
|
101
103
|
|
|
102
104
|
|
|
103
|
-
def get_udf_distributor_class() -> type["AbstractUDFDistributor"]:
|
|
104
|
-
distributed_import_path
|
|
105
|
+
def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
|
|
106
|
+
if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
|
|
107
|
+
return None
|
|
105
108
|
|
|
106
|
-
if not distributed_import_path:
|
|
107
|
-
raise RuntimeError(
|
|
108
|
-
f"{DISTRIBUTED_IMPORT_PATH} import path is required "
|
|
109
|
-
"for distributed UDF processing."
|
|
110
|
-
)
|
|
111
109
|
# Distributed class paths are specified as (for example): module.classname
|
|
112
110
|
if "." not in distributed_import_path:
|
|
113
111
|
raise RuntimeError(
|
|
114
112
|
f"Invalid {DISTRIBUTED_IMPORT_PATH} import path: {distributed_import_path}"
|
|
115
113
|
)
|
|
114
|
+
|
|
115
|
+
# Optional: set the Python path to look for the module
|
|
116
|
+
distributed_import_pythonpath = os.environ.get(DISTRIBUTED_IMPORT_PYTHONPATH)
|
|
117
|
+
if distributed_import_pythonpath and distributed_import_pythonpath not in sys.path:
|
|
118
|
+
sys.path.insert(0, distributed_import_pythonpath)
|
|
119
|
+
|
|
116
120
|
module_name, _, class_name = distributed_import_path.rpartition(".")
|
|
117
121
|
distributed = import_module(module_name)
|
|
118
122
|
return getattr(distributed, class_name)
|
|
@@ -154,7 +154,7 @@ def edit_dataset(
|
|
|
154
154
|
name: str,
|
|
155
155
|
new_name: Optional[str] = None,
|
|
156
156
|
description: Optional[str] = None,
|
|
157
|
-
|
|
157
|
+
attrs: Optional[list[str]] = None,
|
|
158
158
|
studio: bool = False,
|
|
159
159
|
local: bool = False,
|
|
160
160
|
all: bool = True,
|
|
@@ -167,9 +167,9 @@ def edit_dataset(
|
|
|
167
167
|
|
|
168
168
|
if all or local:
|
|
169
169
|
try:
|
|
170
|
-
catalog.edit_dataset(name, new_name, description,
|
|
170
|
+
catalog.edit_dataset(name, new_name, description, attrs)
|
|
171
171
|
except DatasetNotFoundError:
|
|
172
172
|
print("Dataset not found in local", file=sys.stderr)
|
|
173
173
|
|
|
174
174
|
if (all or studio) and token:
|
|
175
|
-
edit_studio_dataset(team, name, new_name, description,
|
|
175
|
+
edit_studio_dataset(team, name, new_name, description, attrs)
|
|
@@ -42,8 +42,8 @@ def show(
|
|
|
42
42
|
print("Name: ", name)
|
|
43
43
|
if dataset.description:
|
|
44
44
|
print("Description: ", dataset.description)
|
|
45
|
-
if dataset.
|
|
46
|
-
print("
|
|
45
|
+
if dataset.attrs:
|
|
46
|
+
print("Attributes: ", ",".join(dataset.attrs))
|
|
47
47
|
print("\n")
|
|
48
48
|
|
|
49
49
|
show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
|
|
@@ -217,9 +217,9 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
|
217
217
|
help="Dataset description",
|
|
218
218
|
)
|
|
219
219
|
parse_edit_dataset.add_argument(
|
|
220
|
-
"--
|
|
220
|
+
"--attrs",
|
|
221
221
|
nargs="+",
|
|
222
|
-
help="Dataset
|
|
222
|
+
help="Dataset attributes",
|
|
223
223
|
)
|
|
224
224
|
parse_edit_dataset.add_argument(
|
|
225
225
|
"--studio",
|
|
@@ -120,7 +120,7 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
120
120
|
schema: Optional[dict[str, Any]] = None,
|
|
121
121
|
ignore_if_exists: bool = False,
|
|
122
122
|
description: Optional[str] = None,
|
|
123
|
-
|
|
123
|
+
attrs: Optional[list[str]] = None,
|
|
124
124
|
) -> DatasetRecord:
|
|
125
125
|
"""Creates new dataset."""
|
|
126
126
|
|
|
@@ -326,7 +326,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
326
326
|
Column("id", Integer, primary_key=True),
|
|
327
327
|
Column("name", Text, nullable=False),
|
|
328
328
|
Column("description", Text),
|
|
329
|
-
Column("
|
|
329
|
+
Column("attrs", JSON, nullable=True),
|
|
330
330
|
Column("status", Integer, nullable=False),
|
|
331
331
|
Column("feature_schema", JSON, nullable=True),
|
|
332
332
|
Column("created_at", DateTime(timezone=True)),
|
|
@@ -521,7 +521,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
521
521
|
schema: Optional[dict[str, Any]] = None,
|
|
522
522
|
ignore_if_exists: bool = False,
|
|
523
523
|
description: Optional[str] = None,
|
|
524
|
-
|
|
524
|
+
attrs: Optional[list[str]] = None,
|
|
525
525
|
**kwargs, # TODO registered = True / False
|
|
526
526
|
) -> DatasetRecord:
|
|
527
527
|
"""Creates new dataset."""
|
|
@@ -538,7 +538,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
538
538
|
query_script=query_script,
|
|
539
539
|
schema=json.dumps(schema or {}),
|
|
540
540
|
description=description,
|
|
541
|
-
|
|
541
|
+
attrs=json.dumps(attrs or []),
|
|
542
542
|
)
|
|
543
543
|
if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
|
|
544
544
|
# SQLite and PostgreSQL both support 'on_conflict_do_nothing',
|
|
@@ -621,7 +621,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
621
621
|
dataset_values = {}
|
|
622
622
|
for field, value in kwargs.items():
|
|
623
623
|
if field in self._dataset_fields[1:]:
|
|
624
|
-
if field in ["
|
|
624
|
+
if field in ["attrs", "schema"]:
|
|
625
625
|
values[field] = json.dumps(value) if value else None
|
|
626
626
|
else:
|
|
627
627
|
values[field] = value
|
|
@@ -329,7 +329,7 @@ class DatasetRecord:
|
|
|
329
329
|
id: int
|
|
330
330
|
name: str
|
|
331
331
|
description: Optional[str]
|
|
332
|
-
|
|
332
|
+
attrs: list[str]
|
|
333
333
|
schema: dict[str, Union[SQLType, type[SQLType]]]
|
|
334
334
|
feature_schema: dict
|
|
335
335
|
versions: list[DatasetVersion]
|
|
@@ -357,7 +357,7 @@ class DatasetRecord:
|
|
|
357
357
|
id: int,
|
|
358
358
|
name: str,
|
|
359
359
|
description: Optional[str],
|
|
360
|
-
|
|
360
|
+
attrs: str,
|
|
361
361
|
status: int,
|
|
362
362
|
feature_schema: Optional[str],
|
|
363
363
|
created_at: datetime,
|
|
@@ -387,7 +387,7 @@ class DatasetRecord:
|
|
|
387
387
|
version_schema: str,
|
|
388
388
|
version_job_id: Optional[str] = None,
|
|
389
389
|
) -> "DatasetRecord":
|
|
390
|
-
|
|
390
|
+
attrs_lst: list[str] = json.loads(attrs) if attrs else []
|
|
391
391
|
schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
|
|
392
392
|
version_schema_dct: dict[str, str] = (
|
|
393
393
|
json.loads(version_schema) if version_schema else {}
|
|
@@ -418,7 +418,7 @@ class DatasetRecord:
|
|
|
418
418
|
id,
|
|
419
419
|
name,
|
|
420
420
|
description,
|
|
421
|
-
|
|
421
|
+
attrs_lst,
|
|
422
422
|
cls.parse_schema(schema_dct), # type: ignore[arg-type]
|
|
423
423
|
json.loads(feature_schema) if feature_schema else {},
|
|
424
424
|
[dataset_version],
|
|
@@ -562,7 +562,7 @@ class DatasetListRecord:
|
|
|
562
562
|
id: int
|
|
563
563
|
name: str
|
|
564
564
|
description: Optional[str]
|
|
565
|
-
|
|
565
|
+
attrs: list[str]
|
|
566
566
|
versions: list[DatasetListVersion]
|
|
567
567
|
created_at: Optional[datetime] = None
|
|
568
568
|
|
|
@@ -572,7 +572,7 @@ class DatasetListRecord:
|
|
|
572
572
|
id: int,
|
|
573
573
|
name: str,
|
|
574
574
|
description: Optional[str],
|
|
575
|
-
|
|
575
|
+
attrs: str,
|
|
576
576
|
created_at: datetime,
|
|
577
577
|
version_id: int,
|
|
578
578
|
version_uuid: str,
|
|
@@ -588,7 +588,7 @@ class DatasetListRecord:
|
|
|
588
588
|
version_query_script: Optional[str],
|
|
589
589
|
version_job_id: Optional[str] = None,
|
|
590
590
|
) -> "DatasetListRecord":
|
|
591
|
-
|
|
591
|
+
attrs_lst: list[str] = json.loads(attrs) if attrs else []
|
|
592
592
|
|
|
593
593
|
dataset_version = DatasetListVersion.parse(
|
|
594
594
|
version_id,
|
|
@@ -610,7 +610,7 @@ class DatasetListRecord:
|
|
|
610
610
|
id,
|
|
611
611
|
name,
|
|
612
612
|
description,
|
|
613
|
-
|
|
613
|
+
attrs_lst,
|
|
614
614
|
[dataset_version],
|
|
615
615
|
created_at,
|
|
616
616
|
)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
+
import itertools
|
|
1
2
|
from collections.abc import Sequence
|
|
2
|
-
from typing import Any, Union
|
|
3
|
+
from typing import Any, Optional, Union
|
|
3
4
|
|
|
4
5
|
from datachain.lib.data_model import (
|
|
5
6
|
DataType,
|
|
@@ -66,21 +67,29 @@ def values_to_tuples( # noqa: C901, PLR0912
|
|
|
66
67
|
f"signal '{k}' is not present in the output",
|
|
67
68
|
)
|
|
68
69
|
else:
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
raise ValuesToTupleError(
|
|
76
|
-
ds_name,
|
|
77
|
-
f"signal '{k}' has unsupported type '{typ.__name__}'."
|
|
78
|
-
f" Please use DataModel types: {DataTypeNames}",
|
|
70
|
+
# FIXME: Stops as soon as it finds the first non-None value.
|
|
71
|
+
# If a non-None value appears early, it won't check the remaining items for
|
|
72
|
+
# `None` values.
|
|
73
|
+
try:
|
|
74
|
+
pos, first_not_none_element = next(
|
|
75
|
+
itertools.dropwhile(lambda pair: pair[1] is None, enumerate(v))
|
|
79
76
|
)
|
|
80
|
-
|
|
81
|
-
|
|
77
|
+
except StopIteration:
|
|
78
|
+
typ = str # default to str if all values are None or has length 0
|
|
79
|
+
nullable = True
|
|
82
80
|
else:
|
|
83
|
-
|
|
81
|
+
nullable = pos > 0
|
|
82
|
+
typ = type(first_not_none_element) # type: ignore[assignment]
|
|
83
|
+
if not is_chain_type(typ):
|
|
84
|
+
raise ValuesToTupleError(
|
|
85
|
+
ds_name,
|
|
86
|
+
f"signal '{k}' has unsupported type '{typ.__name__}'."
|
|
87
|
+
f" Please use DataModel types: {DataTypeNames}",
|
|
88
|
+
)
|
|
89
|
+
if isinstance(first_not_none_element, list):
|
|
90
|
+
typ = list[type(first_not_none_element[0])] # type: ignore[assignment, misc]
|
|
91
|
+
|
|
92
|
+
types_map[k] = Optional[typ] if nullable else typ # type: ignore[assignment]
|
|
84
93
|
|
|
85
94
|
if length < 0:
|
|
86
95
|
length = len_
|
|
@@ -32,11 +32,28 @@ class DatasetInfo(DataModel):
|
|
|
32
32
|
metrics: dict[str, Any] = Field(default={})
|
|
33
33
|
error_message: str = Field(default="")
|
|
34
34
|
error_stack: str = Field(default="")
|
|
35
|
+
attrs: list[str] = Field(default=[])
|
|
35
36
|
|
|
36
37
|
@property
|
|
37
38
|
def is_temp(self) -> bool:
|
|
38
39
|
return Session.is_temp_dataset(self.name)
|
|
39
40
|
|
|
41
|
+
def has_attr(self, attr: str) -> bool:
|
|
42
|
+
s = attr.split("=")
|
|
43
|
+
if len(s) == 1:
|
|
44
|
+
return attr in self.attrs
|
|
45
|
+
|
|
46
|
+
name = s[0]
|
|
47
|
+
value = s[1]
|
|
48
|
+
for a in self.attrs:
|
|
49
|
+
s = a.split("=")
|
|
50
|
+
if value == "*" and s[0] == name:
|
|
51
|
+
return True
|
|
52
|
+
if len(s) == 2 and s[0] == name and s[1] == value:
|
|
53
|
+
return True
|
|
54
|
+
|
|
55
|
+
return False
|
|
56
|
+
|
|
40
57
|
@staticmethod
|
|
41
58
|
def _validate_dict(
|
|
42
59
|
v: Optional[Union[str, dict]],
|
|
@@ -83,4 +100,5 @@ class DatasetInfo(DataModel):
|
|
|
83
100
|
metrics=job.metrics if job else {},
|
|
84
101
|
error_message=version.error_message,
|
|
85
102
|
error_stack=version.error_stack,
|
|
103
|
+
attrs=dataset.attrs,
|
|
86
104
|
)
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from .csv import read_csv
|
|
2
|
+
from .database import read_database
|
|
2
3
|
from .datachain import C, Column, DataChain
|
|
3
|
-
from .datasets import datasets, read_dataset
|
|
4
|
+
from .datasets import datasets, delete_dataset, read_dataset
|
|
4
5
|
from .hf import read_hf
|
|
5
6
|
from .json import read_json
|
|
6
7
|
from .listings import listings
|
|
@@ -19,8 +20,10 @@ __all__ = [
|
|
|
19
20
|
"DatasetPrepareError",
|
|
20
21
|
"Sys",
|
|
21
22
|
"datasets",
|
|
23
|
+
"delete_dataset",
|
|
22
24
|
"listings",
|
|
23
25
|
"read_csv",
|
|
26
|
+
"read_database",
|
|
24
27
|
"read_dataset",
|
|
25
28
|
"read_hf",
|
|
26
29
|
"read_json",
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import itertools
|
|
3
|
+
import os
|
|
4
|
+
import sqlite3
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
6
|
+
|
|
7
|
+
import sqlalchemy
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from collections.abc import Iterator, Mapping, Sequence
|
|
11
|
+
|
|
12
|
+
import sqlalchemy.orm # noqa: TC004
|
|
13
|
+
|
|
14
|
+
from datachain.lib.data_model import DataType
|
|
15
|
+
from datachain.query import Session
|
|
16
|
+
|
|
17
|
+
from .datachain import DataChain
|
|
18
|
+
|
|
19
|
+
ConnectionType = Union[
|
|
20
|
+
str,
|
|
21
|
+
sqlalchemy.engine.URL,
|
|
22
|
+
sqlalchemy.engine.interfaces.Connectable,
|
|
23
|
+
sqlalchemy.engine.Engine,
|
|
24
|
+
sqlalchemy.engine.Connection,
|
|
25
|
+
sqlalchemy.orm.Session,
|
|
26
|
+
sqlite3.Connection,
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@contextlib.contextmanager
|
|
31
|
+
def _connect(
|
|
32
|
+
connection: "ConnectionType",
|
|
33
|
+
) -> "Iterator[Union[sqlalchemy.engine.Connection, sqlalchemy.orm.Session]]":
|
|
34
|
+
import sqlalchemy.orm
|
|
35
|
+
|
|
36
|
+
with contextlib.ExitStack() as stack:
|
|
37
|
+
engine_kwargs = {"echo": bool(os.environ.get("DEBUG_SHOW_SQL_QUERIES"))}
|
|
38
|
+
if isinstance(connection, (str, sqlalchemy.URL)):
|
|
39
|
+
engine = sqlalchemy.create_engine(connection, **engine_kwargs)
|
|
40
|
+
stack.callback(engine.dispose)
|
|
41
|
+
yield stack.enter_context(engine.connect())
|
|
42
|
+
elif isinstance(connection, sqlite3.Connection):
|
|
43
|
+
engine = sqlalchemy.create_engine(
|
|
44
|
+
"sqlite://", creator=lambda: connection, **engine_kwargs
|
|
45
|
+
)
|
|
46
|
+
# do not close the connection, as it is managed by the caller
|
|
47
|
+
yield engine.connect()
|
|
48
|
+
elif isinstance(connection, sqlalchemy.Engine):
|
|
49
|
+
yield stack.enter_context(connection.connect())
|
|
50
|
+
elif isinstance(connection, (sqlalchemy.Connection, sqlalchemy.orm.Session)):
|
|
51
|
+
# do not close the connection, as it is managed by the caller
|
|
52
|
+
yield connection
|
|
53
|
+
else:
|
|
54
|
+
raise TypeError(f"Unsupported connection type: {type(connection).__name__}")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _infer_schema(
|
|
58
|
+
result: "sqlalchemy.engine.Result",
|
|
59
|
+
to_infer: list[str],
|
|
60
|
+
infer_schema_length: Optional[int] = 100,
|
|
61
|
+
) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
|
|
62
|
+
from datachain.lib.convert.values_to_tuples import values_to_tuples
|
|
63
|
+
|
|
64
|
+
if not to_infer:
|
|
65
|
+
return [], {}
|
|
66
|
+
|
|
67
|
+
rows = list(itertools.islice(result, infer_schema_length))
|
|
68
|
+
values = {col: [row._mapping[col] for row in rows] for col in to_infer}
|
|
69
|
+
_, output_schema, _ = values_to_tuples("", **values)
|
|
70
|
+
return rows, output_schema
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def read_database(
|
|
74
|
+
query: Union[str, "sqlalchemy.sql.expression.Executable"],
|
|
75
|
+
connection: "ConnectionType",
|
|
76
|
+
params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
|
|
77
|
+
*,
|
|
78
|
+
output: Optional["dict[str, DataType]"] = None,
|
|
79
|
+
session: Optional["Session"] = None,
|
|
80
|
+
settings: Optional[dict] = None,
|
|
81
|
+
in_memory: bool = False,
|
|
82
|
+
infer_schema_length: Optional[int] = 100,
|
|
83
|
+
) -> "DataChain":
|
|
84
|
+
"""
|
|
85
|
+
Read the results of a SQL query into a DataChain, using a given database connection.
|
|
86
|
+
|
|
87
|
+
Args:
|
|
88
|
+
query:
|
|
89
|
+
The SQL query to execute. Can be a raw SQL string or a SQLAlchemy
|
|
90
|
+
`Executable` object.
|
|
91
|
+
connection: SQLAlchemy connectable, str, or a sqlite3 connection
|
|
92
|
+
Using SQLAlchemy makes it possible to use any DB supported by that
|
|
93
|
+
library. If a DBAPI2 object, only sqlite3 is supported. The user is
|
|
94
|
+
responsible for engine disposal and connection closure for the
|
|
95
|
+
SQLAlchemy connectable; str connections are closed automatically.
|
|
96
|
+
params: Parameters to pass to execute method.
|
|
97
|
+
output: A dictionary mapping column names to types, used to override the
|
|
98
|
+
schema inferred from the query results.
|
|
99
|
+
session: Session to use for the chain.
|
|
100
|
+
settings: Settings to use for the chain.
|
|
101
|
+
in_memory: If True, creates an in-memory session. Defaults to False.
|
|
102
|
+
infer_schema_length:
|
|
103
|
+
The maximum number of rows to scan for inferring schema.
|
|
104
|
+
If set to `None`, the full data may be scanned.
|
|
105
|
+
The rows used for schema inference are stored in memory,
|
|
106
|
+
so large values can lead to high memory usage.
|
|
107
|
+
Only applies if the `output` parameter is not set for the given column.
|
|
108
|
+
|
|
109
|
+
Examples:
|
|
110
|
+
Reading from a SQL query against a user-supplied connection:
|
|
111
|
+
```python
|
|
112
|
+
query = "SELECT key, value FROM tbl"
|
|
113
|
+
chain = dc.read_database(query, connection, output={"value": float})
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Load data from a SQLAlchemy driver/engine:
|
|
117
|
+
```python
|
|
118
|
+
from sqlalchemy import create_engine
|
|
119
|
+
engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
|
|
120
|
+
chain = dc.read_database("select * from tbl", engine)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Load data from a parameterized SQLAlchemy query:
|
|
124
|
+
```python
|
|
125
|
+
query = "SELECT key, value FROM tbl WHERE value > :value"
|
|
126
|
+
dc.read_database(query, engine, params={"value": 50})
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
Notes:
|
|
130
|
+
This function works with a variety of databases — including, but not limited to,
|
|
131
|
+
SQLite, DuckDB, PostgreSQL, and Snowflake, provided the appropriate driver is
|
|
132
|
+
installed.
|
|
133
|
+
"""
|
|
134
|
+
from datachain.lib.dc.records import read_records
|
|
135
|
+
|
|
136
|
+
output = output or {}
|
|
137
|
+
if isinstance(query, str):
|
|
138
|
+
query = sqlalchemy.text(query)
|
|
139
|
+
kw = {"execution_options": {"stream_results": True}} # use server-side cursors
|
|
140
|
+
with _connect(connection) as conn, conn.execute(query, params, **kw) as result:
|
|
141
|
+
cols = result.keys()
|
|
142
|
+
to_infer = [k for k in cols if k not in output] # preserve the order
|
|
143
|
+
rows, inferred_schema = _infer_schema(result, to_infer, infer_schema_length)
|
|
144
|
+
records = (row._asdict() for row in itertools.chain(rows, result))
|
|
145
|
+
return read_records(
|
|
146
|
+
records,
|
|
147
|
+
session=session,
|
|
148
|
+
settings=settings,
|
|
149
|
+
in_memory=in_memory,
|
|
150
|
+
schema=inferred_schema | output,
|
|
151
|
+
)
|