datachain 0.14.5.tar.gz → 0.15.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datachain-0.14.5 → datachain-0.15.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.14.5/src/datachain.egg-info → datachain-0.15.0}/PKG-INFO +1 -1
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/datachain.md +4 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/multimodal/wds_filtered.py +1 -1
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/__init__.py +4 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/catalog/catalog.py +10 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/catalog/loader.py +11 -7
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/values_to_tuples.py +23 -14
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/__init__.py +4 -1
- datachain-0.15.0/src/datachain/lib/dc/database.py +151 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/datachain.py +15 -5
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/datasets.py +43 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/pandas.py +8 -1
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/records.py +12 -14
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/signal_schema.py +10 -1
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/dataset.py +10 -12
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/dispatch.py +7 -2
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/schema.py +4 -1
- {datachain-0.14.5 → datachain-0.15.0/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain.egg-info/SOURCES.txt +2 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/conftest.py +4 -1
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_catalog.py +3 -3
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_datachain.py +28 -4
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_dataset_query.py +0 -60
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_hidden_field.py +1 -1
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_pull.py +9 -7
- datachain-0.15.0/tests/func/test_read_database.py +175 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/test_import_time.py +1 -1
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_datachain.py +83 -2
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_feature_utils.py +0 -5
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_catalog_loader.py +21 -10
- {datachain-0.14.5 → datachain-0.15.0}/.cruft.json +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.gitattributes +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.github/codecov.yaml +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.github/dependabot.yml +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.github/workflows/release.yml +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/.gitignore +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/LICENSE +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/README.rst +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/contributing.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/examples.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/index.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/overrides/main.html +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/quick-start.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/func.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/index.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/remotes.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/toolkit.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/torch.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/references/udf.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/docs/tutorials.md +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/mkdocs.yml +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/noxfile.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/pyproject.toml +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/setup.cfg +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/__main__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/asyn.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cache.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/local.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/config.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/dataset.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/error.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/array.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/base.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/func.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/path.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/random.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/string.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/window.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/job.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/file.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/listing.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/node.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/progress.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/py.typed +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/params.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/session.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/studio.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain/utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/data.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/examples/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_client.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_file.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_hf.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_image.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_listing.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_ls.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_query.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_session.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_video.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/test_atomicity.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/test_cli_studio.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/test_telemetry.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_client.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_config.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_func.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_query.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_session.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.14.5 → datachain-0.15.0}/tests/utils.py +0 -0
{datachain-0.14.5 → datachain-0.15.0}/docs/references/datachain.md

@@ -31,6 +31,10 @@ for examples of how to create a chain.
 
 ::: datachain.lib.dc.values.read_values
 
+::: datachain.lib.dc.database.read_database
+
+::: datachain.lib.dc.database.ConnectionType
+
 ::: datachain.lib.dc.DataChain
 
 ::: datachain.lib.utils.DataChainError
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/__init__.py

@@ -5,8 +5,10 @@ from datachain.lib.dc import (
     DataChain,
     Sys,
     datasets,
+    delete_dataset,
     listings,
     read_csv,
+    read_database,
     read_dataset,
     read_hf,
     read_json,

@@ -61,11 +63,13 @@ __all__ = [
     "VideoFragment",
     "VideoFrame",
     "datasets",
+    "delete_dataset",
     "is_chain_type",
     "listings",
     "metrics",
     "param",
     "read_csv",
+    "read_database",
     "read_dataset",
     "read_hf",
     "read_json",
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/catalog/catalog.py

@@ -1299,7 +1299,17 @@ class Catalog:
         name: str,
         version: Optional[int] = None,
         force: Optional[bool] = False,
+        studio: Optional[bool] = False,
     ):
+        from datachain.remote.studio import StudioClient
+
+        if studio:
+            client = StudioClient()
+            response = client.rm_dataset(name, version=version, force=force)
+            if not response.ok:
+                raise DataChainError(response.message)
+            return
+
         dataset = self.get_dataset(name)
         if not version and not force:
             raise ValueError(f"Missing dataset version from input for dataset {name}")
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/catalog/loader.py

@@ -1,4 +1,5 @@
 import os
+import sys
 from importlib import import_module
 from typing import TYPE_CHECKING, Any, Optional
 

@@ -15,6 +16,7 @@ METASTORE_ARG_PREFIX = "DATACHAIN_METASTORE_ARG_"
 WAREHOUSE_SERIALIZED = "DATACHAIN__WAREHOUSE"
 WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
 WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
+DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
 DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
 
 IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"

@@ -100,19 +102,21 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
     return warehouse_class(**warehouse_args)
 
 
-def get_udf_distributor_class() -> type["AbstractUDFDistributor"]:
-    distributed_import_path
+def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
+    if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
+        return None
 
-    if not distributed_import_path:
-        raise RuntimeError(
-            f"{DISTRIBUTED_IMPORT_PATH} import path is required "
-            "for distributed UDF processing."
-        )
     # Distributed class paths are specified as (for example): module.classname
     if "." not in distributed_import_path:
         raise RuntimeError(
             f"Invalid {DISTRIBUTED_IMPORT_PATH} import path: {distributed_import_path}"
         )
+
+    # Optional: set the Python path to look for the module
+    distributed_import_pythonpath = os.environ.get(DISTRIBUTED_IMPORT_PYTHONPATH)
+    if distributed_import_pythonpath and distributed_import_pythonpath not in sys.path:
+        sys.path.insert(0, distributed_import_pythonpath)
+
     module_name, _, class_name = distributed_import_path.rpartition(".")
     distributed = import_module(module_name)
     return getattr(distributed, class_name)
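The new `DATACHAIN_DISTRIBUTED_PYTHONPATH` variable lets the distributor module live outside the default import path. A minimal sketch of how the two environment variables fit together; the plugin module, class name, and directory below are hypothetical:

```python
import os

# Hypothetical plugin: /opt/plugins/my_distributor.py defines MyUDFDistributor,
# a subclass of the project's AbstractUDFDistributor.
os.environ["DATACHAIN_DISTRIBUTED"] = "my_distributor.MyUDFDistributor"
os.environ["DATACHAIN_DISTRIBUTED_PYTHONPATH"] = "/opt/plugins"

from datachain.catalog.loader import get_udf_distributor_class

# /opt/plugins is prepended to sys.path, my_distributor is imported, and the
# class is returned. With DATACHAIN_DISTRIBUTED unset this now returns None
# instead of raising.
udf_distributor_class = get_udf_distributor_class()
```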
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/values_to_tuples.py

@@ -1,5 +1,6 @@
+import itertools
 from collections.abc import Sequence
-from typing import Any, Union
+from typing import Any, Optional, Union
 
 from datachain.lib.data_model import (
     DataType,

@@ -66,21 +67,29 @@ def values_to_tuples( # noqa: C901, PLR0912
                     f"signal '{k}' is not present in the output",
                 )
         else:
-
-
-
-
-
-
-                raise ValuesToTupleError(
-                    ds_name,
-                    f"signal '{k}' has unsupported type '{typ.__name__}'."
-                    f" Please use DataModel types: {DataTypeNames}",
+            # FIXME: Stops as soon as it finds the first non-None value.
+            # If a non-None value appears early, it won't check the remaining items for
+            # `None` values.
+            try:
+                pos, first_not_none_element = next(
+                    itertools.dropwhile(lambda pair: pair[1] is None, enumerate(v))
                 )
-
-
+            except StopIteration:
+                typ = str  # default to str if all values are None or has length 0
+                nullable = True
             else:
-
+                nullable = pos > 0
+                typ = type(first_not_none_element)  # type: ignore[assignment]
+                if not is_chain_type(typ):
+                    raise ValuesToTupleError(
+                        ds_name,
+                        f"signal '{k}' has unsupported type '{typ.__name__}'."
+                        f" Please use DataModel types: {DataTypeNames}",
+                    )
+                if isinstance(first_not_none_element, list):
+                    typ = list[type(first_not_none_element[0])]  # type: ignore[assignment, misc]
+
+            types_map[k] = Optional[typ] if nullable else typ  # type: ignore[assignment]
 
         if length < 0:
             length = len_
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/__init__.py

@@ -1,6 +1,7 @@
 from .csv import read_csv
+from .database import read_database
 from .datachain import C, Column, DataChain
-from .datasets import datasets, read_dataset
+from .datasets import datasets, delete_dataset, read_dataset
 from .hf import read_hf
 from .json import read_json
 from .listings import listings

@@ -19,8 +20,10 @@ __all__ = [
     "DatasetPrepareError",
     "Sys",
     "datasets",
+    "delete_dataset",
     "listings",
     "read_csv",
+    "read_database",
     "read_dataset",
     "read_hf",
     "read_json",
datachain-0.15.0/src/datachain/lib/dc/database.py

@@ -0,0 +1,151 @@
+import contextlib
+import itertools
+import os
+import sqlite3
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import sqlalchemy
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator, Mapping, Sequence
+
+    import sqlalchemy.orm  # noqa: TC004
+
+    from datachain.lib.data_model import DataType
+    from datachain.query import Session
+
+    from .datachain import DataChain
+
+ConnectionType = Union[
+    str,
+    sqlalchemy.engine.URL,
+    sqlalchemy.engine.interfaces.Connectable,
+    sqlalchemy.engine.Engine,
+    sqlalchemy.engine.Connection,
+    sqlalchemy.orm.Session,
+    sqlite3.Connection,
+]
+
+
+@contextlib.contextmanager
+def _connect(
+    connection: "ConnectionType",
+) -> "Iterator[Union[sqlalchemy.engine.Connection, sqlalchemy.orm.Session]]":
+    import sqlalchemy.orm
+
+    with contextlib.ExitStack() as stack:
+        engine_kwargs = {"echo": bool(os.environ.get("DEBUG_SHOW_SQL_QUERIES"))}
+        if isinstance(connection, (str, sqlalchemy.URL)):
+            engine = sqlalchemy.create_engine(connection, **engine_kwargs)
+            stack.callback(engine.dispose)
+            yield stack.enter_context(engine.connect())
+        elif isinstance(connection, sqlite3.Connection):
+            engine = sqlalchemy.create_engine(
+                "sqlite://", creator=lambda: connection, **engine_kwargs
+            )
+            # do not close the connection, as it is managed by the caller
+            yield engine.connect()
+        elif isinstance(connection, sqlalchemy.Engine):
+            yield stack.enter_context(connection.connect())
+        elif isinstance(connection, (sqlalchemy.Connection, sqlalchemy.orm.Session)):
+            # do not close the connection, as it is managed by the caller
+            yield connection
+        else:
+            raise TypeError(f"Unsupported connection type: {type(connection).__name__}")
+
+
+def _infer_schema(
+    result: "sqlalchemy.engine.Result",
+    to_infer: list[str],
+    infer_schema_length: Optional[int] = 100,
+) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
+    from datachain.lib.convert.values_to_tuples import values_to_tuples
+
+    if not to_infer:
+        return [], {}
+
+    rows = list(itertools.islice(result, infer_schema_length))
+    values = {col: [row._mapping[col] for row in rows] for col in to_infer}
+    _, output_schema, _ = values_to_tuples("", **values)
+    return rows, output_schema
+
+
+def read_database(
+    query: Union[str, "sqlalchemy.sql.expression.Executable"],
+    connection: "ConnectionType",
+    params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
+    *,
+    output: Optional["dict[str, DataType]"] = None,
+    session: Optional["Session"] = None,
+    settings: Optional[dict] = None,
+    in_memory: bool = False,
+    infer_schema_length: Optional[int] = 100,
+) -> "DataChain":
+    """
+    Read the results of a SQL query into a DataChain, using a given database connection.
+
+    Args:
+        query:
+            The SQL query to execute. Can be a raw SQL string or a SQLAlchemy
+            `Executable` object.
+        connection: SQLAlchemy connectable, str, or a sqlite3 connection.
+            Using SQLAlchemy makes it possible to use any DB supported by that
+            library. If a DBAPI2 object, only sqlite3 is supported. The user is
+            responsible for engine disposal and connection closure for the
+            SQLAlchemy connectable; str connections are closed automatically.
+        params: Parameters to pass to execute method.
+        output: A dictionary mapping column names to types, used to override the
+            schema inferred from the query results.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
+        in_memory: If True, creates an in-memory session. Defaults to False.
+        infer_schema_length:
+            The maximum number of rows to scan for inferring schema.
+            If set to `None`, the full data may be scanned.
+            The rows used for schema inference are stored in memory,
+            so large values can lead to high memory usage.
+            Only applies if the `output` parameter is not set for the given column.
+
+    Examples:
+        Reading from a SQL query against a user-supplied connection:
+        ```python
+        query = "SELECT key, value FROM tbl"
+        chain = dc.read_database(query, connection, output={"value": float})
+        ```
+
+        Load data from a SQLAlchemy driver/engine:
+        ```python
+        from sqlalchemy import create_engine
+        engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
+        chain = dc.read_database("select * from tbl", engine)
+        ```
+
+        Load data from a parameterized SQLAlchemy query:
+        ```python
+        query = "SELECT key, value FROM tbl WHERE value > :value"
+        dc.read_database(query, engine, params={"value": 50})
+        ```
+
+    Notes:
+        This function works with a variety of databases, including, but not limited to,
+        SQLite, DuckDB, PostgreSQL, and Snowflake, provided the appropriate driver is
+        installed.
+    """
+    from datachain.lib.dc.records import read_records
+
+    output = output or {}
+    if isinstance(query, str):
+        query = sqlalchemy.text(query)
+    kw = {"execution_options": {"stream_results": True}}  # use server-side cursors
+    with _connect(connection) as conn, conn.execute(query, params, **kw) as result:
+        cols = result.keys()
+        to_infer = [k for k in cols if k not in output]  # preserve the order
+        rows, inferred_schema = _infer_schema(result, to_infer, infer_schema_length)
+        records = (row._asdict() for row in itertools.chain(rows, result))
+        return read_records(
+            records,
+            session=session,
+            settings=settings,
+            in_memory=in_memory,
+            schema=inferred_schema | output,
+        )
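A brief sketch of the sqlite3 branch of `ConnectionType`, which the docstring examples above do not cover; the database file, table, and column names are made up:

```python
import sqlite3

import datachain as dc

# read_database() accepts a plain sqlite3 connection; per the docstring, the
# caller keeps ownership of it, so it is closed explicitly here.
conn = sqlite3.connect("example.db")
try:
    chain = dc.read_database("SELECT id, name, score FROM results", conn)
    chain.show()
finally:
    conn.close()
```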
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/datachain.py

@@ -133,7 +133,7 @@ class DataChain:
                 .choices[0]
                 .message.content,
             )
-            .
+            .persist()
         )
 
         try:

@@ -443,9 +443,20 @@ class DataChain:
         )
         return listings(*args, **kwargs)
 
+    def persist(self) -> "Self":
+        """Saves temporary chain that will be removed after the process ends.
+        Temporary datasets are useful for optimization, for example when we have
+        multiple chains starting with identical sub-chain. We can then persist that
+        common chain and use it to calculate other chains, to avoid re-calculation
+        every time.
+        It returns the chain itself.
+        """
+        schema = self.signals_schema.clone_without_sys_signals().serialize()
+        return self._evolve(query=self._query.save(feature_schema=schema))
+
     def save(  # type: ignore[override]
         self,
-        name:
+        name: str,
         version: Optional[int] = None,
         description: Optional[str] = None,
         labels: Optional[list[str]] = None,

@@ -454,8 +465,7 @@ class DataChain:
         """Save to a Dataset. It returns the chain itself.
 
         Parameters:
-            name : dataset name.
-                removed after process ends. Temp dataset are useful for optimization.
+            name : dataset name.
             version : version of a dataset. Default - the last version that exist.
             description : description of a dataset.
             labels : labels of a dataset.

@@ -1112,7 +1122,7 @@ class DataChain:
         if self._query.attached:
             chain = self
         else:
-            chain = self.
+            chain = self.persist()
         assert chain.name is not None  # for mypy
         return PytorchDataset(
             chain.name,
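A sketch of the optimization the `persist()` docstring describes: materialize a shared prefix once, then derive several chains from it. The storage URI and the derived operations are illustrative:

```python
import datachain as dc

# The shared sub-chain is saved to a temporary dataset that is removed after
# the process ends; both derived chains start from the persisted result
# instead of re-computing the common prefix.
base = dc.read_storage("s3://mybucket/images/").persist()

sample = base.limit(10)
total = base.count()
```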
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/datasets.py

@@ -166,3 +166,46 @@ def datasets(
         output={column: DatasetInfo},
         **{column: datasets_values},  # type: ignore[arg-type]
     )
+
+
+def delete_dataset(
+    name: str,
+    version: Optional[int] = None,
+    force: Optional[bool] = False,
+    studio: Optional[bool] = False,
+    session: Optional[Session] = None,
+    in_memory: bool = False,
+) -> None:
+    """Removes specific dataset version or all dataset versions, depending on
+    a force flag.
+
+    Args:
+        name : Dataset name
+        version : Optional dataset version
+        force: If true, all datasets versions will be removed. Defaults to False.
+        studio: If True, removes dataset from Studio only,
+            otherwise remove from local. Defaults to False.
+        session: Optional session instance. If not provided, uses default session.
+        in_memory: If True, creates an in-memory session. Defaults to False.
+
+    Returns: None
+
+    Example:
+        ```py
+        import datachain as dc
+        dc.delete_dataset("cats")
+        ```
+
+        ```py
+        import datachain as dc
+        dc.delete_dataset("cats", version=1)
+        ```
+    """
+
+    session = Session.get(session, in_memory=in_memory)
+    catalog = session.catalog
+    if not force:
+        version = version or catalog.get_dataset(name).latest_version
+    else:
+        version = None
+    catalog.remove_dataset(name, version=version, force=force, studio=studio)
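One more usage variant implied by the flags above; the dataset name follows the docstring example:

```python
import datachain as dc

# force=True removes every version of the local dataset;
# studio=True would instead remove it from Studio only.
dc.delete_dataset("cats", force=True)
```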
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/pandas.py

@@ -37,7 +37,14 @@ def read_pandas( # type: ignore[override]
     """
     from .utils import DatasetPrepareError
 
-
+    def get_col_name(col):
+        if isinstance(col, tuple):
+            # Join tuple elements with underscore for MultiIndex columns
+            return "_".join(map(str, col)).lower()
+        # Handle regular string column names
+        return str(col).lower()
+
+    fr_map = {get_col_name(col): df[col].tolist() for col in df.columns}
 
     for c in fr_map:
         if not c.isidentifier():
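A short sketch of what the new column-name handling means for MultiIndex frames; the data and labels are made up:

```python
import pandas as pd

import datachain as dc

# MultiIndex labels are joined with "_" and lowercased, so ("Price", "USD")
# becomes the signal name "price_usd"; plain string labels are just lowercased.
df = pd.DataFrame(
    [[1.0, 0.9], [2.0, 1.8]],
    columns=pd.MultiIndex.from_tuples([("Price", "USD"), ("Price", "EUR")]),
)
chain = dc.read_pandas(df)
chain.show()
```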
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/records.py

@@ -1,8 +1,5 @@
-from
-
-    Optional,
-    Union,
-)
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Optional, Union
 
 import sqlalchemy
 

@@ -12,6 +9,7 @@ from datachain.lib.file import (
 )
 from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
+from datachain.query.schema import Column
 
 if TYPE_CHECKING:
     from typing_extensions import ParamSpec

@@ -22,7 +20,7 @@ if TYPE_CHECKING:
 
 
 def read_records(
-    to_insert: Optional[Union[dict,
+    to_insert: Optional[Union[dict, Iterable[dict]]],
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     in_memory: bool = False,

@@ -54,10 +52,11 @@ def read_records(
 
     if schema:
         signal_schema = SignalSchema(schema)
-        columns = [
-
-
-
+        columns = []
+        for c in signal_schema.db_signals(as_columns=True):
+            assert isinstance(c, Column)
+            kw = {"nullable": c.nullable} if c.nullable is not None else {}
+            columns.append(sqlalchemy.Column(c.name, c.type, **kw))
     else:
         columns = [
             sqlalchemy.Column(name, typ)

@@ -83,8 +82,7 @@ def read_records(
 
     warehouse = catalog.warehouse
     dr = warehouse.dataset_rows(dsr)
-
-
-
-        db.execute(insert_q.values(**record))
+    table = dr.get_table()
+    warehouse.insert_rows(table, to_insert)
+    warehouse.insert_rows_done(table)
     return read_dataset(name=dsr.name, session=session, settings=settings)
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/signal_schema.py

@@ -581,7 +581,11 @@ class SignalSchema:
         signals = [
             DEFAULT_DELIMITER.join(path)
             if not as_columns
-            else Column(
+            else Column(
+                DEFAULT_DELIMITER.join(path),
+                python_to_sql(_type),
+                nullable=is_optional(_type),
+            )
             for path, _type, has_subtree, _ in self.get_flat_tree(
                 include_hidden=include_hidden
             )

@@ -990,3 +994,8 @@ class SignalSchema:
         }
 
         return SignalSchema.deserialize(schema)
+
+
+def is_optional(type_: Any) -> bool:
+    """Check if a type is Optional."""
+    return get_origin(type_) is Union and type(None) in get_args(type_)
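A quick usage sketch of the helper added above; the behavior follows directly from its definition:

```python
from typing import Optional, Union

from datachain.lib.signal_schema import is_optional

assert is_optional(Optional[int]) is True   # Union[int, None]
assert is_optional(Union[str, None]) is True
assert is_optional(int) is False            # no None in the union
```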
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/dataset.py

@@ -437,9 +437,17 @@ class UDFStep(Step, ABC):
                "distributed processing."
            )
 
-        from datachain.catalog.loader import
+        from datachain.catalog.loader import (
+            DISTRIBUTED_IMPORT_PATH,
+            get_udf_distributor_class,
+        )
+
+        if not (udf_distributor_class := get_udf_distributor_class()):
+            raise RuntimeError(
+                f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+                "for distributed UDF processing."
+            )
 
-        udf_distributor_class = get_udf_distributor_class()
         udf_distributor = udf_distributor_class(
             catalog=catalog,
             table=udf_table,

@@ -1162,16 +1170,6 @@ class DatasetQuery:
             )
         return sqlalchemy.table(table_name)
 
-    @staticmethod
-    def delete(
-        name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
-    ) -> None:
-        from datachain.catalog import get_catalog
-
-        catalog = catalog or get_catalog()
-        version = version or catalog.get_dataset(name).latest_version
-        catalog.remove_dataset(name, version)
-
     @property
     def attached(self) -> bool:
         """
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/dispatch.py

@@ -13,7 +13,7 @@ from multiprocess import get_context
 
 from datachain.catalog import Catalog
 from datachain.catalog.catalog import clone_catalog_with_cache
-from datachain.catalog.loader import get_udf_distributor_class
+from datachain.catalog.loader import DISTRIBUTED_IMPORT_PATH, get_udf_distributor_class
 from datachain.lib.udf import _get_cache
 from datachain.query.batch import RowsOutput, RowsOutputBatch
 from datachain.query.dataset import (

@@ -91,7 +91,12 @@ def udf_entrypoint() -> int:
 
 
 def udf_worker_entrypoint() -> int:
-
+    if not (udf_distributor_class := get_udf_distributor_class()):
+        raise RuntimeError(
+            f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+            "for distributed UDF processing."
+        )
+    return udf_distributor_class.run_worker()
 
 
 class UDFDispatcher:
{datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/schema.py

@@ -40,12 +40,15 @@ class ColumnMeta(type):
 class Column(sa.ColumnClause, metaclass=ColumnMeta):
     inherit_cache: Optional[bool] = True
 
-    def __init__(
+    def __init__(
+        self, text, type_=None, is_literal=False, nullable=None, _selectable=None
+    ):
         """Dataset column."""
         self.name = ColumnMeta.to_db_name(text)
         super().__init__(
             self.name, type_=type_, is_literal=is_literal, _selectable=_selectable
         )
+        self.nullable = nullable
 
     def __getattr__(self, name: str):
         return Column(self.name + DEFAULT_DELIMITER + name)
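A small usage sketch of the extended constructor; the SQLAlchemy type here is arbitrary:

```python
import sqlalchemy as sa

from datachain.query.schema import Column

# nullable is simply stored on the column object, so schema code (see
# SignalSchema.db_signals above) can propagate Optional-ness to warehouse tables.
col = Column("file__size", sa.Integer(), nullable=True)
print(col.name, col.nullable)  # file__size True
```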
{datachain-0.14.5 → datachain-0.15.0}/src/datachain.egg-info/SOURCES.txt

@@ -168,6 +168,7 @@ src/datachain/lib/convert/unflatten.py
 src/datachain/lib/convert/values_to_tuples.py
 src/datachain/lib/dc/__init__.py
 src/datachain/lib/dc/csv.py
+src/datachain/lib/dc/database.py
 src/datachain/lib/dc/datachain.py
 src/datachain/lib/dc/datasets.py
 src/datachain/lib/dc/hf.py

@@ -267,6 +268,7 @@ tests/func/test_metrics.py
 tests/func/test_pull.py
 tests/func/test_pytorch.py
 tests/func/test_query.py
+tests/func/test_read_database.py
 tests/func/test_session.py
 tests/func/test_toolkit.py
 tests/func/test_video.py
{datachain-0.14.5 → datachain-0.15.0}/tests/conftest.py

@@ -631,10 +631,13 @@ def dataset_rows():
 
 
 @pytest.fixture
-def
+def studio_token():
     with Config(ConfigLevel.GLOBAL).edit() as conf:
         conf["studio"] = {"token": "isat_access_token", "team": "team_name"}
 
+
+@pytest.fixture
+def studio_datasets(requests_mock, studio_token):
     common_version_info = {
         "status": 1,
         "created_at": "2024-02-23T10:42:31.842944+00:00",