datachain 0.28.2__tar.gz → 0.29.0__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.28.2 → datachain-0.29.0}/PKG-INFO +1 -1
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/data_storage/warehouse.py +2 -1
- datachain-0.29.0/src/datachain/lib/dc/database.py +330 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/datachain.py +104 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/signal_schema.py +2 -2
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain.egg-info/PKG-INFO +1 -1
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/examples/test_examples.py +0 -1
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_hf.py +0 -1
- datachain-0.29.0/tests/func/test_to_database.py +778 -0
- datachain-0.28.2/src/datachain/lib/dc/database.py +0 -153
- {datachain-0.28.2 → datachain-0.29.0}/.cruft.json +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.gitattributes +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.github/codecov.yaml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.github/dependabot.yml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.github/workflows/release.yml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.gitignore +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/.pre-commit-config.yaml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/LICENSE +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/README.rst +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/commands/auth/login.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/commands/auth/logout.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/commands/auth/team.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/commands/auth/token.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/commands/index.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/commands/job/cancel.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/commands/job/clusters.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/commands/job/logs.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/commands/job/ls.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/commands/job/run.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/contributing.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/examples.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/guide/db_migrations.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/guide/delta.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/guide/env.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/guide/index.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/guide/namespaces.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/guide/processing.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/guide/remotes.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/guide/retry.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/index.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/overrides/main.html +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/quick-start.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/datachain.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/func.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/index.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/toolkit.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/torch.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/references/udf.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/docs/tutorials.md +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/mkdocs.yml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/noxfile.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/pyproject.toml +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/setup.cfg +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/__main__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/asyn.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cache.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/client/local.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/config.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/dataset.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/delta.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/error.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/func/array.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/func/base.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/func/func.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/func/path.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/func/random.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/func/string.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/func/window.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/job.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/audio.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/file.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/projects.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/listing.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/namespace.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/node.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/progress.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/project.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/py.typed +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/query/dataset.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/query/params.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/query/session.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/semver.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/studio.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain/utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/conftest.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/data.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/examples/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/functions/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/functions/test_array.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/functions/test_path.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/functions/test_random.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/functions/test_string.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_audio.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_batching.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_client.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_datachain.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_delta.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_file.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_image.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_listing.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_ls.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_metastore.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_pull.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_query.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_read_database.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_retry.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_session.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_studio_datetime_parsing.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_video.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/test_atomicity.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/test_cli_studio.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/test_import_time.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/test_telemetry.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_audio.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_settings.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_client.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_config.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_func.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_query.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_semver.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_session.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.28.2 → datachain-0.29.0}/tests/utils.py +0 -0
|
@@ -21,6 +21,7 @@ from datachain.lib.file import File
|
|
|
21
21
|
from datachain.lib.signal_schema import SignalSchema
|
|
22
22
|
from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
|
|
23
23
|
from datachain.query.batch import RowsOutput
|
|
24
|
+
from datachain.query.schema import ColumnMeta
|
|
24
25
|
from datachain.query.utils import get_query_id_column
|
|
25
26
|
from datachain.sql.functions import path as pathfunc
|
|
26
27
|
from datachain.sql.types import Int, SQLType
|
|
@@ -400,7 +401,7 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
400
401
|
expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
|
|
401
402
|
sa.func.count(table.c.sys__id),
|
|
402
403
|
)
|
|
403
|
-
size_column_names = [
|
|
404
|
+
size_column_names = [ColumnMeta.to_db_name(s) + "__size" for s in file_signals]
|
|
404
405
|
size_columns = [c for c in table.columns if c.name in size_column_names]
|
|
405
406
|
|
|
406
407
|
if size_columns:
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import itertools
|
|
3
|
+
import os
|
|
4
|
+
import sqlite3
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
6
|
+
|
|
7
|
+
import sqlalchemy
|
|
8
|
+
|
|
9
|
+
from datachain.query.schema import ColumnMeta
|
|
10
|
+
|
|
11
|
+
DEFAULT_DATABASE_BATCH_SIZE = 10_000
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from collections.abc import Iterator, Mapping, Sequence
|
|
15
|
+
|
|
16
|
+
import sqlalchemy.orm # noqa: TC004
|
|
17
|
+
|
|
18
|
+
from datachain.lib.data_model import DataType
|
|
19
|
+
from datachain.query import Session
|
|
20
|
+
|
|
21
|
+
from .datachain import DataChain
|
|
22
|
+
|
|
23
|
+
ConnectionType = Union[
|
|
24
|
+
str,
|
|
25
|
+
sqlalchemy.engine.URL,
|
|
26
|
+
sqlalchemy.engine.interfaces.Connectable,
|
|
27
|
+
sqlalchemy.engine.Engine,
|
|
28
|
+
sqlalchemy.engine.Connection,
|
|
29
|
+
sqlalchemy.orm.Session,
|
|
30
|
+
sqlite3.Connection,
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@contextlib.contextmanager
def _connect(
    connection: "ConnectionType",
) -> "Iterator[sqlalchemy.engine.Connection]":
    """Yield a SQLAlchemy connection for any supported ``connection`` input.

    Anything opened here (an engine built from a string/URL, a connection
    checked out of an engine) is released when the context exits. Objects
    handed in by the caller (a ``Connection``, a ``sqlite3.Connection``, a
    session bound to a connection) are left open for the caller to manage.

    Raises:
        TypeError: If ``connection`` is not one of the supported types.
    """
    import sqlalchemy.orm

    # Any non-empty value of the environment variable turns SQL echoing on.
    echo_sql = bool(os.environ.get("DEBUG_SHOW_SQL_QUERIES"))

    with contextlib.ExitStack() as cleanup:
        if isinstance(connection, (str, sqlalchemy.URL)):
            # We created the engine, so we dispose of it on exit as well.
            owned_engine = sqlalchemy.create_engine(connection, echo=echo_sql)
            cleanup.callback(owned_engine.dispose)
            active = cleanup.enter_context(owned_engine.connect())
        elif isinstance(connection, sqlite3.Connection):
            wrapper_engine = sqlalchemy.create_engine(
                "sqlite://", creator=lambda: connection, echo=echo_sql
            )
            # Not registered for cleanup: the sqlite3 connection belongs to
            # the caller and must stay open.
            active = wrapper_engine.connect()
        elif isinstance(connection, sqlalchemy.Engine):
            active = cleanup.enter_context(connection.connect())
        elif isinstance(connection, sqlalchemy.Connection):
            # Caller-managed connection, used as is.
            active = connection
        elif isinstance(connection, sqlalchemy.orm.Session):
            # Sessions don't support DDL operations directly, so work on the
            # underlying bind, which is either an Engine or a Connection.
            bound = connection.get_bind()
            if isinstance(bound, sqlalchemy.Engine):
                active = cleanup.enter_context(bound.connect())
            else:
                active = bound
        else:
            raise TypeError(f"Unsupported connection type: {type(connection).__name__}")

        yield active
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def to_database(
    chain: "DataChain",
    table_name: str,
    connection: "ConnectionType",
    *,
    batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
    on_conflict: Optional[str] = None,
    column_mapping: Optional[dict[str, Optional[str]]] = None,
) -> None:
    """
    Implementation function for exporting DataChain to database tables.

    This is the core implementation that handles the actual database operations.
    For user-facing documentation, see DataChain.to_database() method.

    Args:
        chain: Chain whose non-system signals are exported.
        table_name: Target table; created if it does not exist yet.
        connection: Anything accepted by ``_connect``.
        batch_rows: Number of rows sent per insert statement.
        on_conflict: ``None``, ``"ignore"`` or ``"update"``.
        column_mapping: Optional rename/skip mapping keyed by column name.

    Raises:
        ValueError: If ``on_conflict`` is set to an unsupported value.

    All batches are written in one transaction that is committed at the end.
    If anything fails, the pending work is rolled back, the table is dropped
    again if this call created it, and the original error is re-raised.
    """
    from datachain.utils import batched

    if on_conflict and on_conflict not in ("ignore", "update"):
        raise ValueError(
            f"on_conflict must be 'ignore' or 'update', got: {on_conflict}"
        )

    signals_schema = chain.signals_schema.clone_without_sys_signals()
    all_columns = [
        sqlalchemy.Column(c.name, c.type)  # type: ignore[union-attr]
        for c in signals_schema.db_signals(as_columns=True)
    ]

    column_mapping = column_mapping or {}
    normalized_column_mapping = _normalize_column_mapping(column_mapping)
    column_indices_and_names, columns = _prepare_columns(
        all_columns, normalized_column_mapping
    )

    with _connect(connection) as conn:
        metadata = sqlalchemy.MetaData()
        table = sqlalchemy.Table(table_name, metadata, *columns)

        # Check if table already exists to determine if we should clean up on error.
        inspector = sqlalchemy.inspect(conn)
        assert inspector  # to satisfy mypy
        table_existed_before = table_name in inspector.get_table_names()

        try:
            table.create(conn, checkfirst=True)
            rows_iter = chain._leaf_values()
            for batch in batched(rows_iter, batch_rows):
                _process_batch(
                    conn, table, batch, on_conflict, column_indices_and_names
                )
            conn.commit()
        except Exception:
            # Roll back before any cleanup. Some databases (e.g. PostgreSQL)
            # reject every further statement once one has failed inside a
            # transaction, which would make the drop below fail silently.
            # It also keeps half-written batches from lingering on a
            # caller-owned connection where a later commit could persist them.
            with contextlib.suppress(sqlalchemy.exc.SQLAlchemyError):
                conn.rollback()
            if not table_existed_before:
                # Best effort only: the original error is what matters.
                with contextlib.suppress(sqlalchemy.exc.SQLAlchemyError):
                    table.drop(conn, checkfirst=True)
                    conn.commit()
            raise
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _normalize_column_mapping(
    column_mapping: dict[str, Optional[str]],
) -> dict[str, Optional[str]]:
    """
    Re-key a column mapping from DataChain names to database column names.

    Every key is passed through ``ColumnMeta.to_db_name`` so callers can write
    {"nested_data.value": "data_value"} rather than
    {"nested_data__value": "data_value"}. Values are left untouched.

    An empty mapping yields a plain empty dict. A mapping that exposes
    ``default_factory`` (e.g. a ``defaultdict``) is returned as a
    ``defaultdict`` with the same factory.

    Raises:
        ValueError: If two different keys translate to the same database name.
    """
    if not column_mapping:
        return {}

    first_seen: dict[str, str] = {}  # database name -> key that produced it
    converted: dict[str, Optional[str]] = {}
    for source_name, target_name in column_mapping.items():
        db_name = ColumnMeta.to_db_name(source_name)
        if db_name in first_seen:
            earlier = first_seen[db_name]
            raise ValueError(
                "Column mapping collision: multiple keys map to the same "
                f"database column name '{db_name}': '{earlier}' and '{source_name}'. "
            )
        first_seen[db_name] = source_name
        converted[db_name] = target_name

    if not hasattr(column_mapping, "default_factory"):
        return converted

    # Keep the caller's fallback behaviour for keys that were never listed.
    from collections import defaultdict

    with_default: dict[str, Optional[str]] = defaultdict(
        column_mapping.default_factory
    )
    with_default.update(converted)
    return with_default
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _prepare_columns(all_columns, column_mapping):
    """Select and rename the columns that will be written.

    Returns a pair ``(column_indices_and_names, columns)``: the first is a
    list of ``(source_index, target_name)`` tuples, the second the matching
    column definitions. A column whose mapped name is falsy (e.g. ``None``)
    is left out of both lists; a column without a mapping entry is kept
    under its own name.
    """
    # A mapping with ``default_factory`` answers for every column name.
    answers_for_all = hasattr(column_mapping, "default_factory")

    kept_columns = []
    index_name_pairs = []
    for position, column in enumerate(all_columns):
        if column.name not in column_mapping and not answers_for_all:
            kept_columns.append(column)
            index_name_pairs.append((position, column.name))
            continue

        target_name = column_mapping[column.name]
        if not target_name:
            continue  # explicitly skipped column
        kept_columns.append(sqlalchemy.Column(target_name, column.type))
        index_name_pairs.append((position, target_name))

    return index_name_pairs, kept_columns
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _process_batch(conn, table, batch, on_conflict, column_indices_and_names):
    """Insert one batch of rows, applying ``on_conflict`` where supported.

    Each row is turned into a ``{target_name: value}`` dict using the
    ``(index, target_name)`` pairs. Conflict handling is only applied when
    ``conn.engine.name`` is ``"postgresql"`` or ``"sqlite"``; for any other
    engine a ``UserWarning`` is issued and a plain insert is executed.
    """
    payload = []
    for values in batch:
        payload.append(
            {name: values[position] for position, name in column_indices_and_names}
        )

    # The engine name only matters when conflict handling was requested.
    dialect = conn.engine.name if on_conflict else None

    if dialect == "postgresql":
        from sqlalchemy.dialects.postgresql import insert as dialect_insert

        statement = dialect_insert(table)
    elif dialect == "sqlite":
        from sqlalchemy.dialects.sqlite import insert as dialect_insert

        statement = dialect_insert(table)
    else:
        statement = table.insert()
        if on_conflict:
            import warnings

            warnings.warn(
                f"Database does not support conflict resolution. "
                f"Ignoring on_conflict='{on_conflict}' parameter.",
                UserWarning,
                stacklevel=2,
            )
        conn.execute(statement, payload)
        return

    if on_conflict == "ignore":
        statement = statement.on_conflict_do_nothing()
    elif on_conflict == "update":
        # NOTE(review): no conflict target (index_elements / constraint) is
        # passed here; PostgreSQL's ON CONFLICT DO UPDATE appears to require
        # one -- confirm against the SQLAlchemy dialect documentation.
        overwrite = {}
        for column in table.columns:
            overwrite[column.name] = statement.excluded[column.name]
        statement = statement.on_conflict_do_update(set_=overwrite)

    conn.execute(statement, payload)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def read_database(
    query: Union[str, "sqlalchemy.sql.expression.Executable"],
    connection: "ConnectionType",
    params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
    *,
    output: Optional["dict[str, DataType]"] = None,
    session: Optional["Session"] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    infer_schema_length: Optional[int] = 100,
) -> "DataChain":
    """
    Run a SQL query over the given database connection and load the result
    rows into a DataChain.

    Args:
        query:
            The SQL query to execute, either as a raw SQL string or as a
            SQLAlchemy `Executable` object.
        connection: SQLAlchemy connectable, str, or a sqlite3 connection
            Going through SQLAlchemy allows any database supported by that
            library. For DBAPI2 objects only sqlite3 is supported. Engines and
            connections passed in remain the caller's responsibility to
            dispose of or close; connections given as str are closed
            automatically.
        params: Parameters to pass to execute method.
        output: A dictionary mapping column names to types, used to override the
            schema inferred from the query results.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        in_memory: If True, creates an in-memory session. Defaults to False.
        infer_schema_length:
            The maximum number of rows to scan for inferring schema.
            If set to `None`, the full data may be scanned.
            The rows used for schema inference are kept in memory,
            so large values can lead to high memory usage.
            Only applies to columns that are not listed in `output`.

    Examples:
        Reading from a SQL query against a user-supplied connection:
        ```python
        query = "SELECT key, value FROM tbl"
        chain = dc.read_database(query, connection, output={"value": float})
        ```

        Load data from a SQLAlchemy driver/engine:
        ```python
        from sqlalchemy import create_engine
        engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
        chain = dc.read_database("select * from tbl", engine)
        ```

        Load data from a parameterized SQLAlchemy query:
        ```python
        query = "SELECT key, value FROM tbl WHERE value > :value"
        dc.read_database(query, engine, params={"value": 50})
        ```

    Notes:
        - This function works with a variety of databases — including,
          but not limited to, SQLite, DuckDB, PostgreSQL, and Snowflake,
          provided the appropriate driver is installed.
        - This call is blocking, and will execute the query and return once the
          results are saved.
    """
    from datachain.lib.dc.records import read_records

    overrides = output or {}
    statement = sqlalchemy.text(query) if isinstance(query, str) else query
    streaming = {"stream_results": True}  # use server-side cursors

    with _connect(connection) as conn:
        with conn.execute(statement, params, execution_options=streaming) as result:
            # Keep the result's column order for the columns left to infer.
            missing = [name for name in result.keys() if name not in overrides]
            sampled, inferred = _infer_schema(result, missing, infer_schema_length)
            # Rows consumed for inference are replayed ahead of the rest.
            records = (
                row._asdict() for row in itertools.chain(sampled, result)
            )
            return read_records(
                records,
                session=session,
                settings=settings,
                in_memory=in_memory,
                schema=inferred | overrides,
            )
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _infer_schema(
    result: "sqlalchemy.engine.Result",
    to_infer: list[str],
    infer_schema_length: Optional[int] = 100,
) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
    """Infer a schema for ``to_infer`` from the first rows of ``result``.

    At most ``infer_schema_length`` rows are read from ``result`` (all of
    them when it is ``None``) and handed to ``values_to_tuples``. The rows
    that were read are returned alongside the schema so the caller can still
    use them. With nothing to infer, ``result`` is not touched and
    ``([], {})`` is returned.
    """
    from datachain.lib.convert.values_to_tuples import values_to_tuples

    if not to_infer:
        return [], {}

    sample = list(itertools.islice(result, infer_schema_length))

    per_column = {}
    for name in to_infer:
        per_column[name] = [record._mapping[name] for record in sample]

    _unused_first, schema, _unused_last = values_to_tuples("", **per_column)
    return sample, schema
|
|
@@ -58,6 +58,7 @@ from datachain.query.schema import DEFAULT_DELIMITER, Column
|
|
|
58
58
|
from datachain.sql.functions import path as pathfunc
|
|
59
59
|
from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
|
|
60
60
|
|
|
61
|
+
from .database import DEFAULT_DATABASE_BATCH_SIZE
|
|
61
62
|
from .utils import (
|
|
62
63
|
DatasetMergeError,
|
|
63
64
|
DatasetPrepareError,
|
|
@@ -77,11 +78,23 @@ UDFObjT = TypeVar("UDFObjT", bound=UDFBase)
|
|
|
77
78
|
DEFAULT_PARQUET_CHUNK_SIZE = 100_000
|
|
78
79
|
|
|
79
80
|
if TYPE_CHECKING:
|
|
81
|
+
import sqlite3
|
|
82
|
+
|
|
80
83
|
import pandas as pd
|
|
81
84
|
from typing_extensions import ParamSpec, Self
|
|
82
85
|
|
|
83
86
|
P = ParamSpec("P")
|
|
84
87
|
|
|
88
|
+
ConnectionType = Union[
|
|
89
|
+
str,
|
|
90
|
+
sqlalchemy.engine.URL,
|
|
91
|
+
sqlalchemy.engine.interfaces.Connectable,
|
|
92
|
+
sqlalchemy.engine.Engine,
|
|
93
|
+
sqlalchemy.engine.Connection,
|
|
94
|
+
"sqlalchemy.orm.Session",
|
|
95
|
+
sqlite3.Connection,
|
|
96
|
+
]
|
|
97
|
+
|
|
85
98
|
|
|
86
99
|
T = TypeVar("T", bound="DataChain")
|
|
87
100
|
|
|
@@ -2276,6 +2289,97 @@ class DataChain:
|
|
|
2276
2289
|
"""
|
|
2277
2290
|
self.to_json(path, fs_kwargs, include_outer_list=False)
|
|
2278
2291
|
|
|
2292
|
+
def to_database(
    self,
    table_name: str,
    connection: "ConnectionType",
    *,
    batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
    on_conflict: Optional[str] = None,
    column_mapping: Optional[dict[str, Optional[str]]] = None,
) -> None:
    """Save chain to a database table using a given database connection.

    This method exports all DataChain records to a database table, creating the
    table if it doesn't exist and appending data if it does. The table schema
    is automatically inferred from the DataChain's signal schema (system
    signals are not exported).

    If the export fails and the table was created by this call, the table is
    dropped again (best effort) before the error is re-raised.

    Parameters:
        table_name: Name of the database table to create/write to.
        connection: SQLAlchemy connectable, str, or a sqlite3 connection
            Using SQLAlchemy makes it possible to use any DB supported by that
            library. If a DBAPI2 object, only sqlite3 is supported. The user is
            responsible for engine disposal and connection closure for the
            SQLAlchemy connectable; str connections are closed automatically.
        batch_rows: Number of rows to insert per batch for optimal performance.
            Larger batches are faster but use more memory. Default: 10,000.
        on_conflict: Strategy for handling duplicate rows (requires table
            constraints):
            - None: Raise error (`sqlalchemy.exc.IntegrityError`) on conflict
              (default)
            - "ignore": Skip duplicate rows silently
            - "update": Update existing rows with new values

            Conflict handling is applied for PostgreSQL and SQLite only; for
            other databases a `UserWarning` is issued and the option is
            ignored.
        column_mapping: Optional mapping to rename or skip columns:
            - Dict mapping DataChain column names to database column names
            - Set values to None to skip columns entirely, or use `defaultdict` to
              skip all columns except those specified.

            Keys may use the dotted DataChain form (e.g. `"user.id"`).

    Raises:
        ValueError: If `on_conflict` is neither "ignore" nor "update", or if
            two `column_mapping` keys resolve to the same database column.

    Examples:
        Basic usage with PostgreSQL:
        ```py
        import sqlalchemy as sa
        import datachain as dc

        chain = dc.read_storage("s3://my-bucket/")
        engine = sa.create_engine("postgresql://user:pass@localhost/mydb")
        chain.to_database("files_table", engine)
        ```

        Using SQLite with connection string:
        ```py
        chain.to_database("my_table", "sqlite:///data.db")
        ```

        Column mapping and renaming:
        ```py
        mapping = {
            "user.id": "id",
            "user.name": "name",
            "user.password": None  # Skip this column
        }
        chain.to_database("users", engine, column_mapping=mapping)
        ```

        Handling conflicts (requires PRIMARY KEY or UNIQUE constraints):
        ```py
        # Skip duplicates
        chain.to_database("my_table", engine, on_conflict="ignore")

        # Update existing records
        chain.to_database("my_table", engine, on_conflict="update")
        ```

        Working with different databases:
        ```py
        # MySQL
        mysql_engine = sa.create_engine("mysql+pymysql://user:pass@host/db")
        chain.to_database("mysql_table", mysql_engine)

        # SQLite in-memory
        # NOTE(review): a string connection is opened and disposed inside the
        # call, so an in-memory database created this way is presumably gone
        # once the call returns -- pass an Engine or Connection to keep it.
        chain.to_database("temp_table", "sqlite:///:memory:")
        ```
    """
    # Imported lazily; the implementation lives in the sibling `database` module.
    from .database import to_database

    to_database(
        self,
        table_name,
        connection,
        batch_rows=batch_rows,
        on_conflict=on_conflict,
        column_mapping=column_mapping,
    )
|
|
2382
|
+
|
|
2279
2383
|
@classmethod
|
|
2280
2384
|
def from_records(
|
|
2281
2385
|
cls,
|
|
@@ -34,7 +34,7 @@ from datachain.lib.data_model import DataModel, DataType, DataValue
|
|
|
34
34
|
from datachain.lib.file import File
|
|
35
35
|
from datachain.lib.model_store import ModelStore
|
|
36
36
|
from datachain.lib.utils import DataChainParamsError
|
|
37
|
-
from datachain.query.schema import DEFAULT_DELIMITER, Column
|
|
37
|
+
from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
|
|
38
38
|
from datachain.sql.types import SQLType
|
|
39
39
|
|
|
40
40
|
if TYPE_CHECKING:
|
|
@@ -590,7 +590,7 @@ class SignalSchema:
|
|
|
590
590
|
|
|
591
591
|
if name:
|
|
592
592
|
if "." in name:
|
|
593
|
-
name =
|
|
593
|
+
name = ColumnMeta.to_db_name(name)
|
|
594
594
|
|
|
595
595
|
signals = [
|
|
596
596
|
s
|
|
@@ -306,6 +306,7 @@ tests/func/test_read_dataset_version_specifiers.py
|
|
|
306
306
|
tests/func/test_retry.py
|
|
307
307
|
tests/func/test_session.py
|
|
308
308
|
tests/func/test_studio_datetime_parsing.py
|
|
309
|
+
tests/func/test_to_database.py
|
|
309
310
|
tests/func/test_toolkit.py
|
|
310
311
|
tests/func/test_video.py
|
|
311
312
|
tests/func/test_warehouse.py
|
|
@@ -119,7 +119,6 @@ def test_hf_image(tmp_path):
|
|
|
119
119
|
assert row.image.img == image_to_bytes(img)
|
|
120
120
|
|
|
121
121
|
|
|
122
|
-
@pytest.mark.skip("fails with 'NotImplementedError', need to investigate")
|
|
123
122
|
@require_torchcodec
|
|
124
123
|
def test_hf_audio(tmp_path):
|
|
125
124
|
# See https://stackoverflow.com/questions/66191480/how-to-convert-a-numpy-array-to-a-mp3-file
|