datachain 0.28.1__tar.gz → 0.29.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- {datachain-0.28.1 → datachain-0.29.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.28.1 → datachain-0.29.0}/PKG-INFO +1 -1
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/warehouse.py +2 -1
- datachain-0.29.0/src/datachain/lib/dc/database.py +330 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/datachain.py +140 -13
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/records.py +4 -2
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/settings.py +23 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/signal_schema.py +2 -2
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/udf.py +27 -4
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/dataset.py +18 -20
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/utils.py +37 -22
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/PKG-INFO +1 -1
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/SOURCES.txt +2 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/examples/test_examples.py +0 -1
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_datachain.py +19 -0
- datachain-0.29.0/tests/func/test_to_database.py +778 -0
- datachain-0.29.0/tests/unit/lib/test_settings.py +61 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_utils.py +19 -0
- datachain-0.28.1/src/datachain/lib/dc/database.py +0 -153
- {datachain-0.28.1 → datachain-0.29.0}/.cruft.json +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.gitattributes +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.github/codecov.yaml +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.github/dependabot.yml +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.github/workflows/release.yml +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/.gitignore +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/LICENSE +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/README.rst +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/commands/auth/login.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/commands/auth/logout.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/commands/auth/team.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/commands/auth/token.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/commands/index.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/commands/job/cancel.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/commands/job/clusters.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/commands/job/logs.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/commands/job/ls.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/commands/job/run.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/contributing.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/examples.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/guide/db_migrations.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/guide/delta.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/guide/env.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/guide/index.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/guide/namespaces.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/guide/processing.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/guide/remotes.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/guide/retry.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/index.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/overrides/main.html +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/quick-start.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/datachain.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/func.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/index.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/toolkit.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/torch.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/references/udf.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/docs/tutorials.md +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/mkdocs.yml +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/noxfile.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/pyproject.toml +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/setup.cfg +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/__main__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/asyn.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cache.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/local.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/config.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/dataset.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/delta.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/error.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/array.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/base.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/func.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/path.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/random.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/string.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/window.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/job.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/audio.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/file.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/projects.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/listing.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/namespace.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/node.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/progress.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/project.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/py.typed +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/params.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/session.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/semver.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/studio.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/conftest.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/data.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/examples/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_array.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_path.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_random.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_string.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_audio.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_batching.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_client.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_delta.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_file.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_hf.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_image.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_listing.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_ls.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_metastore.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_pull.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_query.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_read_database.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_retry.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_session.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_studio_datetime_parsing.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_video.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/test_atomicity.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/test_cli_studio.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/test_import_time.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/test_telemetry.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_audio.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_client.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_config.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_func.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_query.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_semver.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_session.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.28.1 → datachain-0.29.0}/tests/utils.py +0 -0
{datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/warehouse.py

@@ -21,6 +21,7 @@ from datachain.lib.file import File
 from datachain.lib.signal_schema import SignalSchema
 from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
 from datachain.query.batch import RowsOutput
+from datachain.query.schema import ColumnMeta
 from datachain.query.utils import get_query_id_column
 from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
@@ -400,7 +401,7 @@ class AbstractWarehouse(ABC, Serializable):
         expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
             sa.func.count(table.c.sys__id),
         )
-        size_column_names = [
+        size_column_names = [ColumnMeta.to_db_name(s) + "__size" for s in file_signals]
         size_columns = [c for c in table.columns if c.name in size_column_names]

         if size_columns:
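As a quick illustration of the new size-column lookup (a sketch, not part of the diff; the signal names are hypothetical and it assumes `ColumnMeta.to_db_name` flattens dotted signal names with double underscores, as the column-mapping docs further below imply):

```python
# Hypothetical signal names; the replace() call stands in for the assumed
# ColumnMeta.to_db_name behaviour of joining nested names with "__".
file_signals = ["file", "laion.file"]
size_column_names = [s.replace(".", "__") + "__size" for s in file_signals]
print(size_column_names)  # ['file__size', 'laion__file__size']
```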
datachain-0.29.0/src/datachain/lib/dc/database.py

@@ -0,0 +1,330 @@
+import contextlib
+import itertools
+import os
+import sqlite3
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import sqlalchemy
+
+from datachain.query.schema import ColumnMeta
+
+DEFAULT_DATABASE_BATCH_SIZE = 10_000
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator, Mapping, Sequence
+
+    import sqlalchemy.orm  # noqa: TC004
+
+    from datachain.lib.data_model import DataType
+    from datachain.query import Session
+
+    from .datachain import DataChain
+
+ConnectionType = Union[
+    str,
+    sqlalchemy.engine.URL,
+    sqlalchemy.engine.interfaces.Connectable,
+    sqlalchemy.engine.Engine,
+    sqlalchemy.engine.Connection,
+    sqlalchemy.orm.Session,
+    sqlite3.Connection,
+]
+
+
+@contextlib.contextmanager
+def _connect(
+    connection: "ConnectionType",
+) -> "Iterator[sqlalchemy.engine.Connection]":
+    import sqlalchemy.orm
+
+    with contextlib.ExitStack() as stack:
+        engine_kwargs = {"echo": bool(os.environ.get("DEBUG_SHOW_SQL_QUERIES"))}
+        if isinstance(connection, (str, sqlalchemy.URL)):
+            engine = sqlalchemy.create_engine(connection, **engine_kwargs)
+            stack.callback(engine.dispose)
+            yield stack.enter_context(engine.connect())
+        elif isinstance(connection, sqlite3.Connection):
+            engine = sqlalchemy.create_engine(
+                "sqlite://", creator=lambda: connection, **engine_kwargs
+            )
+            # do not close the connection, as it is managed by the caller
+            yield engine.connect()
+        elif isinstance(connection, sqlalchemy.Engine):
+            yield stack.enter_context(connection.connect())
+        elif isinstance(connection, sqlalchemy.Connection):
+            # do not close the connection, as it is managed by the caller
+            yield connection
+        elif isinstance(connection, sqlalchemy.orm.Session):
+            # For Session objects, get the underlying bind (Engine or Connection)
+            # Sessions don't support DDL operations directly
+            bind = connection.get_bind()
+            if isinstance(bind, sqlalchemy.Engine):
+                yield stack.enter_context(bind.connect())
+            else:
+                # bind is already a Connection
+                yield bind
+        else:
+            raise TypeError(f"Unsupported connection type: {type(connection).__name__}")
+
+
+def to_database(
+    chain: "DataChain",
+    table_name: str,
+    connection: "ConnectionType",
+    *,
+    batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
+    on_conflict: Optional[str] = None,
+    column_mapping: Optional[dict[str, Optional[str]]] = None,
+) -> None:
+    """
+    Implementation function for exporting DataChain to database tables.
+
+    This is the core implementation that handles the actual database operations.
+    For user-facing documentation, see DataChain.to_database() method.
+    """
+    from datachain.utils import batched
+
+    if on_conflict and on_conflict not in ("ignore", "update"):
+        raise ValueError(
+            f"on_conflict must be 'ignore' or 'update', got: {on_conflict}"
+        )
+
+    signals_schema = chain.signals_schema.clone_without_sys_signals()
+    all_columns = [
+        sqlalchemy.Column(c.name, c.type)  # type: ignore[union-attr]
+        for c in signals_schema.db_signals(as_columns=True)
+    ]
+
+    column_mapping = column_mapping or {}
+    normalized_column_mapping = _normalize_column_mapping(column_mapping)
+    column_indices_and_names, columns = _prepare_columns(
+        all_columns, normalized_column_mapping
+    )
+
+    with _connect(connection) as conn:
+        metadata = sqlalchemy.MetaData()
+        table = sqlalchemy.Table(table_name, metadata, *columns)
+
+        # Check if table already exists to determine if we should clean up on error.
+        inspector = sqlalchemy.inspect(conn)
+        assert inspector  # to satisfy mypy
+        table_existed_before = table_name in inspector.get_table_names()
+
+        try:
+            table.create(conn, checkfirst=True)
+            rows_iter = chain._leaf_values()
+            for batch in batched(rows_iter, batch_rows):
+                _process_batch(
+                    conn, table, batch, on_conflict, column_indices_and_names
+                )
+            conn.commit()
+        except Exception:
+            if not table_existed_before:
+                try:
+                    table.drop(conn, checkfirst=True)
+                    conn.commit()
+                except sqlalchemy.exc.SQLAlchemyError:
+                    pass
+            raise
+
+
+def _normalize_column_mapping(
+    column_mapping: dict[str, Optional[str]],
+) -> dict[str, Optional[str]]:
+    """
+    Convert column mapping keys from DataChain format (dots) to database format
+    (double underscores).
+
+    This allows users to specify column mappings using the intuitive DataChain
+    format like: {"nested_data.value": "data_value"} instead of
+    {"nested_data__value": "data_value"}
+    """
+    if not column_mapping:
+        return {}
+
+    normalized_mapping: dict[str, Optional[str]] = {}
+    original_keys: dict[str, str] = {}
+    for key, value in column_mapping.items():
+        db_key = ColumnMeta.to_db_name(key)
+        if db_key in normalized_mapping:
+            prev = original_keys[db_key]
+            raise ValueError(
+                "Column mapping collision: multiple keys map to the same "
+                f"database column name '{db_key}': '{prev}' and '{key}'. "
+            )
+        normalized_mapping[db_key] = value
+        original_keys[db_key] = key
+
+    # If it's a defaultdict, preserve the default factory
+    if hasattr(column_mapping, "default_factory"):
+        from collections import defaultdict
+
+        default_factory = column_mapping.default_factory
+        result: dict[str, Optional[str]] = defaultdict(default_factory)
+        result.update(normalized_mapping)
+        return result
+
+    return normalized_mapping
+
+
+def _prepare_columns(all_columns, column_mapping):
+    """Prepare column mapping and column definitions."""
+    column_indices_and_names = []  # List of (index, target_name) tuples
+    columns = []
+    for idx, col in enumerate(all_columns):
+        if col.name in column_mapping or hasattr(column_mapping, "default_factory"):
+            mapped_name = column_mapping[col.name]
+            if mapped_name:
+                columns.append(sqlalchemy.Column(mapped_name, col.type))
+                column_indices_and_names.append((idx, mapped_name))
+        else:
+            columns.append(col)
+            column_indices_and_names.append((idx, col.name))
+    return column_indices_and_names, columns
+
+
+def _process_batch(conn, table, batch, on_conflict, column_indices_and_names):
+    """Process a batch of rows with conflict resolution."""
+
+    def prepare_row(row_values):
+        """Convert a row tuple to a dictionary with proper DB column names."""
+        return {
+            target_name: row_values[idx]
+            for idx, target_name in column_indices_and_names
+        }
+
+    rows_to_insert = [prepare_row(row) for row in batch]
+
+    supports_conflict = on_conflict and conn.engine.name in ("postgresql", "sqlite")
+
+    if supports_conflict:
+        # Use dialect-specific insert for conflict resolution
+        if conn.engine.name == "postgresql":
+            from sqlalchemy.dialects.postgresql import insert as pg_insert
+
+            insert_stmt = pg_insert(table)
+        elif conn.engine.name == "sqlite":
+            from sqlalchemy.dialects.sqlite import insert as sqlite_insert
+
+            insert_stmt = sqlite_insert(table)
+    else:
+        insert_stmt = table.insert()
+
+    if supports_conflict:
+        if on_conflict == "ignore":
+            insert_stmt = insert_stmt.on_conflict_do_nothing()
+        elif on_conflict == "update":
+            update_values = {
+                col.name: insert_stmt.excluded[col.name] for col in table.columns
+            }
+            insert_stmt = insert_stmt.on_conflict_do_update(set_=update_values)
+    elif on_conflict:
+        import warnings
+
+        warnings.warn(
+            f"Database does not support conflict resolution. "
+            f"Ignoring on_conflict='{on_conflict}' parameter.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    conn.execute(insert_stmt, rows_to_insert)
+
+
+def read_database(
+    query: Union[str, "sqlalchemy.sql.expression.Executable"],
+    connection: "ConnectionType",
+    params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
+    *,
+    output: Optional["dict[str, DataType]"] = None,
+    session: Optional["Session"] = None,
+    settings: Optional[dict] = None,
+    in_memory: bool = False,
+    infer_schema_length: Optional[int] = 100,
+) -> "DataChain":
+    """
+    Read the results of a SQL query into a DataChain, using a given database connection.
+
+    Args:
+        query:
+            The SQL query to execute. Can be a raw SQL string or a SQLAlchemy
+            `Executable` object.
+        connection: SQLAlchemy connectable, str, or a sqlite3 connection
+            Using SQLAlchemy makes it possible to use any DB supported by that
+            library. If a DBAPI2 object, only sqlite3 is supported. The user is
+            responsible for engine disposal and connection closure for the
+            SQLAlchemy connectable; str connections are closed automatically.
+        params: Parameters to pass to execute method.
+        output: A dictionary mapping column names to types, used to override the
+            schema inferred from the query results.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
+        in_memory: If True, creates an in-memory session. Defaults to False.
+        infer_schema_length:
+            The maximum number of rows to scan for inferring schema.
+            If set to `None`, the full data may be scanned.
+            The rows used for schema inference are stored in memory,
+            so large values can lead to high memory usage.
+            Only applies if the `output` parameter is not set for the given column.
+
+    Examples:
+        Reading from a SQL query against a user-supplied connection:
+        ```python
+        query = "SELECT key, value FROM tbl"
+        chain = dc.read_database(query, connection, output={"value": float})
+        ```
+
+        Load data from a SQLAlchemy driver/engine:
+        ```python
+        from sqlalchemy import create_engine
+        engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
+        chain = dc.read_database("select * from tbl", engine)
+        ```
+
+        Load data from a parameterized SQLAlchemy query:
+        ```python
+        query = "SELECT key, value FROM tbl WHERE value > :value"
+        dc.read_database(query, engine, params={"value": 50})
+        ```
+
+    Notes:
+        - This function works with a variety of databases — including,
+          but not limited to, SQLite, DuckDB, PostgreSQL, and Snowflake,
+          provided the appropriate driver is installed.
+        - This call is blocking, and will execute the query and return once the
+          results are saved.
+    """
+    from datachain.lib.dc.records import read_records
+
+    output = output or {}
+    if isinstance(query, str):
+        query = sqlalchemy.text(query)
+    kw = {"execution_options": {"stream_results": True}}  # use server-side cursors
+    with _connect(connection) as conn, conn.execute(query, params, **kw) as result:
+        cols = result.keys()
+        to_infer = [k for k in cols if k not in output]  # preserve the order
+        rows, inferred_schema = _infer_schema(result, to_infer, infer_schema_length)
+        records = (row._asdict() for row in itertools.chain(rows, result))
+        return read_records(
+            records,
+            session=session,
+            settings=settings,
+            in_memory=in_memory,
+            schema=inferred_schema | output,
+        )
+
+
+def _infer_schema(
+    result: "sqlalchemy.engine.Result",
+    to_infer: list[str],
+    infer_schema_length: Optional[int] = 100,
+) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
+    from datachain.lib.convert.values_to_tuples import values_to_tuples
+
+    if not to_infer:
+        return [], {}
+
+    rows = list(itertools.islice(result, infer_schema_length))
+    values = {col: [row._mapping[col] for row in rows] for col in to_infer}
+    _, output_schema, _ = values_to_tuples("", **values)
+    return rows, output_schema
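Taken together, a minimal round-trip sketch of this new module (not part of the diff; it assumes `read_values` and `read_database` are re-exported at the package top level, as the docstrings above do with `dc.read_database(...)`, and uses a throwaway SQLite file):

```python
import sqlalchemy as sa
import datachain as dc

engine = sa.create_engine("sqlite:///example.db")

# Export a small chain; the target table is created from the chain's signal schema.
dc.read_values(key=["a", "b"], value=[1, 2]).to_database("metrics", engine)

# Read it back; `output` overrides the inferred type for one column.
chain = dc.read_database(
    "SELECT key, value FROM metrics", engine, output={"value": float}
)
```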
{datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/datachain.py

@@ -58,6 +58,7 @@ from datachain.query.schema import DEFAULT_DELIMITER, Column
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import batched_it, inside_notebook, row_to_nested_dict

+from .database import DEFAULT_DATABASE_BATCH_SIZE
 from .utils import (
     DatasetMergeError,
     DatasetPrepareError,
@@ -77,11 +78,23 @@ UDFObjT = TypeVar("UDFObjT", bound=UDFBase)
 DEFAULT_PARQUET_CHUNK_SIZE = 100_000

 if TYPE_CHECKING:
+    import sqlite3
+
     import pandas as pd
     from typing_extensions import ParamSpec, Self

     P = ParamSpec("P")

+    ConnectionType = Union[
+        str,
+        sqlalchemy.engine.URL,
+        sqlalchemy.engine.interfaces.Connectable,
+        sqlalchemy.engine.Engine,
+        sqlalchemy.engine.Connection,
+        "sqlalchemy.orm.Session",
+        sqlite3.Connection,
+    ]
+

 T = TypeVar("T", bound="DataChain")

@@ -324,6 +337,7 @@ class DataChain:
         sys: Optional[bool] = None,
         namespace: Optional[str] = None,
         project: Optional[str] = None,
+        batch_rows: Optional[int] = None,
     ) -> "Self":
         """Change settings for chain.

@@ -331,22 +345,24 @@ class DataChain:
         It returns chain, so, it can be chained later with next operation.

         Parameters:
-            cache : data caching (default=False)
+            cache : data caching. (default=False)
             parallel : number of thread for processors. True is a special value to
-                enable all available CPUs (default=1)
+                enable all available CPUs. (default=1)
             workers : number of distributed workers. Only for Studio mode. (default=1)
-            min_task_size : minimum number of tasks (default=1)
-            prefetch: number of workers to use for downloading files in advance.
+            min_task_size : minimum number of tasks. (default=1)
+            prefetch : number of workers to use for downloading files in advance.
                 This is enabled by default and uses 2 workers.
                 To disable prefetching, set it to 0.
-            namespace: namespace name.
-            project: project name.
+            namespace : namespace name.
+            project : project name.
+            batch_rows : row limit per insert to balance speed and memory usage.
+                (default=2000)

         Example:
             ```py
             chain = (
                 chain
-                .settings(cache=True, parallel=8)
+                .settings(cache=True, parallel=8, batch_rows=300)
                 .map(laion=process_webdataset(spec=WDSLaion), params="file")
             )
             ```
@@ -356,7 +372,14 @@ class DataChain:
         settings = copy.copy(self._settings)
         settings.add(
             Settings(
-                cache,
+                cache,
+                parallel,
+                workers,
+                min_task_size,
+                prefetch,
+                namespace,
+                project,
+                batch_rows,
             )
         )
         return self._evolve(settings=settings, _sys=sys)
@@ -711,7 +734,7 @@ class DataChain:

         return self._evolve(
             query=self._query.add_signals(
-                udf_obj.to_udf_wrapper(),
+                udf_obj.to_udf_wrapper(self._settings.batch_rows),
                 **self._settings.to_dict(),
             ),
             signal_schema=self.signals_schema | udf_obj.output,
@@ -749,7 +772,7 @@ class DataChain:
         udf_obj.prefetch = prefetch
         return self._evolve(
             query=self._query.generate(
-                udf_obj.to_udf_wrapper(),
+                udf_obj.to_udf_wrapper(self._settings.batch_rows),
                 **self._settings.to_dict(),
             ),
             signal_schema=udf_obj.output,
@@ -885,7 +908,7 @@ class DataChain:
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         return self._evolve(
             query=self._query.generate(
-                udf_obj.to_udf_wrapper(),
+                udf_obj.to_udf_wrapper(self._settings.batch_rows),
                 partition_by=processed_partition_by,
                 **self._settings.to_dict(),
             ),
@@ -917,11 +940,24 @@ class DataChain:
             )
             chain.save("new_dataset")
             ```
+
+        .. deprecated:: 0.29.0
+            This method is deprecated and will be removed in a future version.
+            Use `agg()` instead, which provides the similar functionality.
         """
+        import warnings
+
+        warnings.warn(
+            "batch_map() is deprecated and will be removed in a future version. "
+            "Use agg() instead, which provides the similar functionality.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
+
         return self._evolve(
             query=self._query.add_signals(
-                udf_obj.to_udf_wrapper(batch),
+                udf_obj.to_udf_wrapper(self._settings.batch_rows, batch=batch),
                 **self._settings.to_dict(),
             ),
             signal_schema=self.signals_schema | udf_obj.output,
@@ -2253,6 +2289,97 @@ class DataChain:
         """
         self.to_json(path, fs_kwargs, include_outer_list=False)

+    def to_database(
+        self,
+        table_name: str,
+        connection: "ConnectionType",
+        *,
+        batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
+        on_conflict: Optional[str] = None,
+        column_mapping: Optional[dict[str, Optional[str]]] = None,
+    ) -> None:
+        """Save chain to a database table using a given database connection.
+
+        This method exports all DataChain records to a database table, creating the
+        table if it doesn't exist and appending data if it does. The table schema
+        is automatically inferred from the DataChain's signal schema.
+
+        Parameters:
+            table_name: Name of the database table to create/write to.
+            connection: SQLAlchemy connectable, str, or a sqlite3 connection
+                Using SQLAlchemy makes it possible to use any DB supported by that
+                library. If a DBAPI2 object, only sqlite3 is supported. The user is
+                responsible for engine disposal and connection closure for the
+                SQLAlchemy connectable; str connections are closed automatically.
+            batch_rows: Number of rows to insert per batch for optimal performance.
+                Larger batches are faster but use more memory. Default: 10,000.
+            on_conflict: Strategy for handling duplicate rows (requires table
+                constraints):
+                - None: Raise error (`sqlalchemy.exc.IntegrityError`) on conflict
+                  (default)
+                - "ignore": Skip duplicate rows silently
+                - "update": Update existing rows with new values
+            column_mapping: Optional mapping to rename or skip columns:
+                - Dict mapping DataChain column names to database column names
+                - Set values to None to skip columns entirely, or use `defaultdict` to
+                  skip all columns except those specified.
+
+        Examples:
+            Basic usage with PostgreSQL:
+            ```py
+            import sqlalchemy as sa
+            import datachain as dc
+
+            chain = dc.read_storage("s3://my-bucket/")
+            engine = sa.create_engine("postgresql://user:pass@localhost/mydb")
+            chain.to_database("files_table", engine)
+            ```
+
+            Using SQLite with connection string:
+            ```py
+            chain.to_database("my_table", "sqlite:///data.db")
+            ```
+
+            Column mapping and renaming:
+            ```py
+            mapping = {
+                "user.id": "id",
+                "user.name": "name",
+                "user.password": None  # Skip this column
+            }
+            chain.to_database("users", engine, column_mapping=mapping)
+            ```
+
+            Handling conflicts (requires PRIMARY KEY or UNIQUE constraints):
+            ```py
+            # Skip duplicates
+            chain.to_database("my_table", engine, on_conflict="ignore")
+
+            # Update existing records
+            chain.to_database("my_table", engine, on_conflict="update")
+            ```
+
+            Working with different databases:
+            ```py
+            # MySQL
+            mysql_engine = sa.create_engine("mysql+pymysql://user:pass@host/db")
+            chain.to_database("mysql_table", mysql_engine)
+
+            # SQLite in-memory
+            chain.to_database("temp_table", "sqlite:///:memory:")
+            ```
+        """
+        from .database import to_database
+
+        to_database(
+            self,
+            table_name,
+            connection,
+            batch_rows=batch_rows,
+            on_conflict=on_conflict,
+            column_mapping=column_mapping,
+        )
+
     @classmethod
     def from_records(
         cls,
@@ -2340,7 +2467,7 @@ class DataChain:
     def setup(self, **kwargs) -> "Self":
         """Setup variables to pass to UDF functions.

-        Use before running map/gen/agg
+        Use before running map/gen/agg to save an object and pass it as an
         argument to the UDF.

         The value must be a callable (a `lambda: <value>` syntax can be used to quickly
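One behaviour the `to_database` docstring describes but does not show in code is the `defaultdict` form of `column_mapping`; a hedged sketch (the column names are hypothetical):

```python
from collections import defaultdict

# With a default of None every column is skipped unless explicitly mapped,
# so only the two renamed columns end up in the target table.
keep_only = defaultdict(lambda: None, {
    "file.path": "path",
    "file.size": "size_bytes",
})
chain.to_database("files_slim", engine, column_mapping=keep_only)
```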
{datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/records.py

@@ -15,6 +15,8 @@ if TYPE_CHECKING:

     P = ParamSpec("P")

+READ_RECORDS_BATCH_SIZE = 10000
+

 def read_records(
     to_insert: Optional[Union[dict, Iterable[dict]]],
@@ -41,7 +43,7 @@ def read_records(
     Notes:
         This call blocks until all records are inserted.
     """
-    from datachain.query.dataset import
+    from datachain.query.dataset import adjust_outputs, get_col_types
     from datachain.sql.types import SQLType
     from datachain.utils import batched

@@ -94,7 +96,7 @@ def read_records(
         {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
     )
     records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
-    for chunk in batched(records,
+    for chunk in batched(records, READ_RECORDS_BATCH_SIZE):
         warehouse.insert_rows(table, chunk)
     warehouse.insert_rows_done(table)
     return read_dataset(name=dsr.full_name, session=session, settings=settings)