datachain 0.28.1__tar.gz → 0.28.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.28.1 → datachain-0.28.2}/.pre-commit-config.yaml +1 -1
- {datachain-0.28.1 → datachain-0.28.2}/PKG-INFO +1 -1
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/datachain.py +36 -13
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/records.py +4 -2
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/settings.py +23 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/udf.py +27 -4
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/query/dataset.py +18 -20
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/utils.py +37 -22
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain.egg-info/PKG-INFO +1 -1
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_datachain.py +19 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_hf.py +1 -0
- datachain-0.28.2/tests/unit/lib/test_settings.py +61 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_utils.py +19 -0
- {datachain-0.28.1 → datachain-0.28.2}/.cruft.json +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.gitattributes +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.github/codecov.yaml +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.github/dependabot.yml +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.github/workflows/release.yml +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.github/workflows/tests.yml +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/.gitignore +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/LICENSE +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/README.rst +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/assets/datachain.svg +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/commands/auth/login.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/commands/auth/logout.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/commands/auth/team.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/commands/auth/token.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/commands/index.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/commands/job/cancel.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/commands/job/clusters.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/commands/job/logs.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/commands/job/ls.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/commands/job/run.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/contributing.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/examples.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/guide/db_migrations.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/guide/delta.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/guide/env.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/guide/index.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/guide/namespaces.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/guide/processing.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/guide/remotes.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/guide/retry.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/index.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/overrides/main.html +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/quick-start.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/data-types/file.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/data-types/index.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/data-types/pose.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/data-types/segment.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/datachain.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/func.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/index.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/toolkit.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/torch.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/references/udf.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/docs/tutorials.md +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/multimodal/wds.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/mkdocs.yml +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/noxfile.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/pyproject.toml +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/setup.cfg +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/__main__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/asyn.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cache.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/cli/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/client/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/client/azure.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/client/gcs.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/client/hf.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/client/local.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/client/s3.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/config.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/dataset.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/delta.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/error.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/fs/reference.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/fs/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/func/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/func/array.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/func/base.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/func/conditional.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/func/func.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/func/numeric.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/func/path.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/func/random.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/func/string.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/func/window.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/job.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/audio.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/clip.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/file.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/hf.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/image.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/listing.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/projects.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/tar.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/text.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/video.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/listing.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/model/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/model/bbox.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/model/pose.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/model/segment.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/model/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/namespace.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/node.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/progress.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/project.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/py.typed +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/query/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/query/batch.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/query/metrics.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/query/params.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/query/queue.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/query/schema.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/query/session.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/query/udf.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/query/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/remote/studio.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/script_meta.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/semver.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/types.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/sql/utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/studio.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/telemetry.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/conftest.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/data.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/examples/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/examples/test_examples.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/examples/wds_data.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/data/lena.jpg +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/functions/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/functions/test_array.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/functions/test_path.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/functions/test_random.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/functions/test_string.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/model/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_audio.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_batching.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_catalog.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_client.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_data_storage.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_datasets.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_delta.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_file.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_image.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_listing.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_ls.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_metastore.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_metrics.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_pull.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_pytorch.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_query.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_read_database.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_retry.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_session.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_studio_datetime_parsing.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_toolkit.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_video.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/func/test_warehouse.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/scripts/feature_class.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/test_atomicity.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/test_cli_e2e.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/test_cli_studio.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/test_import_time.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/test_query_e2e.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/test_telemetry.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_audio.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/model/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_asyn.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_cache.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_catalog.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_client.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_config.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_dataset.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_func.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_listing.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_metastore.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_query.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_query_params.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_semver.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_serializer.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_session.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.28.1 → datachain-0.28.2}/tests/utils.py +0 -0
|
@@ -324,6 +324,7 @@ class DataChain:
|
|
|
324
324
|
sys: Optional[bool] = None,
|
|
325
325
|
namespace: Optional[str] = None,
|
|
326
326
|
project: Optional[str] = None,
|
|
327
|
+
batch_rows: Optional[int] = None,
|
|
327
328
|
) -> "Self":
|
|
328
329
|
"""Change settings for chain.
|
|
329
330
|
|
|
@@ -331,22 +332,24 @@ class DataChain:
|
|
|
331
332
|
It returns the chain, so it can be chained with the next operation.
|
|
332
333
|
|
|
333
334
|
Parameters:
|
|
334
|
-
cache : data caching (default=False)
|
|
335
|
+
cache : data caching. (default=False)
|
|
335
336
|
parallel : number of threads for processors. True is a special value to
|
|
336
|
-
enable all available CPUs (default=1)
|
|
337
|
+
enable all available CPUs. (default=1)
|
|
337
338
|
workers : number of distributed workers. Only for Studio mode. (default=1)
|
|
338
|
-
min_task_size : minimum number of tasks (default=1)
|
|
339
|
-
prefetch: number of workers to use for downloading files in advance.
|
|
339
|
+
min_task_size : minimum number of tasks. (default=1)
|
|
340
|
+
prefetch : number of workers to use for downloading files in advance.
|
|
340
341
|
This is enabled by default and uses 2 workers.
|
|
341
342
|
To disable prefetching, set it to 0.
|
|
342
|
-
namespace: namespace name.
|
|
343
|
-
project: project name.
|
|
343
|
+
namespace : namespace name.
|
|
344
|
+
project : project name.
|
|
345
|
+
batch_rows : row limit per insert to balance speed and memory usage.
|
|
346
|
+
(default=2000)
|
|
344
347
|
|
|
345
348
|
Example:
|
|
346
349
|
```py
|
|
347
350
|
chain = (
|
|
348
351
|
chain
|
|
349
|
-
.settings(cache=True, parallel=8)
|
|
352
|
+
.settings(cache=True, parallel=8, batch_rows=300)
|
|
350
353
|
.map(laion=process_webdataset(spec=WDSLaion), params="file")
|
|
351
354
|
)
|
|
352
355
|
```
|
|
@@ -356,7 +359,14 @@ class DataChain:
|
|
|
356
359
|
settings = copy.copy(self._settings)
|
|
357
360
|
settings.add(
|
|
358
361
|
Settings(
|
|
359
|
-
cache,
|
|
362
|
+
cache,
|
|
363
|
+
parallel,
|
|
364
|
+
workers,
|
|
365
|
+
min_task_size,
|
|
366
|
+
prefetch,
|
|
367
|
+
namespace,
|
|
368
|
+
project,
|
|
369
|
+
batch_rows,
|
|
360
370
|
)
|
|
361
371
|
)
|
|
362
372
|
return self._evolve(settings=settings, _sys=sys)
|
|
@@ -711,7 +721,7 @@ class DataChain:
|
|
|
711
721
|
|
|
712
722
|
return self._evolve(
|
|
713
723
|
query=self._query.add_signals(
|
|
714
|
-
udf_obj.to_udf_wrapper(),
|
|
724
|
+
udf_obj.to_udf_wrapper(self._settings.batch_rows),
|
|
715
725
|
**self._settings.to_dict(),
|
|
716
726
|
),
|
|
717
727
|
signal_schema=self.signals_schema | udf_obj.output,
|
|
@@ -749,7 +759,7 @@ class DataChain:
|
|
|
749
759
|
udf_obj.prefetch = prefetch
|
|
750
760
|
return self._evolve(
|
|
751
761
|
query=self._query.generate(
|
|
752
|
-
udf_obj.to_udf_wrapper(),
|
|
762
|
+
udf_obj.to_udf_wrapper(self._settings.batch_rows),
|
|
753
763
|
**self._settings.to_dict(),
|
|
754
764
|
),
|
|
755
765
|
signal_schema=udf_obj.output,
|
|
@@ -885,7 +895,7 @@ class DataChain:
|
|
|
885
895
|
udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
|
|
886
896
|
return self._evolve(
|
|
887
897
|
query=self._query.generate(
|
|
888
|
-
udf_obj.to_udf_wrapper(),
|
|
898
|
+
udf_obj.to_udf_wrapper(self._settings.batch_rows),
|
|
889
899
|
partition_by=processed_partition_by,
|
|
890
900
|
**self._settings.to_dict(),
|
|
891
901
|
),
|
|
@@ -917,11 +927,24 @@ class DataChain:
|
|
|
917
927
|
)
|
|
918
928
|
chain.save("new_dataset")
|
|
919
929
|
```
|
|
930
|
+
|
|
931
|
+
.. deprecated:: 0.29.0
|
|
932
|
+
This method is deprecated and will be removed in a future version.
|
|
933
|
+
Use `agg()` instead, which provides similar functionality.
|
|
920
934
|
"""
|
|
935
|
+
import warnings
|
|
936
|
+
|
|
937
|
+
warnings.warn(
|
|
938
|
+
"batch_map() is deprecated and will be removed in a future version. "
|
|
939
|
+
"Use agg() instead, which provides the similar functionality.",
|
|
940
|
+
DeprecationWarning,
|
|
941
|
+
stacklevel=2,
|
|
942
|
+
)
|
|
921
943
|
udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
|
|
944
|
+
|
|
922
945
|
return self._evolve(
|
|
923
946
|
query=self._query.add_signals(
|
|
924
|
-
udf_obj.to_udf_wrapper(batch),
|
|
947
|
+
udf_obj.to_udf_wrapper(self._settings.batch_rows, batch=batch),
|
|
925
948
|
**self._settings.to_dict(),
|
|
926
949
|
),
|
|
927
950
|
signal_schema=self.signals_schema | udf_obj.output,
|
|
@@ -2340,7 +2363,7 @@ class DataChain:
|
|
|
2340
2363
|
def setup(self, **kwargs) -> "Self":
|
|
2341
2364
|
"""Setup variables to pass to UDF functions.
|
|
2342
2365
|
|
|
2343
|
-
Use before running map/gen/agg
|
|
2366
|
+
Use before running map/gen/agg to save an object and pass it as an
|
|
2344
2367
|
argument to the UDF.
|
|
2345
2368
|
|
|
2346
2369
|
The value must be a callable (a `lambda: <value>` syntax can be used to quickly
|
|
@@ -15,6 +15,8 @@ if TYPE_CHECKING:
|
|
|
15
15
|
|
|
16
16
|
P = ParamSpec("P")
|
|
17
17
|
|
|
18
|
+
READ_RECORDS_BATCH_SIZE = 10000
|
|
19
|
+
|
|
18
20
|
|
|
19
21
|
def read_records(
|
|
20
22
|
to_insert: Optional[Union[dict, Iterable[dict]]],
|
|
@@ -41,7 +43,7 @@ def read_records(
|
|
|
41
43
|
Notes:
|
|
42
44
|
This call blocks until all records are inserted.
|
|
43
45
|
"""
|
|
44
|
-
from datachain.query.dataset import
|
|
46
|
+
from datachain.query.dataset import adjust_outputs, get_col_types
|
|
45
47
|
from datachain.sql.types import SQLType
|
|
46
48
|
from datachain.utils import batched
|
|
47
49
|
|
|
@@ -94,7 +96,7 @@ def read_records(
|
|
|
94
96
|
{c.name: c.type for c in columns if isinstance(c.type, SQLType)},
|
|
95
97
|
)
|
|
96
98
|
records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
|
|
97
|
-
for chunk in batched(records,
|
|
99
|
+
for chunk in batched(records, READ_RECORDS_BATCH_SIZE):
|
|
98
100
|
warehouse.insert_rows(table, chunk)
|
|
99
101
|
warehouse.insert_rows_done(table)
|
|
100
102
|
return read_dataset(name=dsr.full_name, session=session, settings=settings)
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from datachain.lib.utils import DataChainParamsError
|
|
2
|
+
from datachain.utils import DEFAULT_CHUNK_ROWS
|
|
2
3
|
|
|
3
4
|
|
|
4
5
|
class SettingsError(DataChainParamsError):
|
|
@@ -16,6 +17,7 @@ class Settings:
|
|
|
16
17
|
prefetch=None,
|
|
17
18
|
namespace=None,
|
|
18
19
|
project=None,
|
|
20
|
+
batch_rows=None,
|
|
19
21
|
):
|
|
20
22
|
self._cache = cache
|
|
21
23
|
self.parallel = parallel
|
|
@@ -24,6 +26,7 @@ class Settings:
|
|
|
24
26
|
self.prefetch = prefetch
|
|
25
27
|
self.namespace = namespace
|
|
26
28
|
self.project = project
|
|
29
|
+
self._chunk_rows = batch_rows
|
|
27
30
|
|
|
28
31
|
if not isinstance(cache, bool) and cache is not None:
|
|
29
32
|
raise SettingsError(
|
|
@@ -53,6 +56,18 @@ class Settings:
|
|
|
53
56
|
f", {min_task_size.__class__.__name__} was given"
|
|
54
57
|
)
|
|
55
58
|
|
|
59
|
+
if batch_rows is not None and not isinstance(batch_rows, int):
|
|
60
|
+
raise SettingsError(
|
|
61
|
+
"'batch_rows' argument must be int or None"
|
|
62
|
+
f", {batch_rows.__class__.__name__} was given"
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
if batch_rows is not None and batch_rows <= 0:
|
|
66
|
+
raise SettingsError(
|
|
67
|
+
"'batch_rows' argument must be positive integer"
|
|
68
|
+
f", {batch_rows} was given"
|
|
69
|
+
)
|
|
70
|
+
|
|
56
71
|
@property
|
|
57
72
|
def cache(self):
|
|
58
73
|
return self._cache if self._cache is not None else False
|
|
@@ -61,6 +76,10 @@ class Settings:
|
|
|
61
76
|
def workers(self):
|
|
62
77
|
return self._workers if self._workers is not None else False
|
|
63
78
|
|
|
79
|
+
@property
|
|
80
|
+
def batch_rows(self):
|
|
81
|
+
return self._chunk_rows if self._chunk_rows is not None else DEFAULT_CHUNK_ROWS
|
|
82
|
+
|
|
64
83
|
def to_dict(self):
|
|
65
84
|
res = {}
|
|
66
85
|
if self._cache is not None:
|
|
@@ -75,6 +94,8 @@ class Settings:
|
|
|
75
94
|
res["namespace"] = self.namespace
|
|
76
95
|
if self.project is not None:
|
|
77
96
|
res["project"] = self.project
|
|
97
|
+
if self._chunk_rows is not None:
|
|
98
|
+
res["batch_rows"] = self._chunk_rows
|
|
78
99
|
return res
|
|
79
100
|
|
|
80
101
|
def add(self, settings: "Settings"):
|
|
@@ -86,3 +107,5 @@ class Settings:
|
|
|
86
107
|
self.project = settings.project or self.project
|
|
87
108
|
if settings.prefetch is not None:
|
|
88
109
|
self.prefetch = settings.prefetch
|
|
110
|
+
if settings._chunk_rows is not None:
|
|
111
|
+
self._chunk_rows = settings._chunk_rows
|
|
@@ -62,19 +62,21 @@ class UDFProperties:
|
|
|
62
62
|
return self.udf.get_batching(use_partitioning)
|
|
63
63
|
|
|
64
64
|
@property
|
|
65
|
-
def
|
|
66
|
-
return self.udf.
|
|
65
|
+
def batch_rows(self):
|
|
66
|
+
return self.udf.batch_rows
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
@attrs.define(slots=False)
|
|
70
70
|
class UDFAdapter:
|
|
71
71
|
inner: "UDFBase"
|
|
72
72
|
output: UDFOutputSpec
|
|
73
|
+
batch_rows: Optional[int] = None
|
|
73
74
|
batch: int = 1
|
|
74
75
|
|
|
75
76
|
def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
|
|
76
77
|
if use_partitioning:
|
|
77
78
|
return Partition()
|
|
79
|
+
|
|
78
80
|
if self.batch == 1:
|
|
79
81
|
return NoBatching()
|
|
80
82
|
if self.batch > 1:
|
|
@@ -233,10 +235,15 @@ class UDFBase(AbstractUDF):
|
|
|
233
235
|
def signal_names(self) -> Iterable[str]:
|
|
234
236
|
return self.output.to_udf_spec().keys()
|
|
235
237
|
|
|
236
|
-
def to_udf_wrapper(
|
|
238
|
+
def to_udf_wrapper(
|
|
239
|
+
self,
|
|
240
|
+
batch_rows: Optional[int] = None,
|
|
241
|
+
batch: int = 1,
|
|
242
|
+
) -> UDFAdapter:
|
|
237
243
|
return UDFAdapter(
|
|
238
244
|
self,
|
|
239
245
|
self.output.to_udf_spec(),
|
|
246
|
+
batch_rows,
|
|
240
247
|
batch,
|
|
241
248
|
)
|
|
242
249
|
|
|
@@ -418,11 +425,27 @@ class Mapper(UDFBase):
|
|
|
418
425
|
|
|
419
426
|
|
|
420
427
|
class BatchMapper(UDFBase):
|
|
421
|
-
"""Inherit from this class to pass to `DataChain.batch_map()`.
|
|
428
|
+
"""Inherit from this class to pass to `DataChain.batch_map()`.
|
|
429
|
+
|
|
430
|
+
.. deprecated:: 0.29.0
|
|
431
|
+
This class is deprecated and will be removed in a future version.
|
|
432
|
+
Use `Aggregator` instead, which provides similar functionality.
|
|
433
|
+
"""
|
|
422
434
|
|
|
423
435
|
is_input_batched = True
|
|
424
436
|
is_output_batched = True
|
|
425
437
|
|
|
438
|
+
def __init__(self):
|
|
439
|
+
import warnings
|
|
440
|
+
|
|
441
|
+
warnings.warn(
|
|
442
|
+
"BatchMapper is deprecated and will be removed in a future version. "
|
|
443
|
+
"Use Aggregator instead, which provides the similar functionality.",
|
|
444
|
+
DeprecationWarning,
|
|
445
|
+
stacklevel=2,
|
|
446
|
+
)
|
|
447
|
+
super().__init__()
|
|
448
|
+
|
|
426
449
|
def run(
|
|
427
450
|
self,
|
|
428
451
|
udf_fields: Sequence[str],
|
|
@@ -333,32 +333,24 @@ def process_udf_outputs(
|
|
|
333
333
|
udf_table: "Table",
|
|
334
334
|
udf_results: Iterator[Iterable["UDFResult"]],
|
|
335
335
|
udf: "UDFAdapter",
|
|
336
|
-
batch_size: int = INSERT_BATCH_SIZE,
|
|
337
336
|
cb: Callback = DEFAULT_CALLBACK,
|
|
338
337
|
) -> None:
|
|
339
|
-
import psutil
|
|
340
|
-
|
|
341
|
-
rows: list[UDFResult] = []
|
|
342
338
|
# Optimization: Compute row types once, rather than for every row.
|
|
343
339
|
udf_col_types = get_col_types(warehouse, udf.output)
|
|
340
|
+
batch_rows = udf.batch_rows or INSERT_BATCH_SIZE
|
|
344
341
|
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
):
|
|
355
|
-
for row_chunk in batched(rows, batch_size):
|
|
356
|
-
warehouse.insert_rows(udf_table, row_chunk)
|
|
357
|
-
rows.clear()
|
|
342
|
+
def _insert_rows():
|
|
343
|
+
for udf_output in udf_results:
|
|
344
|
+
if not udf_output:
|
|
345
|
+
continue
|
|
346
|
+
|
|
347
|
+
with safe_closing(udf_output):
|
|
348
|
+
for row in udf_output:
|
|
349
|
+
cb.relative_update()
|
|
350
|
+
yield adjust_outputs(warehouse, row, udf_col_types)
|
|
358
351
|
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
warehouse.insert_rows(udf_table, row_chunk)
|
|
352
|
+
for row_chunk in batched(_insert_rows(), batch_rows):
|
|
353
|
+
warehouse.insert_rows(udf_table, row_chunk)
|
|
362
354
|
|
|
363
355
|
warehouse.insert_rows_done(udf_table)
|
|
364
356
|
|
|
@@ -401,6 +393,7 @@ class UDFStep(Step, ABC):
|
|
|
401
393
|
min_task_size: Optional[int] = None
|
|
402
394
|
is_generator = False
|
|
403
395
|
cache: bool = False
|
|
396
|
+
batch_rows: Optional[int] = None
|
|
404
397
|
|
|
405
398
|
@abstractmethod
|
|
406
399
|
def create_udf_table(self, query: Select) -> "Table":
|
|
@@ -602,6 +595,7 @@ class UDFStep(Step, ABC):
|
|
|
602
595
|
parallel=self.parallel,
|
|
603
596
|
workers=self.workers,
|
|
604
597
|
min_task_size=self.min_task_size,
|
|
598
|
+
batch_rows=self.batch_rows,
|
|
605
599
|
)
|
|
606
600
|
return self.__class__(self.udf, self.catalog)
|
|
607
601
|
|
|
@@ -1633,6 +1627,7 @@ class DatasetQuery:
|
|
|
1633
1627
|
min_task_size: Optional[int] = None,
|
|
1634
1628
|
partition_by: Optional[PartitionByType] = None,
|
|
1635
1629
|
cache: bool = False,
|
|
1630
|
+
batch_rows: Optional[int] = None,
|
|
1636
1631
|
) -> "Self":
|
|
1637
1632
|
"""
|
|
1638
1633
|
Adds one or more signals based on the results from the provided UDF.
|
|
@@ -1658,6 +1653,7 @@ class DatasetQuery:
|
|
|
1658
1653
|
workers=workers,
|
|
1659
1654
|
min_task_size=min_task_size,
|
|
1660
1655
|
cache=cache,
|
|
1656
|
+
batch_rows=batch_rows,
|
|
1661
1657
|
)
|
|
1662
1658
|
)
|
|
1663
1659
|
return query
|
|
@@ -1679,6 +1675,7 @@ class DatasetQuery:
|
|
|
1679
1675
|
namespace: Optional[str] = None,
|
|
1680
1676
|
project: Optional[str] = None,
|
|
1681
1677
|
cache: bool = False,
|
|
1678
|
+
batch_rows: Optional[int] = None,
|
|
1682
1679
|
) -> "Self":
|
|
1683
1680
|
query = self.clone()
|
|
1684
1681
|
steps = query.steps
|
|
@@ -1691,6 +1688,7 @@ class DatasetQuery:
|
|
|
1691
1688
|
workers=workers,
|
|
1692
1689
|
min_task_size=min_task_size,
|
|
1693
1690
|
cache=cache,
|
|
1691
|
+
batch_rows=batch_rows,
|
|
1694
1692
|
)
|
|
1695
1693
|
)
|
|
1696
1694
|
return query
|
|
@@ -11,7 +11,6 @@ import time
|
|
|
11
11
|
from collections.abc import Iterable, Iterator, Sequence
|
|
12
12
|
from contextlib import contextmanager
|
|
13
13
|
from datetime import date, datetime, timezone
|
|
14
|
-
from itertools import chain, islice
|
|
15
14
|
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
|
|
16
15
|
from uuid import UUID
|
|
17
16
|
|
|
@@ -26,6 +25,8 @@ if TYPE_CHECKING:
|
|
|
26
25
|
from typing_extensions import Self
|
|
27
26
|
|
|
28
27
|
|
|
28
|
+
DEFAULT_CHUNK_ROWS = 2000
|
|
29
|
+
|
|
29
30
|
logger = logging.getLogger("datachain")
|
|
30
31
|
|
|
31
32
|
NUL = b"\0"
|
|
@@ -225,30 +226,44 @@ def get_envs_by_prefix(prefix: str) -> dict[str, str]:
|
|
|
225
226
|
_T_co = TypeVar("_T_co", covariant=True)
|
|
226
227
|
|
|
227
228
|
|
|
228
|
-
def
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
229
|
+
def _dynamic_batched_core(
|
|
230
|
+
iterable: Iterable[_T_co],
|
|
231
|
+
batch_rows: int,
|
|
232
|
+
) -> Iterator[list[_T_co]]:
|
|
233
|
+
"""Core batching logic that yields lists."""
|
|
234
|
+
|
|
235
|
+
batch: list[_T_co] = []
|
|
236
|
+
|
|
237
|
+
for item in iterable:
|
|
238
|
+
# Check if adding this item would exceed limits
|
|
239
|
+
if len(batch) >= batch_rows and batch: # Yield current batch if we have one
|
|
240
|
+
yield batch
|
|
241
|
+
batch = []
|
|
242
|
+
|
|
243
|
+
batch.append(item)
|
|
244
|
+
|
|
245
|
+
# Yield any remaining items
|
|
246
|
+
if batch:
|
|
236
247
|
yield batch
|
|
237
248
|
|
|
238
249
|
|
|
239
|
-
def
|
|
240
|
-
"""
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
250
|
+
def batched(iterable: Iterable[_T_co], batch_rows: int) -> Iterator[tuple[_T_co, ...]]:
|
|
251
|
+
"""
|
|
252
|
+
Batch data into tuples of length batch_rows.
|
|
253
|
+
The last batch may be shorter.
|
|
254
|
+
"""
|
|
255
|
+
yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_rows))
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def batched_it(
|
|
259
|
+
iterable: Iterable[_T_co],
|
|
260
|
+
batch_rows: int = DEFAULT_CHUNK_ROWS,
|
|
261
|
+
) -> Iterator[Iterator[_T_co]]:
|
|
262
|
+
"""
|
|
263
|
+
Batch data into iterators with dynamic sizing
|
|
264
|
+
based on row count.
|
|
265
|
+
"""
|
|
266
|
+
yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_rows))
|
|
252
267
|
|
|
253
268
|
|
|
254
269
|
def flatten(items):
|
|
@@ -379,6 +379,7 @@ tests/unit/lib/test_partition_by.py
|
|
|
379
379
|
tests/unit/lib/test_project.py
|
|
380
380
|
tests/unit/lib/test_python_to_sql.py
|
|
381
381
|
tests/unit/lib/test_schema.py
|
|
382
|
+
tests/unit/lib/test_settings.py
|
|
382
383
|
tests/unit/lib/test_signal_schema.py
|
|
383
384
|
tests/unit/lib/test_sql_to_python.py
|
|
384
385
|
tests/unit/lib/test_text.py
|
|
@@ -2420,3 +2420,22 @@ def test_agg_sample(catalog_tmpfile, parallel, sample):
|
|
|
2420
2420
|
records = list(ds.to_records())
|
|
2421
2421
|
assert len(records) == sample
|
|
2422
2422
|
assert all(row["count"] == 1 for row in records)
|
|
2423
|
+
|
|
2424
|
+
|
|
2425
|
+
def test_batch_for_map(test_session):
|
|
2426
|
+
# Create a chain with batch settings
|
|
2427
|
+
chain = dc.read_values(x=list(range(100)), session=test_session)
|
|
2428
|
+
chain_with_settings = chain.settings(batch_rows=15)
|
|
2429
|
+
|
|
2430
|
+
def add_one(x):
|
|
2431
|
+
return x + 1
|
|
2432
|
+
|
|
2433
|
+
result = chain_with_settings.map(add_one, output={"result": int})
|
|
2434
|
+
|
|
2435
|
+
results = [
|
|
2436
|
+
r[0] for r in result.to_iter("result")
|
|
2437
|
+
] # Access the first element of each tuple
|
|
2438
|
+
|
|
2439
|
+
assert len(results) == 100
|
|
2440
|
+
|
|
2441
|
+
assert set(results) == set(range(1, 101))
|
|
@@ -119,6 +119,7 @@ def test_hf_image(tmp_path):
|
|
|
119
119
|
assert row.image.img == image_to_bytes(img)
|
|
120
120
|
|
|
121
121
|
|
|
122
|
+
@pytest.mark.skip("fails with 'NotImplementedError', need to investigate")
|
|
122
123
|
@require_torchcodec
|
|
123
124
|
def test_hf_audio(tmp_path):
|
|
124
125
|
# See https://stackoverflow.com/questions/66191480/how-to-convert-a-numpy-array-to-a-mp3-file
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from datachain.lib.settings import Settings, SettingsError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_settings_defaults_and_custom():
|
|
7
|
+
"""Test Settings class with default and custom batch parameters."""
|
|
8
|
+
# Default values
|
|
9
|
+
settings = Settings()
|
|
10
|
+
assert settings.batch_rows == 2000
|
|
11
|
+
|
|
12
|
+
# Custom values
|
|
13
|
+
settings = Settings(batch_rows=500)
|
|
14
|
+
assert settings.batch_rows == 500
|
|
15
|
+
|
|
16
|
+
# to_dict method
|
|
17
|
+
d = settings.to_dict()
|
|
18
|
+
assert d["batch_rows"] == 500
|
|
19
|
+
|
|
20
|
+
# Chaining
|
|
21
|
+
s2 = settings
|
|
22
|
+
s3 = s2
|
|
23
|
+
assert s3.batch_rows == 500
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_settings_validation():
|
|
27
|
+
# Valid
|
|
28
|
+
settings = Settings(batch_rows=100)
|
|
29
|
+
assert settings.batch_rows == 100
|
|
30
|
+
|
|
31
|
+
# Invalid batch_rows
|
|
32
|
+
with pytest.raises(SettingsError):
|
|
33
|
+
Settings(batch_rows="invalid")
|
|
34
|
+
|
|
35
|
+
# Zero batch_rows
|
|
36
|
+
with pytest.raises(SettingsError):
|
|
37
|
+
Settings(batch_rows=0)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_settings_add():
|
|
41
|
+
"""Test Settings.add() method with batch_rows."""
|
|
42
|
+
# Create base settings
|
|
43
|
+
base_settings = Settings(batch_rows=1000)
|
|
44
|
+
|
|
45
|
+
# Create settings to add
|
|
46
|
+
add_settings = Settings(batch_rows=2000)
|
|
47
|
+
|
|
48
|
+
# Add settings
|
|
49
|
+
base_settings.add(add_settings)
|
|
50
|
+
|
|
51
|
+
# Verify that values from add_settings override base_settings
|
|
52
|
+
assert base_settings.batch_rows == 2000
|
|
53
|
+
|
|
54
|
+
# Test with None values (should not override)
|
|
55
|
+
base_settings = Settings(batch_rows=1000)
|
|
56
|
+
none_settings = Settings(batch_rows=None)
|
|
57
|
+
|
|
58
|
+
base_settings.add(none_settings)
|
|
59
|
+
|
|
60
|
+
# Verify that None values don't override existing values
|
|
61
|
+
assert base_settings.batch_rows == 1000
|
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
import pytest
|
|
4
4
|
|
|
5
5
|
from datachain.utils import (
|
|
6
|
+
batched,
|
|
6
7
|
datachain_paths_join,
|
|
7
8
|
determine_processes,
|
|
8
9
|
determine_workers,
|
|
@@ -253,3 +254,21 @@ def test_nested_dict_path_set(data, path, value, expected):
|
|
|
253
254
|
)
|
|
254
255
|
def test_row_to_nested_dict(headers, row, expected):
|
|
255
256
|
assert row_to_nested_dict(headers, row) == expected
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def test_batched_basic():
|
|
260
|
+
"""Test basic batching functionality."""
|
|
261
|
+
data = list(range(10))
|
|
262
|
+
batches = list(batched(data, 3))
|
|
263
|
+
assert batches == [(0, 1, 2), (3, 4, 5), (6, 7, 8), (9,)]
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def test_batched_row_limit():
|
|
267
|
+
"""Test dynamic batching with row count limit."""
|
|
268
|
+
data = list(range(15))
|
|
269
|
+
batches = list(batched(data, batch_rows=4))
|
|
270
|
+
assert len(batches) == 4 # 15 items / 4 max = 4 batches
|
|
271
|
+
assert batches[0] == (0, 1, 2, 3)
|
|
272
|
+
assert batches[1] == (4, 5, 6, 7)
|
|
273
|
+
assert batches[2] == (8, 9, 10, 11)
|
|
274
|
+
assert batches[3] == (12, 13, 14)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|