datachain 0.14.2__tar.gz → 0.14.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.14.2/src/datachain.egg-info → datachain-0.14.4}/PKG-INFO +1 -1
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/catalog/loader.py +4 -9
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/warehouse.py +9 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dataset_info.py +5 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/datasets.py +2 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/udf.py +3 -3
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/dataset.py +39 -40
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/dispatch.py +62 -58
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/session.py +4 -0
- datachain-0.14.4/src/datachain/query/udf.py +49 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/utils.py +30 -4
- {datachain-0.14.2 → datachain-0.14.4/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_datachain.py +11 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_datachain.py +13 -1
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_catalog_loader.py +7 -14
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_session.py +12 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_utils.py +46 -10
- datachain-0.14.2/src/datachain/query/udf.py +0 -20
- {datachain-0.14.2 → datachain-0.14.4}/.cruft.json +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.gitattributes +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.github/codecov.yaml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.github/dependabot.yml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.github/workflows/release.yml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.github/workflows/tests.yml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.gitignore +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/.pre-commit-config.yaml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/LICENSE +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/README.rst +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/assets/datachain.svg +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/contributing.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/examples.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/index.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/overrides/main.html +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/quick-start.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/file.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/index.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/pose.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/segment.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/datachain.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/func.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/index.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/remotes.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/toolkit.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/torch.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/references/udf.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/docs/tutorials.md +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/multimodal/wds.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/mkdocs.yml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/noxfile.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/pyproject.toml +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/setup.cfg +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/__main__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/asyn.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cache.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/azure.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/gcs.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/hf.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/local.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/s3.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/config.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/dataset.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/error.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/fs/reference.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/fs/utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/array.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/base.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/conditional.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/func.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/numeric.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/path.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/random.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/string.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/window.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/job.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/clip.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/datachain.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/file.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/hf.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/image.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/listing.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/settings.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/tar.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/text.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/video.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/listing.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/bbox.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/pose.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/segment.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/node.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/progress.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/py.typed +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/batch.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/metrics.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/params.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/queue.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/schema.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/remote/studio.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/script_meta.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/types.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/studio.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/telemetry.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/conftest.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/data.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/examples/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/examples/test_examples.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/examples/wds_data.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/data/lena.jpg +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_catalog.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_client.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_data_storage.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_datasets.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_file.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_hf.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_image.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_listing.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_ls.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_metrics.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_pull.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_pytorch.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_query.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_session.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_toolkit.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_video.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_warehouse.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/scripts/feature_class.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/test_atomicity.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/test_cli_e2e.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/test_cli_studio.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/test_import_time.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/test_query_e2e.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/test_telemetry.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/model/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_asyn.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_cache.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_catalog.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_client.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_config.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_dataset.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_func.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_listing.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_metastore.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_query.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_query_params.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_serializer.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.14.2 → datachain-0.14.4}/tests/utils.py +0 -0
|
@@ -7,6 +7,7 @@ from datachain.utils import get_envs_by_prefix
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
8
|
from datachain.catalog import Catalog
|
|
9
9
|
from datachain.data_storage import AbstractMetastore, AbstractWarehouse
|
|
10
|
+
from datachain.query.udf import AbstractUDFDistributor
|
|
10
11
|
|
|
11
12
|
METASTORE_SERIALIZED = "DATACHAIN__METASTORE"
|
|
12
13
|
METASTORE_IMPORT_PATH = "DATACHAIN_METASTORE"
|
|
@@ -15,7 +16,6 @@ WAREHOUSE_SERIALIZED = "DATACHAIN__WAREHOUSE"
|
|
|
15
16
|
WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
|
|
16
17
|
WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
|
|
17
18
|
DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
|
|
18
|
-
DISTRIBUTED_ARG_PREFIX = "DATACHAIN_DISTRIBUTED_ARG_"
|
|
19
19
|
|
|
20
20
|
IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
|
|
21
21
|
|
|
@@ -100,27 +100,22 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
|
|
|
100
100
|
return warehouse_class(**warehouse_args)
|
|
101
101
|
|
|
102
102
|
|
|
103
|
-
def
|
|
103
|
+
def get_udf_distributor_class() -> type["AbstractUDFDistributor"]:
|
|
104
104
|
distributed_import_path = os.environ.get(DISTRIBUTED_IMPORT_PATH)
|
|
105
|
-
distributed_arg_envs = get_envs_by_prefix(DISTRIBUTED_ARG_PREFIX)
|
|
106
|
-
# Convert env variable names to keyword argument names by lowercasing them
|
|
107
|
-
distributed_args = {k.lower(): v for k, v in distributed_arg_envs.items()}
|
|
108
105
|
|
|
109
106
|
if not distributed_import_path:
|
|
110
107
|
raise RuntimeError(
|
|
111
108
|
f"{DISTRIBUTED_IMPORT_PATH} import path is required "
|
|
112
109
|
"for distributed UDF processing."
|
|
113
110
|
)
|
|
114
|
-
# Distributed class paths are specified as (for example):
|
|
115
|
-
# module.classname
|
|
111
|
+
# Distributed class paths are specified as (for example): module.classname
|
|
116
112
|
if "." not in distributed_import_path:
|
|
117
113
|
raise RuntimeError(
|
|
118
114
|
f"Invalid {DISTRIBUTED_IMPORT_PATH} import path: {distributed_import_path}"
|
|
119
115
|
)
|
|
120
116
|
module_name, _, class_name = distributed_import_path.rpartition(".")
|
|
121
117
|
distributed = import_module(module_name)
|
|
122
|
-
|
|
123
|
-
return distributed_class(**distributed_args | kwargs)
|
|
118
|
+
return getattr(distributed, class_name)
|
|
124
119
|
|
|
125
120
|
|
|
126
121
|
def get_catalog(
|
|
@@ -199,6 +199,15 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
199
199
|
# Query Execution
|
|
200
200
|
#
|
|
201
201
|
|
|
202
|
+
def query_count(self, query: sa.sql.selectable.Select) -> int:
|
|
203
|
+
"""Count the number of rows in a query."""
|
|
204
|
+
count_query = sa.select(func.count(1)).select_from(query.subquery())
|
|
205
|
+
return next(self.db.execute(count_query))[0]
|
|
206
|
+
|
|
207
|
+
def table_rows_count(self, table) -> int:
|
|
208
|
+
count_query = sa.select(func.count(1)).select_from(table)
|
|
209
|
+
return next(self.db.execute(count_query))[0]
|
|
210
|
+
|
|
202
211
|
def dataset_select_paginated(
|
|
203
212
|
self,
|
|
204
213
|
query,
|
|
@@ -12,6 +12,7 @@ from datachain.dataset import (
|
|
|
12
12
|
)
|
|
13
13
|
from datachain.job import Job
|
|
14
14
|
from datachain.lib.data_model import DataModel
|
|
15
|
+
from datachain.query.session import Session
|
|
15
16
|
from datachain.utils import TIME_ZERO
|
|
16
17
|
|
|
17
18
|
if TYPE_CHECKING:
|
|
@@ -32,6 +33,10 @@ class DatasetInfo(DataModel):
|
|
|
32
33
|
error_message: str = Field(default="")
|
|
33
34
|
error_stack: str = Field(default="")
|
|
34
35
|
|
|
36
|
+
@property
|
|
37
|
+
def is_temp(self) -> bool:
|
|
38
|
+
return Session.is_temp_dataset(self.name)
|
|
39
|
+
|
|
35
40
|
@staticmethod
|
|
36
41
|
def _validate_dict(
|
|
37
42
|
v: Optional[Union[str, dict]],
|
|
@@ -16,7 +16,6 @@ from datachain.lib.convert.flatten import flatten
|
|
|
16
16
|
from datachain.lib.data_model import DataValue
|
|
17
17
|
from datachain.lib.file import File
|
|
18
18
|
from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
|
|
19
|
-
from datachain.progress import CombinedDownloadCallback
|
|
20
19
|
from datachain.query.batch import (
|
|
21
20
|
Batch,
|
|
22
21
|
BatchingStrategy,
|
|
@@ -327,8 +326,9 @@ def _prefetch_inputs(
|
|
|
327
326
|
|
|
328
327
|
if after_prefetch is None:
|
|
329
328
|
after_prefetch = noop
|
|
330
|
-
if
|
|
331
|
-
|
|
329
|
+
if download_cb and hasattr(download_cb, "increment_file_count"):
|
|
330
|
+
increment_file_count: Callable[[], None] = download_cb.increment_file_count
|
|
331
|
+
after_prefetch = increment_file_count
|
|
332
332
|
|
|
333
333
|
f = partial(_prefetch_input, download_cb=download_cb, after_prefetch=after_prefetch)
|
|
334
334
|
mapper = AsyncMapper(f, prepared_inputs, workers=prefetch)
|
|
@@ -55,10 +55,12 @@ from datachain.lib.udf import UDFAdapter, _get_cache
|
|
|
55
55
|
from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
|
|
56
56
|
from datachain.query.schema import C, UDFParamSpec, normalize_param
|
|
57
57
|
from datachain.query.session import Session
|
|
58
|
+
from datachain.query.udf import UdfInfo
|
|
58
59
|
from datachain.sql.functions.random import rand
|
|
59
60
|
from datachain.utils import (
|
|
60
61
|
batched,
|
|
61
62
|
determine_processes,
|
|
63
|
+
determine_workers,
|
|
62
64
|
filtered_cloudpickle_dumps,
|
|
63
65
|
get_datachain_executable,
|
|
64
66
|
safe_closing,
|
|
@@ -74,7 +76,6 @@ if TYPE_CHECKING:
|
|
|
74
76
|
from datachain.data_storage import AbstractWarehouse
|
|
75
77
|
from datachain.dataset import DatasetRecord
|
|
76
78
|
from datachain.lib.udf import UDFAdapter, UDFResult
|
|
77
|
-
from datachain.query.udf import UdfInfo
|
|
78
79
|
|
|
79
80
|
P = ParamSpec("P")
|
|
80
81
|
|
|
@@ -414,20 +415,15 @@ class UDFStep(Step, ABC):
|
|
|
414
415
|
def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
|
|
415
416
|
from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
|
|
416
417
|
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
if (
|
|
421
|
-
not workers
|
|
422
|
-
and os.environ.get("DATACHAIN_DISTRIBUTED")
|
|
423
|
-
and os.environ.get("DATACHAIN_SETTINGS_WORKERS")
|
|
424
|
-
):
|
|
425
|
-
# Enable distributed processing by default if the module is available,
|
|
426
|
-
# and a default number of workers is provided.
|
|
427
|
-
workers = True
|
|
418
|
+
rows_total = self.catalog.warehouse.query_count(query)
|
|
419
|
+
if rows_total == 0:
|
|
420
|
+
return
|
|
428
421
|
|
|
429
|
-
|
|
422
|
+
workers = determine_workers(self.workers, rows_total=rows_total)
|
|
423
|
+
processes = determine_processes(self.parallel, rows_total=rows_total)
|
|
430
424
|
|
|
425
|
+
use_partitioning = self.partition_by is not None
|
|
426
|
+
batching = self.udf.get_batching(use_partitioning)
|
|
431
427
|
udf_fields = [str(c.name) for c in query.selected_columns]
|
|
432
428
|
|
|
433
429
|
prefetch = self.udf.prefetch
|
|
@@ -441,23 +437,24 @@ class UDFStep(Step, ABC):
|
|
|
441
437
|
"distributed processing."
|
|
442
438
|
)
|
|
443
439
|
|
|
444
|
-
from datachain.catalog.loader import
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
processes,
|
|
440
|
+
from datachain.catalog.loader import get_udf_distributor_class
|
|
441
|
+
|
|
442
|
+
udf_distributor_class = get_udf_distributor_class()
|
|
443
|
+
udf_distributor = udf_distributor_class(
|
|
444
|
+
catalog=catalog,
|
|
445
|
+
table=udf_table,
|
|
446
|
+
query=query,
|
|
447
|
+
udf_data=filtered_cloudpickle_dumps(self.udf),
|
|
448
|
+
batching=batching,
|
|
449
|
+
workers=workers,
|
|
450
|
+
processes=processes,
|
|
456
451
|
udf_fields=udf_fields,
|
|
452
|
+
rows_total=rows_total,
|
|
453
|
+
use_cache=self.cache,
|
|
457
454
|
is_generator=self.is_generator,
|
|
458
|
-
|
|
459
|
-
cache=self.cache,
|
|
455
|
+
min_task_size=self.min_task_size,
|
|
460
456
|
)
|
|
457
|
+
udf_distributor()
|
|
461
458
|
elif processes:
|
|
462
459
|
# Parallel processing (faster for more CPU-heavy UDFs)
|
|
463
460
|
if catalog.in_memory:
|
|
@@ -465,19 +462,21 @@ class UDFStep(Step, ABC):
|
|
|
465
462
|
"In-memory databases cannot be used "
|
|
466
463
|
"with parallel processing."
|
|
467
464
|
)
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
465
|
+
|
|
466
|
+
udf_info = UdfInfo(
|
|
467
|
+
udf_data=filtered_cloudpickle_dumps(self.udf),
|
|
468
|
+
catalog_init=catalog.get_init_params(),
|
|
469
|
+
metastore_clone_params=catalog.metastore.clone_params(),
|
|
470
|
+
warehouse_clone_params=catalog.warehouse.clone_params(),
|
|
471
|
+
table=udf_table,
|
|
472
|
+
query=query,
|
|
473
|
+
udf_fields=udf_fields,
|
|
474
|
+
batching=batching,
|
|
475
|
+
processes=processes,
|
|
476
|
+
is_generator=self.is_generator,
|
|
477
|
+
cache=self.cache,
|
|
478
|
+
rows_total=rows_total,
|
|
479
|
+
)
|
|
481
480
|
|
|
482
481
|
# Run the UDFDispatcher in another process to avoid needing
|
|
483
482
|
# if __name__ == '__main__': in user scripts
|
|
@@ -4,18 +4,16 @@ from itertools import chain
|
|
|
4
4
|
from multiprocessing import cpu_count
|
|
5
5
|
from sys import stdin
|
|
6
6
|
from threading import Timer
|
|
7
|
-
from typing import TYPE_CHECKING, Optional
|
|
7
|
+
from typing import TYPE_CHECKING, Literal, Optional
|
|
8
8
|
|
|
9
|
-
import attrs
|
|
10
9
|
import multiprocess
|
|
11
10
|
from cloudpickle import load, loads
|
|
12
11
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
13
12
|
from multiprocess import get_context
|
|
14
|
-
from sqlalchemy.sql import func
|
|
15
13
|
|
|
16
14
|
from datachain.catalog import Catalog
|
|
17
15
|
from datachain.catalog.catalog import clone_catalog_with_cache
|
|
18
|
-
from datachain.catalog.loader import
|
|
16
|
+
from datachain.catalog.loader import get_udf_distributor_class
|
|
19
17
|
from datachain.lib.udf import _get_cache
|
|
20
18
|
from datachain.query.batch import RowsOutput, RowsOutputBatch
|
|
21
19
|
from datachain.query.dataset import (
|
|
@@ -59,7 +57,9 @@ def udf_entrypoint() -> int:
|
|
|
59
57
|
dispatch = UDFDispatcher(udf_info)
|
|
60
58
|
|
|
61
59
|
query = udf_info["query"]
|
|
60
|
+
rows_total = udf_info["rows_total"]
|
|
62
61
|
batching = udf_info["batching"]
|
|
62
|
+
is_generator = udf_info["is_generator"]
|
|
63
63
|
n_workers = udf_info["processes"]
|
|
64
64
|
if n_workers is True:
|
|
65
65
|
n_workers = None # Use default number of CPUs (cores)
|
|
@@ -67,34 +67,31 @@ def udf_entrypoint() -> int:
|
|
|
67
67
|
wh_cls, wh_args, wh_kwargs = udf_info["warehouse_clone_params"]
|
|
68
68
|
warehouse: AbstractWarehouse = wh_cls(*wh_args, **wh_kwargs)
|
|
69
69
|
|
|
70
|
-
total_rows = next(
|
|
71
|
-
warehouse.db.execute(
|
|
72
|
-
query.with_only_columns(func.count(query.c.sys__id)).order_by(None)
|
|
73
|
-
)
|
|
74
|
-
)[0]
|
|
75
|
-
|
|
76
70
|
with contextlib.closing(
|
|
77
71
|
batching(warehouse.dataset_select_paginated, query, ids_only=True)
|
|
78
72
|
) as udf_inputs:
|
|
79
73
|
download_cb = get_download_callback()
|
|
80
74
|
processed_cb = get_processed_callback()
|
|
75
|
+
generated_cb = get_generated_callback(is_generator)
|
|
81
76
|
try:
|
|
82
77
|
dispatch.run_udf_parallel(
|
|
83
78
|
udf_inputs,
|
|
84
|
-
|
|
79
|
+
rows_total=rows_total,
|
|
85
80
|
n_workers=n_workers,
|
|
86
|
-
processed_cb=processed_cb,
|
|
87
81
|
download_cb=download_cb,
|
|
82
|
+
processed_cb=processed_cb,
|
|
83
|
+
generated_cb=generated_cb,
|
|
88
84
|
)
|
|
89
85
|
finally:
|
|
90
86
|
download_cb.close()
|
|
91
87
|
processed_cb.close()
|
|
88
|
+
generated_cb.close()
|
|
92
89
|
|
|
93
90
|
return 0
|
|
94
91
|
|
|
95
92
|
|
|
96
93
|
def udf_worker_entrypoint() -> int:
|
|
97
|
-
return
|
|
94
|
+
return get_udf_distributor_class().run_worker()
|
|
98
95
|
|
|
99
96
|
|
|
100
97
|
class UDFDispatcher:
|
|
@@ -134,7 +131,6 @@ class UDFDispatcher:
|
|
|
134
131
|
self.done_queue,
|
|
135
132
|
self.query,
|
|
136
133
|
self.table,
|
|
137
|
-
self.is_generator,
|
|
138
134
|
self.is_batching,
|
|
139
135
|
self.cache,
|
|
140
136
|
self.udf_fields,
|
|
@@ -158,20 +154,18 @@ class UDFDispatcher:
|
|
|
158
154
|
for _ in range(n_workers):
|
|
159
155
|
put_into_queue(task_queue, STOP_SIGNAL)
|
|
160
156
|
|
|
161
|
-
def create_input_queue(self):
|
|
162
|
-
return self.ctx.Queue()
|
|
163
|
-
|
|
164
157
|
def run_udf_parallel( # noqa: C901, PLR0912
|
|
165
158
|
self,
|
|
166
159
|
input_rows: Iterable[RowsOutput],
|
|
167
|
-
|
|
160
|
+
rows_total: int,
|
|
168
161
|
n_workers: Optional[int] = None,
|
|
169
|
-
processed_cb: Callback = DEFAULT_CALLBACK,
|
|
170
162
|
download_cb: Callback = DEFAULT_CALLBACK,
|
|
163
|
+
processed_cb: Callback = DEFAULT_CALLBACK,
|
|
164
|
+
generated_cb: Callback = DEFAULT_CALLBACK,
|
|
171
165
|
) -> None:
|
|
172
166
|
n_workers = get_n_workers_from_arg(n_workers)
|
|
173
167
|
|
|
174
|
-
input_batch_size =
|
|
168
|
+
input_batch_size = rows_total // n_workers
|
|
175
169
|
if input_batch_size == 0:
|
|
176
170
|
input_batch_size = 1
|
|
177
171
|
elif input_batch_size > DEFAULT_BATCH_SIZE:
|
|
@@ -220,6 +214,8 @@ class UDFDispatcher:
|
|
|
220
214
|
download_cb.relative_update(downloaded)
|
|
221
215
|
if processed := result.get("processed"):
|
|
222
216
|
processed_cb.relative_update(processed)
|
|
217
|
+
if generated := result.get("generated"):
|
|
218
|
+
generated_cb.relative_update(generated)
|
|
223
219
|
|
|
224
220
|
status = result["status"]
|
|
225
221
|
if status in (OK_STATUS, NOTIFY_STATUS):
|
|
@@ -266,46 +262,61 @@ class UDFDispatcher:
|
|
|
266
262
|
p.join()
|
|
267
263
|
|
|
268
264
|
|
|
269
|
-
class
|
|
270
|
-
def __init__(self, queue: "multiprocess.Queue"):
|
|
265
|
+
class DownloadCallback(Callback):
|
|
266
|
+
def __init__(self, queue: "multiprocess.Queue") -> None:
|
|
271
267
|
self.queue = queue
|
|
272
268
|
super().__init__()
|
|
273
269
|
|
|
274
270
|
def relative_update(self, inc: int = 1) -> None:
|
|
271
|
+
# This callback is used to notify the size of the downloaded files
|
|
272
|
+
pass
|
|
273
|
+
|
|
274
|
+
def increment_file_count(self, inc: int = 1) -> None:
|
|
275
275
|
put_into_queue(self.queue, {"status": NOTIFY_STATUS, "downloaded": inc})
|
|
276
276
|
|
|
277
277
|
|
|
278
278
|
class ProcessedCallback(Callback):
|
|
279
|
-
def __init__(
|
|
280
|
-
self
|
|
279
|
+
def __init__(
|
|
280
|
+
self,
|
|
281
|
+
name: Literal["processed", "generated"],
|
|
282
|
+
queue: "multiprocess.Queue",
|
|
283
|
+
) -> None:
|
|
284
|
+
self.name = name
|
|
285
|
+
self.queue = queue
|
|
281
286
|
super().__init__()
|
|
282
287
|
|
|
283
288
|
def relative_update(self, inc: int = 1) -> None:
|
|
284
|
-
self.
|
|
289
|
+
put_into_queue(self.queue, {"status": NOTIFY_STATUS, self.name: inc})
|
|
285
290
|
|
|
286
291
|
|
|
287
|
-
@attrs.define
|
|
288
292
|
class UDFWorker:
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
293
|
+
def __init__(
|
|
294
|
+
self,
|
|
295
|
+
catalog: "Catalog",
|
|
296
|
+
udf: "UDFAdapter",
|
|
297
|
+
task_queue: "multiprocess.Queue",
|
|
298
|
+
done_queue: "multiprocess.Queue",
|
|
299
|
+
query: "Select",
|
|
300
|
+
table: "Table",
|
|
301
|
+
is_batching: bool,
|
|
302
|
+
cache: bool,
|
|
303
|
+
udf_fields: Sequence[str],
|
|
304
|
+
) -> None:
|
|
305
|
+
self.catalog = catalog
|
|
306
|
+
self.udf = udf
|
|
307
|
+
self.task_queue = task_queue
|
|
308
|
+
self.done_queue = done_queue
|
|
309
|
+
self.query = query
|
|
310
|
+
self.table = table
|
|
311
|
+
self.is_batching = is_batching
|
|
312
|
+
self.cache = cache
|
|
313
|
+
self.udf_fields = udf_fields
|
|
314
|
+
|
|
315
|
+
self.download_cb = DownloadCallback(self.done_queue)
|
|
316
|
+
self.processed_cb = ProcessedCallback("processed", self.done_queue)
|
|
317
|
+
self.generated_cb = ProcessedCallback("generated", self.done_queue)
|
|
304
318
|
|
|
305
319
|
def run(self) -> None:
|
|
306
|
-
processed_cb = ProcessedCallback()
|
|
307
|
-
generated_cb = get_generated_callback(self.is_generator)
|
|
308
|
-
|
|
309
320
|
prefetch = self.udf.prefetch
|
|
310
321
|
with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
|
|
311
322
|
catalog = clone_catalog_with_cache(self.catalog, _cache)
|
|
@@ -314,29 +325,22 @@ class UDFWorker:
|
|
|
314
325
|
self.get_inputs(),
|
|
315
326
|
catalog,
|
|
316
327
|
self.cache,
|
|
317
|
-
download_cb=self.
|
|
318
|
-
processed_cb=processed_cb,
|
|
328
|
+
download_cb=self.download_cb,
|
|
329
|
+
processed_cb=self.processed_cb,
|
|
319
330
|
)
|
|
320
331
|
with safe_closing(udf_results):
|
|
321
332
|
process_udf_outputs(
|
|
322
333
|
catalog.warehouse,
|
|
323
334
|
self.table,
|
|
324
|
-
self.notify_and_process(udf_results
|
|
335
|
+
self.notify_and_process(udf_results),
|
|
325
336
|
self.udf,
|
|
326
|
-
cb=generated_cb,
|
|
337
|
+
cb=self.generated_cb,
|
|
327
338
|
)
|
|
339
|
+
put_into_queue(self.done_queue, {"status": FINISHED_STATUS})
|
|
328
340
|
|
|
329
|
-
|
|
330
|
-
self.done_queue,
|
|
331
|
-
{"status": FINISHED_STATUS, "processed": processed_cb.processed_rows},
|
|
332
|
-
)
|
|
333
|
-
|
|
334
|
-
def notify_and_process(self, udf_results, processed_cb):
|
|
341
|
+
def notify_and_process(self, udf_results):
|
|
335
342
|
for row in udf_results:
|
|
336
|
-
put_into_queue(
|
|
337
|
-
self.done_queue,
|
|
338
|
-
{"status": OK_STATUS, "processed": processed_cb.processed_rows},
|
|
339
|
-
)
|
|
343
|
+
put_into_queue(self.done_queue, {"status": OK_STATUS})
|
|
340
344
|
yield row
|
|
341
345
|
|
|
342
346
|
def get_inputs(self):
|
|
@@ -100,6 +100,10 @@ class Session:
|
|
|
100
100
|
def get_temp_prefix(self) -> str:
|
|
101
101
|
return f"{self.DATASET_PREFIX}{self.name}_"
|
|
102
102
|
|
|
103
|
+
@classmethod
|
|
104
|
+
def is_temp_dataset(cls, name) -> bool:
|
|
105
|
+
return name.startswith(cls.DATASET_PREFIX)
|
|
106
|
+
|
|
103
107
|
def _cleanup_temp_datasets(self) -> None:
|
|
104
108
|
prefix = self.get_temp_prefix()
|
|
105
109
|
try:
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import TYPE_CHECKING, Any, Callable, Optional, TypedDict, Union
|
|
3
|
+
|
|
4
|
+
if TYPE_CHECKING:
|
|
5
|
+
from sqlalchemy import Select, Table
|
|
6
|
+
|
|
7
|
+
from datachain.catalog import Catalog
|
|
8
|
+
from datachain.query.batch import BatchingStrategy
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class UdfInfo(TypedDict):
|
|
12
|
+
udf_data: bytes
|
|
13
|
+
catalog_init: dict[str, Any]
|
|
14
|
+
metastore_clone_params: tuple[Callable[..., Any], list[Any], dict[str, Any]]
|
|
15
|
+
warehouse_clone_params: tuple[Callable[..., Any], list[Any], dict[str, Any]]
|
|
16
|
+
table: "Table"
|
|
17
|
+
query: "Select"
|
|
18
|
+
udf_fields: list[str]
|
|
19
|
+
batching: "BatchingStrategy"
|
|
20
|
+
processes: Optional[int]
|
|
21
|
+
is_generator: bool
|
|
22
|
+
cache: bool
|
|
23
|
+
rows_total: int
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AbstractUDFDistributor(ABC):
|
|
27
|
+
@abstractmethod
|
|
28
|
+
def __init__(
|
|
29
|
+
self,
|
|
30
|
+
catalog: "Catalog",
|
|
31
|
+
table: "Table",
|
|
32
|
+
query: "Select",
|
|
33
|
+
udf_data: bytes,
|
|
34
|
+
batching: "BatchingStrategy",
|
|
35
|
+
workers: Union[bool, int],
|
|
36
|
+
processes: Union[bool, int],
|
|
37
|
+
udf_fields: list[str],
|
|
38
|
+
rows_total: int,
|
|
39
|
+
use_cache: bool,
|
|
40
|
+
is_generator: bool = False,
|
|
41
|
+
min_task_size: Optional[Union[str, int]] = None,
|
|
42
|
+
) -> None: ...
|
|
43
|
+
|
|
44
|
+
@abstractmethod
|
|
45
|
+
def __call__(self) -> None: ...
|
|
46
|
+
|
|
47
|
+
@staticmethod
|
|
48
|
+
@abstractmethod
|
|
49
|
+
def run_worker() -> int: ...
|
|
@@ -286,15 +286,41 @@ def retry_with_backoff(retries=5, backoff_sec=1, errors=(Exception,)):
|
|
|
286
286
|
return retry
|
|
287
287
|
|
|
288
288
|
|
|
289
|
-
def
|
|
289
|
+
def determine_workers(
|
|
290
|
+
workers: Union[bool, int],
|
|
291
|
+
rows_total: Optional[int] = None,
|
|
292
|
+
) -> Union[bool, int]:
|
|
293
|
+
"""Determine the number of workers to use for distributed processing."""
|
|
294
|
+
if rows_total is not None and rows_total <= 1:
|
|
295
|
+
# Disable distributed processing if there is no rows or only one row.
|
|
296
|
+
return False
|
|
297
|
+
if (
|
|
298
|
+
workers is False
|
|
299
|
+
and os.environ.get("DATACHAIN_DISTRIBUTED")
|
|
300
|
+
and os.environ.get("DATACHAIN_SETTINGS_WORKERS")
|
|
301
|
+
):
|
|
302
|
+
# Enable distributed processing by default if the module is available,
|
|
303
|
+
# and a default number of workers is provided.
|
|
304
|
+
workers = int(os.environ["DATACHAIN_SETTINGS_WORKERS"])
|
|
305
|
+
if not workers or workers <= 0:
|
|
306
|
+
return False
|
|
307
|
+
return workers
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def determine_processes(
|
|
311
|
+
parallel: Optional[Union[bool, int]] = None,
|
|
312
|
+
rows_total: Optional[int] = None,
|
|
313
|
+
) -> Union[bool, int]:
|
|
314
|
+
"""Determine the number of processes to use for parallel processing."""
|
|
315
|
+
if rows_total is not None and rows_total <= 1:
|
|
316
|
+
# Disable parallel processing if there is no rows or only one row.
|
|
317
|
+
return False
|
|
290
318
|
if parallel is None and os.environ.get("DATACHAIN_SETTINGS_PARALLEL") is not None:
|
|
291
319
|
parallel = int(os.environ["DATACHAIN_SETTINGS_PARALLEL"])
|
|
292
|
-
if parallel is None or parallel is False:
|
|
320
|
+
if parallel is None or parallel is False or parallel == 0:
|
|
293
321
|
return False
|
|
294
322
|
if parallel is True:
|
|
295
323
|
return True
|
|
296
|
-
if parallel == 0:
|
|
297
|
-
return False
|
|
298
324
|
if parallel < 0:
|
|
299
325
|
return True
|
|
300
326
|
return parallel
|
|
@@ -538,6 +538,17 @@ def test_show(capsys, test_session):
|
|
|
538
538
|
assert f"{i} {first_name[i]}" in normalized_output
|
|
539
539
|
|
|
540
540
|
|
|
541
|
+
def test_show_without_temp_datasets(capsys, test_session):
|
|
542
|
+
dc.read_values(
|
|
543
|
+
key=[1, 2, 3, 4], session=test_session
|
|
544
|
+
).save() # creates temp dataset
|
|
545
|
+
dc.datasets().show()
|
|
546
|
+
captured = capsys.readouterr()
|
|
547
|
+
normalized_output = re.sub(r"\s+", " ", captured.out)
|
|
548
|
+
print(normalized_output)
|
|
549
|
+
assert "Empty result" in normalized_output
|
|
550
|
+
|
|
551
|
+
|
|
541
552
|
def test_class_method_deprecated(capsys, test_session):
|
|
542
553
|
with pytest.warns(DeprecationWarning):
|
|
543
554
|
dc.DataChain.from_values(key=["a", "b", "c"], session=test_session)
|
|
@@ -4,7 +4,7 @@ import math
|
|
|
4
4
|
import os
|
|
5
5
|
import re
|
|
6
6
|
from collections.abc import Generator, Iterator
|
|
7
|
-
from unittest.mock import ANY
|
|
7
|
+
from unittest.mock import ANY, patch
|
|
8
8
|
|
|
9
9
|
import numpy as np
|
|
10
10
|
import pandas as pd
|
|
@@ -26,6 +26,7 @@ from datachain.lib.signal_schema import (
|
|
|
26
26
|
SignalResolvingTypeError,
|
|
27
27
|
SignalSchema,
|
|
28
28
|
)
|
|
29
|
+
from datachain.lib.udf import UDFAdapter
|
|
29
30
|
from datachain.lib.udf_signature import UdfSignatureError
|
|
30
31
|
from datachain.lib.utils import DataChainColumnError, DataChainParamsError
|
|
31
32
|
from datachain.sql.types import Float, Int64, String
|
|
@@ -270,6 +271,17 @@ def test_read_record_empty_chain_without_schema(test_session):
|
|
|
270
271
|
)
|
|
271
272
|
|
|
272
273
|
|
|
274
|
+
def test_empty_chain_skip_udf_run(test_session):
|
|
275
|
+
# Test that UDF is not called for empty chain
|
|
276
|
+
with patch.object(UDFAdapter, "run") as mock_udf_run:
|
|
277
|
+
(
|
|
278
|
+
dc.read_records([], schema={"val": int}, session=test_session)
|
|
279
|
+
.map(lambda val: val * 2, params="val", output={"val2": int})
|
|
280
|
+
.exec()
|
|
281
|
+
)
|
|
282
|
+
mock_udf_run.assert_not_called()
|
|
283
|
+
|
|
284
|
+
|
|
273
285
|
def test_datasets(test_session):
|
|
274
286
|
ds = dc.datasets(session=test_session)
|
|
275
287
|
datasets = [d for d in ds.collect("dataset") if d.name == "fibonacci"]
|