datachain 0.30.4__tar.gz → 0.30.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.30.4 → datachain-0.30.6}/PKG-INFO +3 -3
- {datachain-0.30.4 → datachain-0.30.6}/docs/guide/delta.md +20 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/get_started/json-csv-reader.py +8 -6
- datachain-0.30.6/examples/get_started/nested_datamodel.py +70 -0
- {datachain-0.30.4 → datachain-0.30.6}/pyproject.toml +2 -2
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/commands/datasets.py +32 -17
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/data_storage/warehouse.py +2 -2
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/delta.py +36 -20
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/arrow.py +2 -2
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/datachain.py +17 -7
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/datasets.py +4 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/storage.py +5 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/model_store.py +12 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/query/dispatch.py +5 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/sqlite/base.py +12 -11
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/sqlite/types.py +8 -13
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/types.py +3 -3
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/utils.py +1 -1
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain.egg-info/PKG-INFO +3 -3
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain.egg-info/SOURCES.txt +2 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain.egg-info/requires.txt +2 -2
- {datachain-0.30.4 → datachain-0.30.6}/tests/conftest.py +3 -5
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/functions/test_array.py +82 -4
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_data_storage.py +2 -2
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_delta.py +88 -33
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_retry.py +40 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_datachain.py +36 -0
- datachain-0.30.6/tests/unit/sql/sqlite/test_types.py +40 -0
- datachain-0.30.6/tests/unit/test_cli_datasets.py +64 -0
- datachain-0.30.4/tests/unit/sql/sqlite/test_types.py +0 -19
- {datachain-0.30.4 → datachain-0.30.6}/.cruft.json +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.gitattributes +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.github/codecov.yaml +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.github/dependabot.yml +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.github/workflows/release.yml +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.github/workflows/tests.yml +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.gitignore +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/.pre-commit-config.yaml +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/LICENSE +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/README.rst +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/assets/datachain.svg +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/commands/auth/login.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/commands/auth/logout.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/commands/auth/team.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/commands/auth/token.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/commands/index.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/commands/job/cancel.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/commands/job/clusters.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/commands/job/logs.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/commands/job/ls.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/commands/job/run.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/contributing.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/examples.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/guide/db_migrations.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/guide/env.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/guide/index.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/guide/namespaces.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/guide/processing.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/guide/remotes.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/guide/retry.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/index.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/overrides/main.html +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/quick-start.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/data-types/file.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/data-types/index.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/data-types/pose.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/data-types/segment.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/datachain.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/func.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/functions/aggregate.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/functions/array.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/functions/conditional.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/functions/numeric.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/functions/path.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/functions/random.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/functions/string.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/functions/window.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/index.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/toolkit.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/torch.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/references/udf.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/docs/tutorials.md +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/multimodal/wds.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/mkdocs.yml +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/noxfile.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/setup.cfg +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/__main__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/asyn.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cache.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/cli/utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/client/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/client/azure.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/client/gcs.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/client/hf.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/client/local.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/client/s3.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/config.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/dataset.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/error.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/fs/reference.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/fs/utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/func/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/func/array.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/func/base.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/func/conditional.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/func/func.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/func/numeric.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/func/path.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/func/random.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/func/string.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/func/window.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/job.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/audio.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/clip.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/file.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/hf.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/image.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/listing.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/projects.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/settings.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/tar.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/text.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/udf.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/video.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/listing.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/model/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/model/bbox.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/model/pose.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/model/segment.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/model/utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/namespace.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/node.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/progress.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/project.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/py.typed +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/query/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/query/batch.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/query/dataset.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/query/metrics.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/query/params.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/query/queue.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/query/schema.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/query/session.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/query/udf.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/query/utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/remote/studio.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/script_meta.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/semver.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/postgresql_dialect.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/postgresql_types.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/sql/utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/studio.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/telemetry.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/data.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/examples/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/examples/test_examples.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/examples/wds_data.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/data/lena.jpg +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/functions/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/functions/test_path.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/functions/test_random.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/functions/test_string.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/model/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_audio.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_batching.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_catalog.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_client.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_datachain.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_datasets.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_file.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_hf.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_image.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_listing.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_ls.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_metastore.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_metrics.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_mutate.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_pull.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_pytorch.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_query.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_read_database.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_session.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_studio_datetime_parsing.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_to_database.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_toolkit.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_video.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/func/test_warehouse.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/scripts/feature_class.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/test_atomicity.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/test_cli_e2e.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/test_cli_studio.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/test_import_time.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/test_query_e2e.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/test_telemetry.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_audio.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_settings.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/model/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_asyn.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_cache.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_catalog.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_client.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_config.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_dataset.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_func.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_listing.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_metastore.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_query.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_query_params.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_semver.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_serializer.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_session.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_utils.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.30.4 → datachain-0.30.6}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.30.4
|
|
3
|
+
Version: 0.30.6
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -22,6 +22,7 @@ Requires-Dist: tomlkit
|
|
|
22
22
|
Requires-Dist: tqdm
|
|
23
23
|
Requires-Dist: numpy<3,>=1
|
|
24
24
|
Requires-Dist: pandas>=2.0.0
|
|
25
|
+
Requires-Dist: ujson>=5.10.0
|
|
25
26
|
Requires-Dist: packaging
|
|
26
27
|
Requires-Dist: pyarrow
|
|
27
28
|
Requires-Dist: typing-extensions
|
|
@@ -38,7 +39,6 @@ Requires-Dist: shtab<2,>=1.3.4
|
|
|
38
39
|
Requires-Dist: sqlalchemy>=2
|
|
39
40
|
Requires-Dist: multiprocess==0.70.16
|
|
40
41
|
Requires-Dist: cloudpickle
|
|
41
|
-
Requires-Dist: orjson>=3.10.5
|
|
42
42
|
Requires-Dist: pydantic
|
|
43
43
|
Requires-Dist: jmespath>=1.0
|
|
44
44
|
Requires-Dist: datamodel-code-generator>=0.25
|
|
@@ -92,7 +92,7 @@ Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
|
92
92
|
Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
|
|
93
93
|
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
94
94
|
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
95
|
-
Requires-Dist: pytest-
|
|
95
|
+
Requires-Dist: pytest-dotenv; extra == "tests"
|
|
96
96
|
Requires-Dist: virtualenv; extra == "tests"
|
|
97
97
|
Requires-Dist: dulwich; extra == "tests"
|
|
98
98
|
Requires-Dist: hypothesis; extra == "tests"
|
|
@@ -80,3 +80,23 @@ Delta processing can be combined with [retry processing](./retry.md) to create a
|
|
|
80
80
|
|
|
81
81
|
1. Processes only new or changed records (delta)
|
|
82
82
|
2. Reprocesses records with errors or that are missing (retry)
|
|
83
|
+
|
|
84
|
+
## Using Delta with Restricted Methods
|
|
85
|
+
|
|
86
|
+
By default, delta updates cannot be combined with the following methods:
|
|
87
|
+
|
|
88
|
+
1. `merge`
|
|
89
|
+
2. `union`
|
|
90
|
+
3. `distinct`
|
|
91
|
+
4. `agg`
|
|
92
|
+
5. `group_by`
|
|
93
|
+
|
|
94
|
+
These methods are restricted because they may produce **unexpected results** when used with delta processing. Delta runs the chain only on a subset of rows (new and changed records), while methods like `distinct`, `agg`, or `group_by` are designed to operate on the entire dataset.
|
|
95
|
+
|
|
96
|
+
Similarly, combining delta with methods like `merge` or `union` may result in duplicated rows when merging with a static dataset.
|
|
97
|
+
|
|
98
|
+
If you still need to use these methods together with delta, you can override this restriction by setting the additional flag:
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
delta_unsafe=True
|
|
102
|
+
```
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import os
|
|
1
2
|
from typing import Optional
|
|
2
3
|
|
|
3
4
|
import datachain as dc
|
|
@@ -39,7 +40,7 @@ def main():
|
|
|
39
40
|
uri = "gs://datachain-demo/coco2017/annotations_captions/"
|
|
40
41
|
|
|
41
42
|
# Print JSON schema in Pydantic format from main COCO annotation
|
|
42
|
-
chain = dc.read_storage(uri, anon=
|
|
43
|
+
chain = dc.read_storage(uri, anon=True).filter(dc.C("file.path").glob("*.json"))
|
|
43
44
|
file = chain.limit(1).to_values("file")[0]
|
|
44
45
|
print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))
|
|
45
46
|
|
|
@@ -65,11 +66,12 @@ def main():
|
|
|
65
66
|
dynamic_csv_ds.print_schema()
|
|
66
67
|
dynamic_csv_ds.show()
|
|
67
68
|
|
|
68
|
-
print(
|
|
69
|
-
"Note: script might hang at the end due to https://github.com/apache/arrow/issues/43497"
|
|
70
|
-
)
|
|
71
|
-
print("Just press Ctrl+C to exit.")
|
|
72
|
-
|
|
73
69
|
|
|
74
70
|
if __name__ == "__main__":
|
|
75
71
|
main()
|
|
72
|
+
|
|
73
|
+
# Force exit without cleanup to avoid hanging due to arrow issue
|
|
74
|
+
print(
|
|
75
|
+
"Note: script might warn about leaked semaphore at the end due to https://github.com/apache/arrow/issues/43497"
|
|
76
|
+
)
|
|
77
|
+
os._exit(0)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Example: Nested DataModels with parallel execution.
|
|
2
|
+
|
|
3
|
+
Demonstrates mapping a function that returns a nested DataModel (a DataModel
|
|
4
|
+
containing other DataModels).
|
|
5
|
+
|
|
6
|
+
The example keeps things minimal: we persist a tiny dataset, run a parallel map
|
|
7
|
+
that returns a nested DataModel, and display the result.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
from pydantic import Field
|
|
13
|
+
|
|
14
|
+
import datachain as dc
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Metric(dc.DataModel):
|
|
18
|
+
"""Represents a single computed metric with quality metadata."""
|
|
19
|
+
|
|
20
|
+
value: Optional[float] = Field(default=None, description="Computed metric value")
|
|
21
|
+
confidence: Optional[float] = Field(
|
|
22
|
+
default=None, description="Confidence / quality score"
|
|
23
|
+
)
|
|
24
|
+
status: Optional[str] = Field(default=None, description="Processing status label")
|
|
25
|
+
metric_error: Optional[str] = Field(
|
|
26
|
+
default=None, description="Error message if metric computation failed"
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SampleMetrics(dc.DataModel):
|
|
31
|
+
"""Container for two illustrative nested metrics.
|
|
32
|
+
|
|
33
|
+
Each sub-field is its own DataModel instance to demonstrate nested schemas
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
metric_primary: Metric = Field(
|
|
37
|
+
default_factory=lambda: Metric(), description="Primary metric"
|
|
38
|
+
)
|
|
39
|
+
metric_secondary: Metric = Field(
|
|
40
|
+
default_factory=lambda: Metric(), description="Secondary metric"
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def generate_sample_metrics() -> SampleMetrics:
|
|
45
|
+
"""Synthesize a pair of metrics.
|
|
46
|
+
|
|
47
|
+
In real scenarios you'd compute these values; here we just return constants
|
|
48
|
+
to keep the example deterministic.
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
return SampleMetrics(
|
|
52
|
+
metric_primary=Metric(value=50.0, confidence=0.95, status="ok"),
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def main():
|
|
57
|
+
(
|
|
58
|
+
dc.read_values(record_id=[1, 2])
|
|
59
|
+
.settings(parallel=2) # Keep it parallel to test serialization
|
|
60
|
+
.map(metrics=generate_sample_metrics)
|
|
61
|
+
.save("nested_datamodel")
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
dc.read_dataset("nested_datamodel").show()
|
|
65
|
+
|
|
66
|
+
print(dc.read_dataset("nested_datamodel").to_values("metrics"))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
if __name__ == "__main__":
|
|
70
|
+
main()
|
|
@@ -26,6 +26,7 @@ dependencies = [
|
|
|
26
26
|
"tqdm",
|
|
27
27
|
"numpy>=1,<3",
|
|
28
28
|
"pandas>=2.0.0",
|
|
29
|
+
"ujson>=5.10.0",
|
|
29
30
|
"packaging",
|
|
30
31
|
"pyarrow",
|
|
31
32
|
"typing-extensions",
|
|
@@ -42,7 +43,6 @@ dependencies = [
|
|
|
42
43
|
"sqlalchemy>=2",
|
|
43
44
|
"multiprocess==0.70.16",
|
|
44
45
|
"cloudpickle",
|
|
45
|
-
"orjson>=3.10.5",
|
|
46
46
|
"pydantic",
|
|
47
47
|
"jmespath>=1.0",
|
|
48
48
|
"datamodel-code-generator>=0.25",
|
|
@@ -108,7 +108,7 @@ tests = [
|
|
|
108
108
|
"pytest-servers[all]>=0.5.9",
|
|
109
109
|
"pytest-benchmark[histogram]",
|
|
110
110
|
"pytest-xdist>=3.3.1",
|
|
111
|
-
"pytest-
|
|
111
|
+
"pytest-dotenv",
|
|
112
112
|
"virtualenv",
|
|
113
113
|
"dulwich",
|
|
114
114
|
"hypothesis",
|
|
@@ -1,30 +1,41 @@
|
|
|
1
1
|
import sys
|
|
2
|
-
from
|
|
2
|
+
from collections.abc import Iterable, Iterator
|
|
3
|
+
from typing import TYPE_CHECKING, Optional, Union
|
|
3
4
|
|
|
4
5
|
from tabulate import tabulate
|
|
5
6
|
|
|
6
|
-
|
|
7
|
-
from datachain.catalog import Catalog
|
|
8
|
-
|
|
7
|
+
from datachain import semver
|
|
9
8
|
from datachain.catalog import is_namespace_local
|
|
10
9
|
from datachain.cli.utils import determine_flavors
|
|
11
10
|
from datachain.config import Config
|
|
12
11
|
from datachain.error import DataChainError, DatasetNotFoundError
|
|
13
12
|
from datachain.studio import list_datasets as list_datasets_studio
|
|
14
13
|
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from datachain.catalog import Catalog
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def group_dataset_versions(
|
|
19
|
+
datasets: Iterable[tuple[str, str]], latest_only=True
|
|
20
|
+
) -> dict[str, Union[str, list[str]]]:
|
|
21
|
+
grouped: dict[str, list[tuple[int, int, int]]] = {}
|
|
15
22
|
|
|
16
|
-
def group_dataset_versions(datasets, latest_only=True):
|
|
17
|
-
grouped = {}
|
|
18
23
|
# Sort to ensure groupby works as expected
|
|
19
24
|
# (groupby expects consecutive items with the same key)
|
|
20
25
|
for name, version in sorted(datasets):
|
|
21
|
-
grouped.setdefault(name, []).append(version)
|
|
26
|
+
grouped.setdefault(name, []).append(semver.parse(version))
|
|
22
27
|
|
|
23
28
|
if latest_only:
|
|
24
29
|
# For each dataset name, pick the highest version.
|
|
25
|
-
return {
|
|
30
|
+
return {
|
|
31
|
+
name: semver.create(*(max(versions))) for name, versions in grouped.items()
|
|
32
|
+
}
|
|
33
|
+
|
|
26
34
|
# For each dataset name, return a sorted list of unique versions.
|
|
27
|
-
return {
|
|
35
|
+
return {
|
|
36
|
+
name: [semver.create(*v) for v in sorted(set(versions))]
|
|
37
|
+
for name, versions in grouped.items()
|
|
38
|
+
}
|
|
28
39
|
|
|
29
40
|
|
|
30
41
|
def list_datasets(
|
|
@@ -35,7 +46,7 @@ def list_datasets(
|
|
|
35
46
|
team: Optional[str] = None,
|
|
36
47
|
latest_only: bool = True,
|
|
37
48
|
name: Optional[str] = None,
|
|
38
|
-
):
|
|
49
|
+
) -> None:
|
|
39
50
|
token = Config().read().get("studio", {}).get("token")
|
|
40
51
|
all, local, studio = determine_flavors(studio, local, all, token)
|
|
41
52
|
if name:
|
|
@@ -95,27 +106,31 @@ def list_datasets(
|
|
|
95
106
|
print(tabulate(rows, headers="keys"))
|
|
96
107
|
|
|
97
108
|
|
|
98
|
-
def list_datasets_local(
|
|
109
|
+
def list_datasets_local(
|
|
110
|
+
catalog: "Catalog", name: Optional[str] = None
|
|
111
|
+
) -> Iterator[tuple[str, str]]:
|
|
99
112
|
if name:
|
|
100
113
|
yield from list_datasets_local_versions(catalog, name)
|
|
101
114
|
return
|
|
102
115
|
|
|
103
116
|
for d in catalog.ls_datasets():
|
|
104
117
|
for v in d.versions:
|
|
105
|
-
yield
|
|
118
|
+
yield d.full_name, v.version
|
|
106
119
|
|
|
107
120
|
|
|
108
|
-
def list_datasets_local_versions(
|
|
121
|
+
def list_datasets_local_versions(
|
|
122
|
+
catalog: "Catalog", name: str
|
|
123
|
+
) -> Iterator[tuple[str, str]]:
|
|
109
124
|
namespace_name, project_name, name = catalog.get_full_dataset_name(name)
|
|
110
125
|
|
|
111
126
|
ds = catalog.get_dataset(
|
|
112
127
|
name, namespace_name=namespace_name, project_name=project_name
|
|
113
128
|
)
|
|
114
129
|
for v in ds.versions:
|
|
115
|
-
yield
|
|
130
|
+
yield name, v.version
|
|
116
131
|
|
|
117
132
|
|
|
118
|
-
def _datasets_tabulate_row(name, both, local_version, studio_version):
|
|
133
|
+
def _datasets_tabulate_row(name, both, local_version, studio_version) -> dict[str, str]:
|
|
119
134
|
row = {
|
|
120
135
|
"Name": name,
|
|
121
136
|
}
|
|
@@ -136,7 +151,7 @@ def rm_dataset(
|
|
|
136
151
|
force: Optional[bool] = False,
|
|
137
152
|
studio: Optional[bool] = False,
|
|
138
153
|
team: Optional[str] = None,
|
|
139
|
-
):
|
|
154
|
+
) -> None:
|
|
140
155
|
namespace_name, project_name, name = catalog.get_full_dataset_name(name)
|
|
141
156
|
|
|
142
157
|
if studio:
|
|
@@ -166,7 +181,7 @@ def edit_dataset(
|
|
|
166
181
|
description: Optional[str] = None,
|
|
167
182
|
attrs: Optional[list[str]] = None,
|
|
168
183
|
team: Optional[str] = None,
|
|
169
|
-
):
|
|
184
|
+
) -> None:
|
|
170
185
|
from datachain.lib.dc.utils import is_studio
|
|
171
186
|
|
|
172
187
|
namespace_name, project_name, name = catalog.get_full_dataset_name(name)
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import glob
|
|
2
|
-
import json
|
|
3
2
|
import logging
|
|
4
3
|
import posixpath
|
|
5
4
|
import random
|
|
@@ -11,6 +10,7 @@ from urllib.parse import urlparse
|
|
|
11
10
|
|
|
12
11
|
import attrs
|
|
13
12
|
import sqlalchemy as sa
|
|
13
|
+
import ujson as json
|
|
14
14
|
from sqlalchemy.sql.expression import true
|
|
15
15
|
|
|
16
16
|
from datachain.client import Client
|
|
@@ -122,7 +122,7 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
122
122
|
if value_type is str:
|
|
123
123
|
return val
|
|
124
124
|
if value_type in (dict, list):
|
|
125
|
-
return json.dumps(val)
|
|
125
|
+
return json.dumps(val, ensure_ascii=False)
|
|
126
126
|
raise ValueError(
|
|
127
127
|
f"Cannot convert value {val!r} with type {value_type} to JSON"
|
|
128
128
|
)
|
|
@@ -4,7 +4,7 @@ from functools import wraps
|
|
|
4
4
|
from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
|
|
5
5
|
|
|
6
6
|
import datachain
|
|
7
|
-
from datachain.dataset import DatasetDependency
|
|
7
|
+
from datachain.dataset import DatasetDependency, DatasetRecord
|
|
8
8
|
from datachain.error import DatasetNotFoundError
|
|
9
9
|
from datachain.project import Project
|
|
10
10
|
|
|
@@ -30,9 +30,10 @@ def delta_disabled(
|
|
|
30
30
|
|
|
31
31
|
@wraps(method)
|
|
32
32
|
def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T:
|
|
33
|
-
if self.delta:
|
|
33
|
+
if self.delta and not self._delta_unsafe:
|
|
34
34
|
raise NotImplementedError(
|
|
35
|
-
f"
|
|
35
|
+
f"Cannot use {method.__name__} with delta datasets - may cause"
|
|
36
|
+
" inconsistency. Use delta_unsafe flag to allow this operation."
|
|
36
37
|
)
|
|
37
38
|
return method(self, *args, **kwargs)
|
|
38
39
|
|
|
@@ -124,10 +125,19 @@ def _get_retry_chain(
|
|
|
124
125
|
# Subtract also diff chain since some items might be picked
|
|
125
126
|
# up by `delta=True` itself (e.g. records got modified AND are missing in the
|
|
126
127
|
# result dataset atm)
|
|
127
|
-
|
|
128
|
+
on = [on] if isinstance(on, str) else on
|
|
129
|
+
|
|
130
|
+
return (
|
|
131
|
+
retry_chain.diff(
|
|
132
|
+
diff_chain, on=on, added=True, same=True, modified=False, deleted=False
|
|
133
|
+
).distinct(*on)
|
|
134
|
+
if retry_chain
|
|
135
|
+
else None
|
|
136
|
+
)
|
|
128
137
|
|
|
129
138
|
|
|
130
139
|
def _get_source_info(
|
|
140
|
+
source_ds: DatasetRecord,
|
|
131
141
|
name: str,
|
|
132
142
|
namespace_name: str,
|
|
133
143
|
project_name: str,
|
|
@@ -154,25 +164,23 @@ def _get_source_info(
|
|
|
154
164
|
indirect=False,
|
|
155
165
|
)
|
|
156
166
|
|
|
157
|
-
|
|
158
|
-
if not
|
|
167
|
+
source_ds_dep = next((d for d in dependencies if d.name == source_ds.name), None)
|
|
168
|
+
if not source_ds_dep:
|
|
159
169
|
# Starting dataset was removed, back off to normal dataset creation
|
|
160
170
|
return None, None, None, None, None
|
|
161
171
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
project_name=source_ds_project.name,
|
|
169
|
-
).latest_version
|
|
172
|
+
# Refresh starting dataset to have new versions if they are created
|
|
173
|
+
source_ds = catalog.get_dataset(
|
|
174
|
+
source_ds.name,
|
|
175
|
+
namespace_name=source_ds.project.namespace.name,
|
|
176
|
+
project_name=source_ds.project.name,
|
|
177
|
+
)
|
|
170
178
|
|
|
171
179
|
return (
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
180
|
+
source_ds.name,
|
|
181
|
+
source_ds.project,
|
|
182
|
+
source_ds_dep.version,
|
|
183
|
+
source_ds.latest_version,
|
|
176
184
|
dependencies,
|
|
177
185
|
)
|
|
178
186
|
|
|
@@ -244,7 +252,14 @@ def delta_retry_update(
|
|
|
244
252
|
source_ds_version,
|
|
245
253
|
source_ds_latest_version,
|
|
246
254
|
dependencies,
|
|
247
|
-
) = _get_source_info(
|
|
255
|
+
) = _get_source_info(
|
|
256
|
+
dc._query.starting_step.dataset, # type: ignore[union-attr]
|
|
257
|
+
name,
|
|
258
|
+
namespace_name,
|
|
259
|
+
project_name,
|
|
260
|
+
latest_version,
|
|
261
|
+
catalog,
|
|
262
|
+
)
|
|
248
263
|
|
|
249
264
|
# If source_ds_name is None, starting dataset was removed
|
|
250
265
|
if source_ds_name is None:
|
|
@@ -267,8 +282,9 @@ def delta_retry_update(
|
|
|
267
282
|
if dependencies:
|
|
268
283
|
dependencies = copy(dependencies)
|
|
269
284
|
dependencies = [d for d in dependencies if d is not None]
|
|
285
|
+
source_ds_dep = next(d for d in dependencies if d.name == source_ds_name)
|
|
270
286
|
# Update to latest version
|
|
271
|
-
|
|
287
|
+
source_ds_dep.version = source_ds_latest_version # type: ignore[union-attr]
|
|
272
288
|
|
|
273
289
|
# Handle retry functionality if enabled
|
|
274
290
|
if delta_retry:
|
|
@@ -2,8 +2,8 @@ from collections.abc import Sequence
|
|
|
2
2
|
from itertools import islice
|
|
3
3
|
from typing import TYPE_CHECKING, Any, Optional
|
|
4
4
|
|
|
5
|
-
import orjson
|
|
6
5
|
import pyarrow as pa
|
|
6
|
+
import ujson as json
|
|
7
7
|
from pyarrow._csv import ParseOptions
|
|
8
8
|
from pyarrow.dataset import CsvFileFormat, dataset
|
|
9
9
|
from tqdm.auto import tqdm
|
|
@@ -269,7 +269,7 @@ def _get_hf_schema(
|
|
|
269
269
|
def _get_datachain_schema(schema: "pa.Schema") -> Optional[SignalSchema]:
|
|
270
270
|
"""Return a restored SignalSchema from parquet metadata, if any is found."""
|
|
271
271
|
if schema.metadata and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in schema.metadata:
|
|
272
|
-
serialized_signal_schema = orjson.loads(
|
|
272
|
+
serialized_signal_schema = json.loads(
|
|
273
273
|
schema.metadata[DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY]
|
|
274
274
|
)
|
|
275
275
|
return SignalSchema.deserialize(serialized_signal_schema)
|
|
@@ -19,8 +19,8 @@ from typing import (
|
|
|
19
19
|
overload,
|
|
20
20
|
)
|
|
21
21
|
|
|
22
|
-
import orjson
|
|
23
22
|
import sqlalchemy
|
|
23
|
+
import ujson as json
|
|
24
24
|
from pydantic import BaseModel
|
|
25
25
|
from sqlalchemy.sql.elements import ColumnElement
|
|
26
26
|
from tqdm import tqdm
|
|
@@ -193,6 +193,7 @@ class DataChain:
|
|
|
193
193
|
self._setup: dict = setup or {}
|
|
194
194
|
self._sys = _sys
|
|
195
195
|
self._delta = False
|
|
196
|
+
self._delta_unsafe = False
|
|
196
197
|
self._delta_on: Optional[Union[str, Sequence[str]]] = None
|
|
197
198
|
self._delta_result_on: Optional[Union[str, Sequence[str]]] = None
|
|
198
199
|
self._delta_compare: Optional[Union[str, Sequence[str]]] = None
|
|
@@ -216,6 +217,7 @@ class DataChain:
|
|
|
216
217
|
right_on: Optional[Union[str, Sequence[str]]] = None,
|
|
217
218
|
compare: Optional[Union[str, Sequence[str]]] = None,
|
|
218
219
|
delta_retry: Optional[Union[bool, str]] = None,
|
|
220
|
+
delta_unsafe: bool = False,
|
|
219
221
|
) -> "Self":
|
|
220
222
|
"""Marks this chain as delta, which means special delta process will be
|
|
221
223
|
called on saving dataset for optimization"""
|
|
@@ -226,6 +228,7 @@ class DataChain:
|
|
|
226
228
|
self._delta_result_on = right_on
|
|
227
229
|
self._delta_compare = compare
|
|
228
230
|
self._delta_retry = delta_retry
|
|
231
|
+
self._delta_unsafe = delta_unsafe
|
|
229
232
|
return self
|
|
230
233
|
|
|
231
234
|
@property
|
|
@@ -238,6 +241,10 @@ class DataChain:
|
|
|
238
241
|
"""Returns True if this chain is ran in "delta" update mode"""
|
|
239
242
|
return self._delta
|
|
240
243
|
|
|
244
|
+
@property
|
|
245
|
+
def delta_unsafe(self) -> bool:
|
|
246
|
+
return self._delta_unsafe
|
|
247
|
+
|
|
241
248
|
@property
|
|
242
249
|
def schema(self) -> dict[str, DataType]:
|
|
243
250
|
"""Get schema of the chain."""
|
|
@@ -328,6 +335,7 @@ class DataChain:
|
|
|
328
335
|
right_on=self._delta_result_on,
|
|
329
336
|
compare=self._delta_compare,
|
|
330
337
|
delta_retry=self._delta_retry,
|
|
338
|
+
delta_unsafe=self._delta_unsafe,
|
|
331
339
|
)
|
|
332
340
|
|
|
333
341
|
return chain
|
|
@@ -462,8 +470,6 @@ class DataChain:
|
|
|
462
470
|
Returns:
|
|
463
471
|
DataChain: A new DataChain instance with the new set of columns.
|
|
464
472
|
"""
|
|
465
|
-
import json
|
|
466
|
-
|
|
467
473
|
import pyarrow as pa
|
|
468
474
|
|
|
469
475
|
from datachain.lib.arrow import schema_to_output
|
|
@@ -2129,9 +2135,9 @@ class DataChain:
|
|
|
2129
2135
|
fsspec_fs = client.create_fs(**fs_kwargs)
|
|
2130
2136
|
|
|
2131
2137
|
_partition_cols = list(partition_cols) if partition_cols else None
|
|
2132
|
-
signal_schema_metadata = orjson.dumps(
|
|
2133
|
-
self._effective_signals_schema.serialize()
|
|
2134
|
-
)
|
|
2138
|
+
signal_schema_metadata = json.dumps(
|
|
2139
|
+
self._effective_signals_schema.serialize(), ensure_ascii=False
|
|
2140
|
+
).encode("utf-8")
|
|
2135
2141
|
|
|
2136
2142
|
column_names, column_chunks = self.to_columnar_data_with_names(chunk_size)
|
|
2137
2143
|
|
|
@@ -2278,7 +2284,11 @@ class DataChain:
|
|
|
2278
2284
|
f.write(b"\n")
|
|
2279
2285
|
else:
|
|
2280
2286
|
is_first = False
|
|
2281
|
-
f.write(
|
|
2287
|
+
f.write(
|
|
2288
|
+
json.dumps(
|
|
2289
|
+
row_to_nested_dict(headers, row), ensure_ascii=False
|
|
2290
|
+
).encode("utf-8")
|
|
2291
|
+
)
|
|
2282
2292
|
if include_outer_list:
|
|
2283
2293
|
# This makes the file JSON instead of JSON lines.
|
|
2284
2294
|
f.write(b"\n]\n")
|
|
@@ -40,6 +40,7 @@ def read_dataset(
|
|
|
40
40
|
delta_result_on: Optional[Union[str, Sequence[str]]] = None,
|
|
41
41
|
delta_compare: Optional[Union[str, Sequence[str]]] = None,
|
|
42
42
|
delta_retry: Optional[Union[bool, str]] = None,
|
|
43
|
+
delta_unsafe: bool = False,
|
|
43
44
|
update: bool = False,
|
|
44
45
|
) -> "DataChain":
|
|
45
46
|
"""Get data from a saved Dataset. It returns the chain itself.
|
|
@@ -80,6 +81,8 @@ def read_dataset(
|
|
|
80
81
|
update: If True always checks for newer versions available on Studio, even if
|
|
81
82
|
some version of the dataset exists locally already. If False (default), it
|
|
82
83
|
will only fetch the dataset from Studio if it is not found locally.
|
|
84
|
+
delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
|
|
85
|
+
distinct.
|
|
83
86
|
|
|
84
87
|
|
|
85
88
|
Example:
|
|
@@ -205,6 +208,7 @@ def read_dataset(
|
|
|
205
208
|
right_on=delta_result_on,
|
|
206
209
|
compare=delta_compare,
|
|
207
210
|
delta_retry=delta_retry,
|
|
211
|
+
delta_unsafe=delta_unsafe,
|
|
208
212
|
)
|
|
209
213
|
|
|
210
214
|
return chain
|
|
@@ -43,6 +43,7 @@ def read_storage(
|
|
|
43
43
|
delta_result_on: Optional[Union[str, Sequence[str]]] = None,
|
|
44
44
|
delta_compare: Optional[Union[str, Sequence[str]]] = None,
|
|
45
45
|
delta_retry: Optional[Union[bool, str]] = None,
|
|
46
|
+
delta_unsafe: bool = False,
|
|
46
47
|
client_config: Optional[dict] = None,
|
|
47
48
|
) -> "DataChain":
|
|
48
49
|
"""Get data from storage(s) as a list of file with all file attributes.
|
|
@@ -77,6 +78,9 @@ def read_storage(
|
|
|
77
78
|
(error mode)
|
|
78
79
|
- True: Reprocess records missing from the result dataset (missing mode)
|
|
79
80
|
- None: No retry processing (default)
|
|
81
|
+
delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
|
|
82
|
+
distinct. Caller must ensure datasets are consistent and not partially
|
|
83
|
+
updated.
|
|
80
84
|
|
|
81
85
|
Returns:
|
|
82
86
|
DataChain: A DataChain object containing the file information.
|
|
@@ -218,6 +222,7 @@ def read_storage(
|
|
|
218
222
|
right_on=delta_result_on,
|
|
219
223
|
compare=delta_compare,
|
|
220
224
|
delta_retry=delta_retry,
|
|
225
|
+
delta_unsafe=delta_unsafe,
|
|
221
226
|
)
|
|
222
227
|
|
|
223
228
|
return storage_chain
|
|
@@ -89,3 +89,15 @@ class ModelStore:
|
|
|
89
89
|
and ModelStore.is_pydantic(parent_type)
|
|
90
90
|
and "@" in ModelStore.get_name(parent_type)
|
|
91
91
|
)
|
|
92
|
+
|
|
93
|
+
@classmethod
|
|
94
|
+
def rebuild_all(cls) -> None:
|
|
95
|
+
"""Ensure pydantic schemas are (re)built for all registered models.
|
|
96
|
+
|
|
97
|
+
Uses ``force=True`` to avoid subtle cases where a deserialized class
|
|
98
|
+
(e.g. from by-value cloudpickle in workers) reports built state but
|
|
99
|
+
nested model field schemas aren't fully resolved yet.
|
|
100
|
+
"""
|
|
101
|
+
for versions in cls.store.values():
|
|
102
|
+
for model in versions.values():
|
|
103
|
+
model.model_rebuild(force=True)
|
|
@@ -13,6 +13,7 @@ from multiprocess import get_context
|
|
|
13
13
|
from datachain.catalog import Catalog
|
|
14
14
|
from datachain.catalog.catalog import clone_catalog_with_cache
|
|
15
15
|
from datachain.catalog.loader import DISTRIBUTED_IMPORT_PATH, get_udf_distributor_class
|
|
16
|
+
from datachain.lib.model_store import ModelStore
|
|
16
17
|
from datachain.lib.udf import _get_cache
|
|
17
18
|
from datachain.query.dataset import (
|
|
18
19
|
get_download_callback,
|
|
@@ -130,6 +131,8 @@ class UDFDispatcher:
|
|
|
130
131
|
|
|
131
132
|
def _create_worker(self) -> "UDFWorker":
|
|
132
133
|
udf: UDFAdapter = loads(self.udf_data)
|
|
134
|
+
# Ensure all registered DataModels have rebuilt schemas in worker processes.
|
|
135
|
+
ModelStore.rebuild_all()
|
|
133
136
|
return UDFWorker(
|
|
134
137
|
self.catalog,
|
|
135
138
|
udf,
|
|
@@ -196,6 +199,8 @@ class UDFDispatcher:
|
|
|
196
199
|
generated_cb: Callback = DEFAULT_CALLBACK,
|
|
197
200
|
) -> None:
|
|
198
201
|
udf: UDFAdapter = loads(self.udf_data)
|
|
202
|
+
# Rebuild schemas in single process too for consistency (cheap, idempotent).
|
|
203
|
+
ModelStore.rebuild_all()
|
|
199
204
|
|
|
200
205
|
if ids_only and not self.is_batching:
|
|
201
206
|
input_rows = flatten(input_rows)
|