datachain 0.5.1__tar.gz → 0.6.0__tar.gz
This diff shows the content changes between publicly released versions of the package as they appear in the public registry; it is provided for informational purposes only.
- {datachain-0.5.1 → datachain-0.6.0}/.pre-commit-config.yaml +2 -2
- {datachain-0.5.1/src/datachain.egg-info → datachain-0.6.0}/PKG-INFO +1 -1
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/values_to_tuples.py +2 -2
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/data_model.py +1 -1
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/dc.py +18 -2
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/signal_schema.py +6 -6
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/udf.py +177 -151
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/batch.py +0 -10
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/dataset.py +1 -1
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/dispatch.py +0 -12
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/string.py +12 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/sqlite/base.py +10 -5
- {datachain-0.5.1 → datachain-0.6.0/src/datachain.egg-info}/PKG-INFO +1 -1
- datachain-0.6.0/tests/scripts/feature_class_exception.py +11 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_datachain.py +11 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_string.py +15 -0
- datachain-0.5.1/tests/scripts/feature_class_exception.py +0 -24
- {datachain-0.5.1 → datachain-0.6.0}/.cruft.json +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.gitattributes +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.github/codecov.yaml +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.github/dependabot.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.github/workflows/release.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/.gitignore +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/CONTRIBUTING.rst +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/LICENSE +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/README.rst +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/assets/flowchart.png +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/index.md +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/references/datachain.md +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/references/datatype.md +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/references/file.md +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/references/index.md +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/references/sql.md +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/references/torch.md +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/docs/references/udf.md +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/mkdocs.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/noxfile.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/overrides/main.html +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/pyproject.toml +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/setup.cfg +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/__main__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/asyn.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/cache.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/cli.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/cli_utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/local.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/config.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/dataset.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/error.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/job.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/file.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/listing.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/node.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/progress.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/py.typed +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/params.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/session.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/storage.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain/utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/conftest.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/data.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/examples/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_client.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_datachain.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_listing.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_ls.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_pull.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_query.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/test_atomicity.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/test_telemetry.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_client.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_query.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_session.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_storage.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.5.1 → datachain-0.6.0}/tests/utils.py +0 -0
{datachain-0.5.1 → datachain-0.6.0}/.pre-commit-config.yaml

@@ -4,7 +4,7 @@ ci:
   skip: [mypy]
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev:
+    rev: v5.0.0
     hooks:
       - id: check-added-large-files
         exclude: '^tests/examples/data/'
@@ -24,7 +24,7 @@ repos:
       - id: trailing-whitespace
         exclude: '^LICENSES/'
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.6.
+    rev: 'v0.6.9'
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
{datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/values_to_tuples.py

@@ -4,7 +4,7 @@ from typing import Any, Union
 from datachain.lib.data_model import (
     DataType,
     DataTypeNames,
-
+    DataValue,
     is_chain_type,
 )
 from datachain.lib.utils import DataChainParamsError
@@ -20,7 +20,7 @@ class ValuesToTupleError(DataChainParamsError):
 def values_to_tuples(  # noqa: C901, PLR0912
     ds_name: str = "",
     output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
-    **fr_map: Sequence[
+    **fr_map: Sequence[DataValue],
 ) -> tuple[Any, Any, Any]:
     if output:
         if not isinstance(output, (Sequence, str, dict)):
{datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/data_model.py

@@ -18,7 +18,7 @@ StandardType = Union[
 ]
 DataType = Union[type[BaseModel], StandardType]
 DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
-
+DataValue = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]


 class DataModel(BaseModel):
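Note: the new DataValue alias is the value-level counterpart of DataType, a union of instances rather than of types. A minimal illustration (the describe helper is hypothetical, not part of the package):

    from datetime import datetime

    from datachain.lib.data_model import DataValue

    def describe(value: DataValue) -> str:
        # Any concrete value a signal can hold at runtime fits DataValue:
        # a BaseModel instance, int, str, float, bool, list, dict, bytes, datetime.
        return type(value).__name__

    print(describe(42), describe("cat.jpg"), describe(datetime.now()))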
{datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/dc.py

@@ -62,6 +62,7 @@ from datachain.telemetry import telemetry
 from datachain.utils import batched_it, inside_notebook

 if TYPE_CHECKING:
+    from pyarrow import DataType as ArrowDataType
     from typing_extensions import Concatenate, ParamSpec, Self

     from datachain.lib.hf import HFDatasetType
@@ -1024,7 +1025,7 @@ class DataChain:
         The supported functions:
            Numerical:   +, -, *, /, rand(), avg(), count(), func(),
                         greatest(), least(), max(), min(), sum()
-           String:      length(), split()
+           String:      length(), split(), replace(), regexp_replace()
           Filename:    name(), parent(), file_stem(), file_ext()
           Array:       length(), sip_hash_64(), euclidean_distance(),
                        cosine_distance()
@@ -1709,6 +1710,7 @@ class DataChain:
         nrows=None,
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
+        column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from csv files.
@@ -1727,6 +1729,9 @@ class DataChain:
             nrows : Optional row limit.
             session : Session to use for the chain.
             settings : Settings to use for the chain.
+            column_types : Dictionary of column names and their corresponding types.
+                It is passed to CSV reader and for each column specified type auto
+                inference is disabled.

         Example:
             Reading a csv file:
@@ -1742,6 +1747,15 @@ class DataChain:
         from pandas.io.parsers.readers import STR_NA_VALUES
         from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
         from pyarrow.dataset import CsvFileFormat
+        from pyarrow.lib import type_for_alias
+
+        if column_types:
+            column_types = {
+                name: type_for_alias(typ) if isinstance(typ, str) else typ
+                for name, typ in column_types.items()
+            }
+        else:
+            column_types = {}

         chain = DataChain.from_storage(
             path, session=session, settings=settings, **kwargs
@@ -1767,7 +1781,9 @@ class DataChain:
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
         convert_options = ConvertOptions(
-            strings_can_be_null=True,
+            strings_can_be_null=True,
+            null_values=STR_NA_VALUES,
+            column_types=column_types,
         )
         format = CsvFileFormat(
             parse_options=parse_options,
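For illustration, a hedged sketch of the new column_types parameter (the bucket path and column names are hypothetical). String values are resolved through pyarrow.lib.type_for_alias as the hunk above shows, pyarrow DataType objects pass through unchanged, and the listed columns skip type auto-inference:

    import pyarrow as pa

    from datachain.lib.dc import DataChain

    chain = DataChain.from_csv(
        "gs://example-bucket/data.csv",  # hypothetical CSV location
        column_types={
            "id": "int64",          # pyarrow type alias given as a string
            "score": pa.float32(),  # or a pyarrow DataType directly
        },
    )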
{datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/signal_schema.py

@@ -25,7 +25,7 @@ from typing_extensions import Literal as LiteralEx
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.sql_to_python import sql_to_python
 from datachain.lib.convert.unflatten import unflatten_to_json_pos
-from datachain.lib.data_model import DataModel, DataType
+from datachain.lib.data_model import DataModel, DataType, DataValue
 from datachain.lib.file import File
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import DataChainParamsError
@@ -110,7 +110,7 @@ class SignalSchema:
     values: dict[str, DataType]
     tree: dict[str, Any]
     setup_func: dict[str, Callable]
-    setup_values: Optional[dict[str,
+    setup_values: Optional[dict[str, Any]]

     def __init__(
         self,
@@ -333,21 +333,21 @@ class SignalSchema:
             res[db_name] = python_to_sql(type_)
         return res

-    def row_to_objs(self, row: Sequence[Any]) -> list[
+    def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]:
         self._init_setup_values()

-        objs = []
+        objs: list[DataValue] = []
         pos = 0
         for name, fr_type in self.values.items():
             if self.setup_values and (val := self.setup_values.get(name, None)):
                 objs.append(val)
             elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
                 j, pos = unflatten_to_json_pos(fr, row, pos)
-                objs.append(fr(**j))
+                objs.append(fr(**j))
             else:
                 objs.append(row[pos])
                 pos += 1
-        return objs
+        return objs

     def contains_file(self) -> bool:
         for type_ in self.values.values():
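For context, row_to_objs rebuilds typed Python objects from a flat database row: a plain-typed signal consumes one column, while a Pydantic-model signal consumes as many columns as its flattened fields. A simplified standalone sketch of that behavior (the FileStub model and the fixed two-signal layout are hypothetical; the real method iterates self.values and delegates to unflatten_to_json_pos):

    from pydantic import BaseModel

    class FileStub(BaseModel):
        path: str
        size: int

    def row_to_objs(row):
        # hypothetical schema: ("name": str, "file": FileStub)
        objs = []
        objs.append(row[0])  # plain value: one column
        objs.append(FileStub(path=row[1], size=row[2]))  # model: two columns
        return objs

    print(row_to_objs(("cats.csv", "data/cats.csv", 1024)))
    # ['cats.csv', FileStub(path='data/cats.csv', size=1024)]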
{datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/udf.py

@@ -1,14 +1,15 @@
 import sys
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
-from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Callable, Optional

+import attrs
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from pydantic import BaseModel

 from datachain.dataset import RowDict
 from datachain.lib.convert.flatten import flatten
+from datachain.lib.data_model import DataValue
 from datachain.lib.file import File
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
@@ -18,16 +19,14 @@ from datachain.query.batch import (
     NoBatching,
     Partition,
     RowsOutputBatch,
-    UDFInputBatch,
 )
-from datachain.query.schema import ColumnParameter, UDFParameter

 if TYPE_CHECKING:
     from typing_extensions import Self

     from datachain.catalog import Catalog
     from datachain.lib.udf_signature import UdfSignature
-    from datachain.query.batch import RowsOutput
+    from datachain.query.batch import RowsOutput


 class UdfError(DataChainParamsError):
@@ -45,11 +44,21 @@ UDFOutputSpec = Mapping[str, ColumnType]
 UDFResult = dict[str, Any]


-@
+@attrs.define
 class UDFProperties:
-
+    udf: "UDFAdapter"

-
+    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
+        return self.udf.get_batching(use_partitioning)
+
+    @property
+    def batch(self):
+        return self.udf.batch
+
+
+@attrs.define(slots=False)
+class UDFAdapter:
+    inner: "UDFBase"
     output: UDFOutputSpec
     batch: int = 1
@@ -62,20 +71,10 @@ class UDFProperties:
             return Batch(self.batch)
         raise ValueError(f"invalid batch size {self.batch}")

-
-
-
-
-class UDFAdapter:
-    def __init__(
-        self,
-        inner: "UDFBase",
-        properties: UDFProperties,
-    ):
-        self.inner = inner
-        self.properties = properties
-        self.signal_names = properties.signal_names()
-        self.output = properties.output
+    @property
+    def properties(self):
+        # For backwards compatibility.
+        return UDFProperties(self)

     def run(
         self,
@@ -87,72 +86,14 @@ class UDFAdapter:
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable[UDFResult]]:
-        self.inner.
-
-
-
-
-
-
-                [RowDict(zip(udf_fields, row)) for row in batch.rows]
-            )
-        else:
-            n_rows = 1
-            inputs = RowDict(zip(udf_fields, batch))
-        output = self.run_once(catalog, inputs, is_generator, cache, cb=download_cb)
-        processed_cb.relative_update(n_rows)
-        yield output
-
-        if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
-            self.inner.teardown()
-
-    def run_once(
-        self,
-        catalog: "Catalog",
-        arg: "UDFInput",
-        is_generator: bool = False,
-        cache: bool = False,
-        cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterable[UDFResult]:
-        if isinstance(arg, UDFInputBatch):
-            udf_inputs = [
-                self.bind_parameters(catalog, row, cache=cache, cb=cb)
-                for row in arg.rows
-            ]
-            udf_outputs = self.inner.run_once(udf_inputs, cache=cache, download_cb=cb)
-            return self._process_results(arg.rows, udf_outputs, is_generator)
-        if isinstance(arg, RowDict):
-            udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
-            udf_outputs = self.inner.run_once(udf_inputs, cache=cache, download_cb=cb)
-            if not is_generator:
-                # udf_outputs is generator already if is_generator=True
-                udf_outputs = [udf_outputs]
-            return self._process_results([arg], udf_outputs, is_generator)
-        raise ValueError(f"Unexpected UDF argument: {arg}")
-
-    def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
-        return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]
-
-    def _process_results(
-        self,
-        rows: Sequence["RowDict"],
-        results: Sequence[Sequence[Any]],
-        is_generator=False,
-    ) -> Iterable[UDFResult]:
-        """Create a list of dictionaries representing UDF results."""
-
-        # outputting rows
-        if is_generator:
-            # each row in results is a tuple of column values
-            return (dict(zip(self.signal_names, row)) for row in results)
-
-        # outputting signals
-        row_ids = [row["sys__id"] for row in rows]
-        return [
-            {"sys__id": row_id} | dict(zip(self.signal_names, signals))
-            for row_id, signals in zip(row_ids, results)
-        ]
+        yield from self.inner.run(
+            udf_fields,
+            udf_inputs,
+            catalog,
+            cache,
+            download_cb,
+            processed_cb,
+        )


 class UDFBase(AbstractUDF):
@@ -203,17 +144,12 @@ class UDFBase(AbstractUDF):
     ```
     """

-    is_input_batched = False
     is_output_batched = False
-    is_input_grouped = False
-    params_spec: Optional[list[str]]
     catalog: "Optional[Catalog]"

     def __init__(self):
-        self.params = None
+        self.params: Optional[SignalSchema] = None
         self.output = None
-        self.params_spec = None
-        self.output_spec = None
         self.catalog = None
         self._func = None
@@ -241,11 +177,6 @@ class UDFBase(AbstractUDF):
     ):
         self.params = params
         self.output = sign.output_schema
-
-        params_spec = self.params.to_udf_spec()
-        self.params_spec = list(params_spec.keys())
-        self.output_spec = self.output.to_udf_spec()
-
         self._func = func

     @classmethod
@@ -273,48 +204,27 @@ class UDFBase(AbstractUDF):
     def name(self):
         return self.__class__.__name__

+    @property
+    def signal_names(self) -> Iterable[str]:
+        return self.output.to_udf_spec().keys()
+
     def to_udf_wrapper(self, batch: int = 1) -> UDFAdapter:
-
-
-
+        return UDFAdapter(
+            self,
+            self.output.to_udf_spec(),
+            batch,
         )
-        return UDFAdapter(self, properties)
-
-    def validate_results(self, results, *args, **kwargs):
-        return results

-    def
-
-
-
-
-
-
-
-
-
-
-        # Generator expression is required, otherwise the value will be materialized
-        res = (self._flatten_row(row) for row in result_objs)
-
-        if not self.is_output_batched:
-            res = list(res)
-            assert (
-                len(res) == 1
-            ), f"{self.name} returns {len(res)} rows while it's not batched"
-            if isinstance(res[0], tuple):
-                res = res[0]
-        elif (
-            self.is_input_batched
-            and self.is_output_batched
-            and not self.is_input_grouped
-        ):
-            res = list(res)
-            assert len(res) == len(
-                rows
-            ), f"{self.name} returns {len(res)} rows while {len(rows)} expected"
-
-        return res
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Any]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        raise NotImplementedError

     def _flatten_row(self, row):
         if len(self.output.values) > 1 and not isinstance(row, BaseModel):
@@ -328,17 +238,28 @@ class UDFBase(AbstractUDF):
     def _obj_to_list(obj):
         return flatten(obj) if isinstance(obj, BaseModel) else [obj]

-    def
-
-
-
-
-
-
-
-
-
+    def _parse_row(
+        self, row_dict: RowDict, cache: bool, download_cb: Callback
+    ) -> list[DataValue]:
+        assert self.params
+        row = [row_dict[p] for p in self.params.to_udf_spec()]
+        obj_row = self.params.row_to_objs(row)
+        for obj in obj_row:
+            if isinstance(obj, File):
+                assert self.catalog is not None
+                obj._set_stream(
+                    self.catalog, caching_enabled=cache, download_cb=download_cb
+                )
+        return obj_row
+
+    def _prepare_row(self, row, udf_fields, cache, download_cb):
+        row_dict = RowDict(zip(udf_fields, row))
+        return self._parse_row(row_dict, cache, download_cb)
+
+    def _prepare_row_and_id(self, row, udf_fields, cache, download_cb):
+        row_dict = RowDict(zip(udf_fields, row))
+        udf_input = self._parse_row(row_dict, cache, download_cb)
+        return row_dict["sys__id"], *udf_input

     def process_safe(self, obj_rows):
         try:
@@ -358,23 +279,128 @@ class UDFBase(AbstractUDF):
 class Mapper(UDFBase):
     """Inherit from this class to pass to `DataChain.map()`."""

+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Sequence[Any]]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for row in udf_inputs:
+            id_, *udf_args = self._prepare_row_and_id(
+                row, udf_fields, cache, download_cb
+            )
+            result_objs = self.process_safe(udf_args)
+            udf_output = self._flatten_row(result_objs)
+            output = [{"sys__id": id_} | dict(zip(self.signal_names, udf_output))]
+            processed_cb.relative_update(1)
+            yield output
+
+        self.teardown()
+

 class BatchMapper(UDFBase):
     """Inherit from this class to pass to `DataChain.batch_map()`."""

-    is_input_batched = True
     is_output_batched = True

+    def run(
+        self,
+        udf_fields: Sequence[str],
+        udf_inputs: Iterable[RowsOutputBatch],
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for batch in udf_inputs:
+            n_rows = len(batch.rows)
+            row_ids, *udf_args = zip(
+                *[
+                    self._prepare_row_and_id(row, udf_fields, cache, download_cb)
+                    for row in batch.rows
+                ]
+            )
+            result_objs = list(self.process_safe(udf_args))
+            n_objs = len(result_objs)
+            assert (
+                n_objs == n_rows
+            ), f"{self.name} returns {n_objs} rows, but {n_rows} were expected"
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = [
+                {"sys__id": row_id} | dict(zip(self.signal_names, signals))
+                for row_id, signals in zip(row_ids, udf_outputs)
+            ]
+            processed_cb.relative_update(n_rows)
+            yield output
+
+        self.teardown()
+

 class Generator(UDFBase):
     """Inherit from this class to pass to `DataChain.gen()`."""

     is_output_batched = True

+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Sequence[Any]]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for row in udf_inputs:
+            udf_args = self._prepare_row(row, udf_fields, cache, download_cb)
+            result_objs = self.process_safe(udf_args)
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
+            processed_cb.relative_update(1)
+            yield output
+
+        self.teardown()
+

 class Aggregator(UDFBase):
     """Inherit from this class to pass to `DataChain.agg()`."""

-    is_input_batched = True
     is_output_batched = True
-
+
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: Iterable[RowsOutputBatch],
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for batch in udf_inputs:
+            udf_args = zip(
+                *[
+                    self._prepare_row(row, udf_fields, cache, download_cb)
+                    for row in batch.rows
+                ]
+            )
+            result_objs = self.process_safe(udf_args)
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
+            processed_cb.relative_update(len(batch.rows))
+            yield output
+
+        self.teardown()
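To illustrate the refactor above: batching, row preparation and result assembly now live in each UDF class's own run() method rather than in UDFAdapter.run_once(), so user code is unaffected; a class-based UDF still just implements process(). A hedged sketch (the NameLen class and its signal are illustrative):

    from datachain.lib.udf import Mapper

    class NameLen(Mapper):
        def process(self, name: str) -> int:
            # Called once per row by Mapper.run(), which now also drives
            # setup(), _prepare_row_and_id(), progress callbacks and teardown().
            return len(name)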
{datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/batch.py

@@ -11,8 +11,6 @@ from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
 if TYPE_CHECKING:
     from sqlalchemy import Select

-    from datachain.dataset import RowDict
-

 @dataclass
 class RowsOutputBatch:
@@ -22,14 +20,6 @@
 RowsOutput = Union[Sequence, RowsOutputBatch]


-@dataclass
-class UDFInputBatch:
-    rows: Sequence["RowDict"]
-
-
-UDFInput = Union["RowDict", UDFInputBatch]
-
-
 class BatchingStrategy(ABC):
     """BatchingStrategy provides means of batching UDF executions."""
{datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/dataset.py

@@ -392,7 +392,7 @@ class UDFStep(Step, ABC):

     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
         use_partitioning = self.partition_by is not None
-        batching = self.udf.
+        batching = self.udf.get_batching(use_partitioning)
         workers = self.workers
         if (
             not workers
{datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/dispatch.py

@@ -114,7 +114,6 @@ class UDFDispatcher:
     catalog: Optional[Catalog] = None
     task_queue: Optional[multiprocess.Queue] = None
     done_queue: Optional[multiprocess.Queue] = None
-    _batch_size: Optional[int] = None

     def __init__(
         self,
@@ -154,17 +153,6 @@ class UDFDispatcher:
         self.done_queue = None
         self.ctx = get_context("spawn")

-    @property
-    def batch_size(self):
-        if self._batch_size is None:
-            if hasattr(self.udf, "properties") and hasattr(
-                self.udf.properties, "batch"
-            ):
-                self._batch_size = self.udf.properties.batch
-            else:
-                self._batch_size = 1
-        return self._batch_size
-
     def _create_worker(self) -> "UDFWorker":
         if not self.catalog:
             id_generator = self.id_generator_class(
{datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/string.py

@@ -37,6 +37,18 @@ class regexp_replace(GenericFunction):  # noqa: N801
     inherit_cache = True


+class replace(GenericFunction):  # noqa: N801
+    """
+    Replaces substring with another string.
+    """
+
+    type = String()
+    package = "string"
+    name = "replace"
+    inherit_cache = True
+
+
 compiler_not_implemented(length)
 compiler_not_implemented(split)
 compiler_not_implemented(regexp_replace)
+compiler_not_implemented(replace)