datachain 0.12.0__tar.gz → 0.13.0__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.12.0 → datachain-0.13.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.12.0 → datachain-0.13.0}/PKG-INFO +1 -1
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/catalog/catalog.py +6 -2
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/ls.py +8 -6
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/show.py +7 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/gcs.py +1 -1
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/s3.py +1 -1
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/metastore.py +6 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/dc.py +36 -7
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/file.py +8 -1
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/meta_formats.py +2 -2
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/signal_schema.py +65 -18
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/udf.py +3 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/udf_signature.py +17 -9
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/dataset.py +4 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/sqlite/base.py +2 -2
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/PKG-INFO +1 -1
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_datachain.py +16 -1
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_datachain.py +49 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_datachain_bootstrap.py +2 -2
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_signal_schema.py +209 -26
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_udf_signature.py +17 -7
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_array.py +7 -2
- {datachain-0.12.0 → datachain-0.13.0}/.cruft.json +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.gitattributes +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.github/codecov.yaml +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.github/dependabot.yml +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.github/workflows/release.yml +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/.gitignore +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/LICENSE +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/README.rst +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/contributing.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/examples.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/index.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/overrides/main.html +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/quick-start.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/datachain.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/func.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/index.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/remotes.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/toolkit.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/torch.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/references/udf.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/docs/tutorials.md +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/mkdocs.yml +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/noxfile.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/pyproject.toml +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/setup.cfg +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/__main__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/asyn.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cache.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/local.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/config.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/dataset.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/error.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/array.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/base.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/func.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/path.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/random.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/string.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/window.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/job.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/listing.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/node.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/progress.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/py.typed +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/params.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/session.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/studio.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain/utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/conftest.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/data.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/examples/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_client.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_file.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_hf.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_image.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_listing.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_ls.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_pull.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_query.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_session.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_video.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/test_atomicity.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/test_cli_studio.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/test_import_time.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/test_telemetry.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_client.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_config.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_func.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_query.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_session.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.12.0 → datachain-0.13.0}/tests/utils.py +0 -0
|
@@ -777,6 +777,8 @@ class Catalog:
|
|
|
777
777
|
validate_version: Optional[bool] = True,
|
|
778
778
|
listing: Optional[bool] = False,
|
|
779
779
|
uuid: Optional[str] = None,
|
|
780
|
+
description: Optional[str] = None,
|
|
781
|
+
labels: Optional[list[str]] = None,
|
|
780
782
|
) -> "DatasetRecord":
|
|
781
783
|
"""
|
|
782
784
|
Creates new dataset of a specific version.
|
|
@@ -803,6 +805,8 @@ class Catalog:
|
|
|
803
805
|
query_script=query_script,
|
|
804
806
|
schema=schema,
|
|
805
807
|
ignore_if_exists=True,
|
|
808
|
+
description=description,
|
|
809
|
+
labels=labels,
|
|
806
810
|
)
|
|
807
811
|
|
|
808
812
|
version = version or default_version
|
|
@@ -1608,7 +1612,7 @@ class Catalog:
|
|
|
1608
1612
|
except TerminationSignal as exc:
|
|
1609
1613
|
signal.signal(signal.SIGTERM, orig_sigterm_handler)
|
|
1610
1614
|
signal.signal(signal.SIGINT, orig_sigint_handler)
|
|
1611
|
-
|
|
1615
|
+
logger.info("Shutting down process %s, received %r", proc.pid, exc)
|
|
1612
1616
|
# Rather than forwarding the signal to the child, we try to shut it down
|
|
1613
1617
|
# gracefully. This is because we consider the script to be interactive
|
|
1614
1618
|
# and special, so we give it time to cleanup before exiting.
|
|
@@ -1623,7 +1627,7 @@ class Catalog:
|
|
|
1623
1627
|
if thread:
|
|
1624
1628
|
thread.join() # wait for the reader thread
|
|
1625
1629
|
|
|
1626
|
-
|
|
1630
|
+
logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
|
|
1627
1631
|
if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
|
|
1628
1632
|
raise QueryScriptCancelError(
|
|
1629
1633
|
"Query script was canceled by user",
|
|
@@ -38,11 +38,12 @@ def ls_local(
|
|
|
38
38
|
):
|
|
39
39
|
from datachain import DataChain
|
|
40
40
|
|
|
41
|
-
if catalog is None:
|
|
42
|
-
from datachain.catalog import get_catalog
|
|
43
|
-
|
|
44
|
-
catalog = get_catalog(client_config=client_config)
|
|
45
41
|
if sources:
|
|
42
|
+
if catalog is None:
|
|
43
|
+
from datachain.catalog import get_catalog
|
|
44
|
+
|
|
45
|
+
catalog = get_catalog(client_config=client_config)
|
|
46
|
+
|
|
46
47
|
actual_sources = list(ls_urls(sources, catalog=catalog, long=long, **kwargs))
|
|
47
48
|
if len(actual_sources) == 1:
|
|
48
49
|
for _, entries in actual_sources:
|
|
@@ -61,8 +62,9 @@ def ls_local(
|
|
|
61
62
|
for entry in entries:
|
|
62
63
|
print(format_ls_entry(entry))
|
|
63
64
|
else:
|
|
64
|
-
|
|
65
|
-
|
|
65
|
+
# Collect results in a list here to prevent interference from `tqdm` and `print`
|
|
66
|
+
listing = list(DataChain.listings().collect("listing"))
|
|
67
|
+
for ls in listing:
|
|
66
68
|
print(format_ls_entry(f"{ls.uri}@v{ls.version}")) # type: ignore[union-attr]
|
|
67
69
|
|
|
68
70
|
|
|
@@ -40,6 +40,13 @@ def show(
|
|
|
40
40
|
.offset(offset)
|
|
41
41
|
)
|
|
42
42
|
records = query.to_db_records()
|
|
43
|
+
print("Name: ", name)
|
|
44
|
+
if dataset.description:
|
|
45
|
+
print("Description: ", dataset.description)
|
|
46
|
+
if dataset.labels:
|
|
47
|
+
print("Labels: ", ",".join(dataset.labels))
|
|
48
|
+
print("\n")
|
|
49
|
+
|
|
43
50
|
show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
|
|
44
51
|
|
|
45
52
|
if schema and dataset_version.feature_schema:
|
|
@@ -30,7 +30,7 @@ class GCSClient(Client):
|
|
|
30
30
|
if kwargs.pop("anon", False):
|
|
31
31
|
kwargs["token"] = "anon" # noqa: S105
|
|
32
32
|
|
|
33
|
-
return cast(GCSFileSystem, super().create_fs(**kwargs))
|
|
33
|
+
return cast("GCSFileSystem", super().create_fs(**kwargs))
|
|
34
34
|
|
|
35
35
|
def url(self, path: str, expires: int = 3600, **kwargs) -> str:
|
|
36
36
|
"""
|
|
@@ -55,7 +55,7 @@ class ClientS3(Client):
|
|
|
55
55
|
except NotImplementedError:
|
|
56
56
|
pass
|
|
57
57
|
|
|
58
|
-
return cast(S3FileSystem, super().create_fs(**kwargs))
|
|
58
|
+
return cast("S3FileSystem", super().create_fs(**kwargs))
|
|
59
59
|
|
|
60
60
|
def url(self, path: str, expires: int = 3600, **kwargs) -> str:
|
|
61
61
|
"""
|
|
@@ -119,6 +119,8 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
119
119
|
query_script: str = "",
|
|
120
120
|
schema: Optional[dict[str, Any]] = None,
|
|
121
121
|
ignore_if_exists: bool = False,
|
|
122
|
+
description: Optional[str] = None,
|
|
123
|
+
labels: Optional[list[str]] = None,
|
|
122
124
|
) -> DatasetRecord:
|
|
123
125
|
"""Creates new dataset."""
|
|
124
126
|
|
|
@@ -518,6 +520,8 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
518
520
|
query_script: str = "",
|
|
519
521
|
schema: Optional[dict[str, Any]] = None,
|
|
520
522
|
ignore_if_exists: bool = False,
|
|
523
|
+
description: Optional[str] = None,
|
|
524
|
+
labels: Optional[list[str]] = None,
|
|
521
525
|
**kwargs, # TODO registered = True / False
|
|
522
526
|
) -> DatasetRecord:
|
|
523
527
|
"""Creates new dataset."""
|
|
@@ -533,6 +537,8 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
533
537
|
sources="\n".join(sources) if sources else "",
|
|
534
538
|
query_script=query_script,
|
|
535
539
|
schema=json.dumps(schema or {}),
|
|
540
|
+
description=description,
|
|
541
|
+
labels=json.dumps(labels or []),
|
|
536
542
|
)
|
|
537
543
|
if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
|
|
538
544
|
# SQLite and PostgreSQL both support 'on_conflict_do_nothing',
|
|
@@ -6,6 +6,7 @@ import sys
|
|
|
6
6
|
from collections.abc import Iterator, Sequence
|
|
7
7
|
from functools import wraps
|
|
8
8
|
from typing import (
|
|
9
|
+
IO,
|
|
9
10
|
TYPE_CHECKING,
|
|
10
11
|
Any,
|
|
11
12
|
BinaryIO,
|
|
@@ -270,6 +271,18 @@ class DataChain:
|
|
|
270
271
|
self._setup: dict = setup or {}
|
|
271
272
|
self._sys = _sys
|
|
272
273
|
|
|
274
|
+
def __repr__(self) -> str:
|
|
275
|
+
"""Return a string representation of the chain."""
|
|
276
|
+
classname = self.__class__.__name__
|
|
277
|
+
if not self._effective_signals_schema.values:
|
|
278
|
+
return f"Empty {classname}"
|
|
279
|
+
|
|
280
|
+
import io
|
|
281
|
+
|
|
282
|
+
file = io.StringIO()
|
|
283
|
+
self.print_schema(file=file)
|
|
284
|
+
return file.getvalue()
|
|
285
|
+
|
|
273
286
|
@property
|
|
274
287
|
def schema(self) -> dict[str, DataType]:
|
|
275
288
|
"""Get schema of the chain."""
|
|
@@ -323,9 +336,9 @@ class DataChain:
|
|
|
323
336
|
"""Return `self.union(other)`."""
|
|
324
337
|
return self.union(other)
|
|
325
338
|
|
|
326
|
-
def print_schema(self) -> None:
|
|
339
|
+
def print_schema(self, file: Optional[IO] = None) -> None:
|
|
327
340
|
"""Print schema of the chain."""
|
|
328
|
-
self._effective_signals_schema.print_tree()
|
|
341
|
+
self._effective_signals_schema.print_tree(file=file)
|
|
329
342
|
|
|
330
343
|
def clone(self) -> "Self":
|
|
331
344
|
"""Make a copy of the chain in a new table."""
|
|
@@ -629,7 +642,8 @@ class DataChain:
|
|
|
629
642
|
model_name=model_name,
|
|
630
643
|
jmespath=jmespath,
|
|
631
644
|
nrows=nrows,
|
|
632
|
-
)
|
|
645
|
+
),
|
|
646
|
+
"params": {"file": File},
|
|
633
647
|
}
|
|
634
648
|
# disable prefetch if nrows is set
|
|
635
649
|
settings = {"prefetch": 0} if nrows else {}
|
|
@@ -773,7 +787,12 @@ class DataChain:
|
|
|
773
787
|
)
|
|
774
788
|
|
|
775
789
|
def save( # type: ignore[override]
|
|
776
|
-
self,
|
|
790
|
+
self,
|
|
791
|
+
name: Optional[str] = None,
|
|
792
|
+
version: Optional[int] = None,
|
|
793
|
+
description: Optional[str] = None,
|
|
794
|
+
labels: Optional[list[str]] = None,
|
|
795
|
+
**kwargs,
|
|
777
796
|
) -> "Self":
|
|
778
797
|
"""Save to a Dataset. It returns the chain itself.
|
|
779
798
|
|
|
@@ -781,11 +800,18 @@ class DataChain:
|
|
|
781
800
|
name : dataset name. Empty name saves to a temporary dataset that will be
|
|
782
801
|
removed after process ends. Temp dataset are useful for optimization.
|
|
783
802
|
version : version of a dataset. Default - the last version that exist.
|
|
803
|
+
description : description of a dataset.
|
|
804
|
+
labels : labels of a dataset.
|
|
784
805
|
"""
|
|
785
806
|
schema = self.signals_schema.clone_without_sys_signals().serialize()
|
|
786
807
|
return self._evolve(
|
|
787
808
|
query=self._query.save(
|
|
788
|
-
name=name,
|
|
809
|
+
name=name,
|
|
810
|
+
version=version,
|
|
811
|
+
description=description,
|
|
812
|
+
labels=labels,
|
|
813
|
+
feature_schema=schema,
|
|
814
|
+
**kwargs,
|
|
789
815
|
)
|
|
790
816
|
)
|
|
791
817
|
|
|
@@ -1003,8 +1029,9 @@ class DataChain:
|
|
|
1003
1029
|
func: Optional[Union[Callable, UDFObjT]],
|
|
1004
1030
|
params: Union[None, str, Sequence[str]],
|
|
1005
1031
|
output: OutputType,
|
|
1006
|
-
signal_map,
|
|
1032
|
+
signal_map: dict[str, Callable],
|
|
1007
1033
|
) -> UDFObjT:
|
|
1034
|
+
is_batch = target_class.is_input_batched
|
|
1008
1035
|
is_generator = target_class.is_output_batched
|
|
1009
1036
|
name = self.name or ""
|
|
1010
1037
|
|
|
@@ -1015,7 +1042,9 @@ class DataChain:
|
|
|
1015
1042
|
if self._sys:
|
|
1016
1043
|
signals_schema = SignalSchema({"sys": Sys}) | signals_schema
|
|
1017
1044
|
|
|
1018
|
-
params_schema = signals_schema.slice(
|
|
1045
|
+
params_schema = signals_schema.slice(
|
|
1046
|
+
sign.params, self._setup, is_batch=is_batch
|
|
1047
|
+
)
|
|
1019
1048
|
|
|
1020
1049
|
return target_class._create(sign, params_schema)
|
|
1021
1050
|
|
|
@@ -193,7 +193,14 @@ class File(DataModel):
|
|
|
193
193
|
"last_modified": DateTime,
|
|
194
194
|
"location": JSON,
|
|
195
195
|
}
|
|
196
|
-
_hidden_fields: ClassVar[list[str]] = [
|
|
196
|
+
_hidden_fields: ClassVar[list[str]] = [
|
|
197
|
+
"source",
|
|
198
|
+
"version",
|
|
199
|
+
"etag",
|
|
200
|
+
"is_latest",
|
|
201
|
+
"last_modified",
|
|
202
|
+
"location",
|
|
203
|
+
]
|
|
197
204
|
|
|
198
205
|
_unique_id_keys: ClassVar[list[str]] = [
|
|
199
206
|
"source",
|
|
@@ -10,7 +10,7 @@ import jmespath as jsp
|
|
|
10
10
|
from pydantic import BaseModel, ConfigDict, Field, ValidationError # noqa: F401
|
|
11
11
|
|
|
12
12
|
from datachain.lib.data_model import DataModel # noqa: F401
|
|
13
|
-
from datachain.lib.file import
|
|
13
|
+
from datachain.lib.file import TextFile
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class UserModel(BaseModel):
|
|
@@ -130,7 +130,7 @@ def read_meta( # noqa: C901
|
|
|
130
130
|
#
|
|
131
131
|
|
|
132
132
|
def parse_data(
|
|
133
|
-
file:
|
|
133
|
+
file: TextFile,
|
|
134
134
|
data_model=spec,
|
|
135
135
|
format=format,
|
|
136
136
|
jmespath=jmespath,
|
|
@@ -5,6 +5,7 @@ from dataclasses import dataclass
|
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
from inspect import isclass
|
|
7
7
|
from typing import ( # noqa: UP035
|
|
8
|
+
IO,
|
|
8
9
|
TYPE_CHECKING,
|
|
9
10
|
Annotated,
|
|
10
11
|
Any,
|
|
@@ -154,9 +155,9 @@ class SignalSchema:
|
|
|
154
155
|
if not callable(func):
|
|
155
156
|
raise SetupError(key, "value must be function or callable class")
|
|
156
157
|
|
|
157
|
-
def _init_setup_values(self):
|
|
158
|
+
def _init_setup_values(self) -> None:
|
|
158
159
|
if self.setup_values is not None:
|
|
159
|
-
return
|
|
160
|
+
return
|
|
160
161
|
|
|
161
162
|
res = {}
|
|
162
163
|
for key, func in self.setup_func.items():
|
|
@@ -398,7 +399,7 @@ class SignalSchema:
|
|
|
398
399
|
return SignalSchema(signals)
|
|
399
400
|
|
|
400
401
|
@staticmethod
|
|
401
|
-
def get_flatten_hidden_fields(schema):
|
|
402
|
+
def get_flatten_hidden_fields(schema: dict):
|
|
402
403
|
custom_types = schema.get("_custom_types", {})
|
|
403
404
|
if not custom_types:
|
|
404
405
|
return []
|
|
@@ -464,19 +465,61 @@ class SignalSchema:
|
|
|
464
465
|
return False
|
|
465
466
|
|
|
466
467
|
def slice(
|
|
467
|
-
self,
|
|
468
|
+
self,
|
|
469
|
+
params: dict[str, Union[DataType, Any]],
|
|
470
|
+
setup: Optional[dict[str, Callable]] = None,
|
|
471
|
+
is_batch: bool = False,
|
|
468
472
|
) -> "SignalSchema":
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
for
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
473
|
+
"""
|
|
474
|
+
Returns new schema that combines current schema and setup signals.
|
|
475
|
+
"""
|
|
476
|
+
setup_params = setup.keys() if setup else []
|
|
477
|
+
schema: dict[str, DataType] = {}
|
|
478
|
+
|
|
479
|
+
for param, param_type in params.items():
|
|
480
|
+
# This is special case for setup params, they are always treated as strings
|
|
481
|
+
if param in setup_params:
|
|
482
|
+
schema[param] = str
|
|
483
|
+
continue
|
|
484
|
+
|
|
485
|
+
schema_type = self._find_in_tree(param.split("."))
|
|
486
|
+
|
|
487
|
+
if param_type is Any:
|
|
488
|
+
schema[param] = schema_type
|
|
489
|
+
continue
|
|
490
|
+
|
|
491
|
+
schema_origin = get_origin(schema_type)
|
|
492
|
+
param_origin = get_origin(param_type)
|
|
493
|
+
|
|
494
|
+
if schema_origin is Union and type(None) in get_args(schema_type):
|
|
495
|
+
schema_type = get_args(schema_type)[0]
|
|
496
|
+
if param_origin is Union and type(None) in get_args(param_type):
|
|
497
|
+
param_type = get_args(param_type)[0]
|
|
498
|
+
|
|
499
|
+
if is_batch:
|
|
500
|
+
if param_type is list:
|
|
501
|
+
schema[param] = schema_type
|
|
502
|
+
continue
|
|
503
|
+
|
|
504
|
+
if param_origin is not list:
|
|
505
|
+
raise SignalResolvingError(param.split("."), "is not a list")
|
|
506
|
+
|
|
507
|
+
param_type = get_args(param_type)[0]
|
|
508
|
+
|
|
509
|
+
if param_type == schema_type or (
|
|
510
|
+
isclass(param_type)
|
|
511
|
+
and isclass(schema_type)
|
|
512
|
+
and issubclass(param_type, File)
|
|
513
|
+
and issubclass(schema_type, File)
|
|
514
|
+
):
|
|
515
|
+
schema[param] = schema_type
|
|
516
|
+
continue
|
|
517
|
+
|
|
518
|
+
raise SignalResolvingError(
|
|
519
|
+
param.split("."),
|
|
520
|
+
f"types mismatch: {param_type} != {schema_type}",
|
|
521
|
+
)
|
|
522
|
+
|
|
480
523
|
return SignalSchema(schema, setup)
|
|
481
524
|
|
|
482
525
|
def row_to_features(
|
|
@@ -696,16 +739,20 @@ class SignalSchema:
|
|
|
696
739
|
substree, new_prefix, depth + 1, include_hidden
|
|
697
740
|
)
|
|
698
741
|
|
|
699
|
-
def print_tree(self, indent: int =
|
|
742
|
+
def print_tree(self, indent: int = 2, start_at: int = 0, file: Optional[IO] = None):
|
|
700
743
|
for path, type_, _, depth in self.get_flat_tree():
|
|
701
744
|
total_indent = start_at + depth * indent
|
|
702
|
-
|
|
745
|
+
col_name = " " * total_indent + path[-1]
|
|
746
|
+
col_type = SignalSchema._type_to_str(type_)
|
|
747
|
+
print(col_name, col_type, sep=": ", file=file)
|
|
703
748
|
|
|
704
749
|
if get_origin(type_) is list:
|
|
705
750
|
args = get_args(type_)
|
|
706
751
|
if len(args) > 0 and ModelStore.is_pydantic(args[0]):
|
|
707
752
|
sub_schema = SignalSchema({"* list of": args[0]})
|
|
708
|
-
sub_schema.print_tree(
|
|
753
|
+
sub_schema.print_tree(
|
|
754
|
+
indent=indent, start_at=total_indent + indent, file=file
|
|
755
|
+
)
|
|
709
756
|
|
|
710
757
|
def get_headers_with_length(self, include_hidden: bool = True):
|
|
711
758
|
paths = [
|
|
@@ -159,6 +159,7 @@ class UDFBase(AbstractUDF):
|
|
|
159
159
|
```
|
|
160
160
|
"""
|
|
161
161
|
|
|
162
|
+
is_input_batched = False
|
|
162
163
|
is_output_batched = False
|
|
163
164
|
prefetch: int = 0
|
|
164
165
|
|
|
@@ -395,6 +396,7 @@ class Mapper(UDFBase):
|
|
|
395
396
|
class BatchMapper(UDFBase):
|
|
396
397
|
"""Inherit from this class to pass to `DataChain.batch_map()`."""
|
|
397
398
|
|
|
399
|
+
is_input_batched = True
|
|
398
400
|
is_output_batched = True
|
|
399
401
|
|
|
400
402
|
def run(
|
|
@@ -481,6 +483,7 @@ class Generator(UDFBase):
|
|
|
481
483
|
class Aggregator(UDFBase):
|
|
482
484
|
"""Inherit from this class to pass to `DataChain.agg()`."""
|
|
483
485
|
|
|
486
|
+
is_input_batched = True
|
|
484
487
|
is_output_batched = True
|
|
485
488
|
|
|
486
489
|
def run(
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import inspect
|
|
2
2
|
from collections.abc import Generator, Iterator, Sequence
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
-
from typing import Callable, Union, get_args, get_origin
|
|
4
|
+
from typing import Any, Callable, Union, get_args, get_origin
|
|
5
5
|
|
|
6
6
|
from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
|
|
7
7
|
from datachain.lib.signal_schema import SignalSchema
|
|
@@ -18,7 +18,7 @@ class UdfSignatureError(DataChainParamsError):
|
|
|
18
18
|
@dataclass
|
|
19
19
|
class UdfSignature:
|
|
20
20
|
func: Union[Callable, UDFBase]
|
|
21
|
-
params:
|
|
21
|
+
params: dict[str, Union[DataType, Any]]
|
|
22
22
|
output_schema: SignalSchema
|
|
23
23
|
|
|
24
24
|
DEFAULT_RETURN_TYPE = str
|
|
@@ -58,15 +58,23 @@ class UdfSignature:
|
|
|
58
58
|
if not isinstance(udf_func, UDFBase) and not callable(udf_func):
|
|
59
59
|
raise UdfSignatureError(chain, f"UDF '{udf_func}' is not callable")
|
|
60
60
|
|
|
61
|
-
func_params_map_sign, func_outs_sign, is_iterator = (
|
|
62
|
-
|
|
61
|
+
func_params_map_sign, func_outs_sign, is_iterator = cls._func_signature(
|
|
62
|
+
chain, udf_func
|
|
63
63
|
)
|
|
64
|
+
|
|
65
|
+
udf_params: dict[str, Union[DataType, Any]] = {}
|
|
64
66
|
if params:
|
|
65
|
-
udf_params =
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
udf_params =
|
|
67
|
+
udf_params = (
|
|
68
|
+
{params: Any} if isinstance(params, str) else dict.fromkeys(params, Any)
|
|
69
|
+
)
|
|
70
|
+
elif func_params_map_sign:
|
|
71
|
+
udf_params = {
|
|
72
|
+
param: (
|
|
73
|
+
param_type if param_type is not inspect.Parameter.empty else Any
|
|
74
|
+
)
|
|
75
|
+
for param, param_type in func_params_map_sign.items()
|
|
76
|
+
}
|
|
77
|
+
|
|
70
78
|
if output:
|
|
71
79
|
udf_output_map = UdfSignature._validate_output(
|
|
72
80
|
chain, signal_name, func, func_outs_sign, output
|
|
@@ -1646,6 +1646,8 @@ class DatasetQuery:
|
|
|
1646
1646
|
name: Optional[str] = None,
|
|
1647
1647
|
version: Optional[int] = None,
|
|
1648
1648
|
feature_schema: Optional[dict] = None,
|
|
1649
|
+
description: Optional[str] = None,
|
|
1650
|
+
labels: Optional[list[str]] = None,
|
|
1649
1651
|
**kwargs,
|
|
1650
1652
|
) -> "Self":
|
|
1651
1653
|
"""Save the query as a dataset."""
|
|
@@ -1678,6 +1680,8 @@ class DatasetQuery:
|
|
|
1678
1680
|
version=version,
|
|
1679
1681
|
feature_schema=feature_schema,
|
|
1680
1682
|
columns=columns,
|
|
1683
|
+
description=description,
|
|
1684
|
+
labels=labels,
|
|
1681
1685
|
**kwargs,
|
|
1682
1686
|
)
|
|
1683
1687
|
version = version or dataset.latest_version
|
|
@@ -290,9 +290,9 @@ def adapt_datetime(val: datetime) -> str:
|
|
|
290
290
|
val = val.astimezone(timezone.utc)
|
|
291
291
|
except (OverflowError, ValueError, OSError):
|
|
292
292
|
if val.year == MAXYEAR:
|
|
293
|
-
val = datetime.max
|
|
293
|
+
val = datetime.max.replace(tzinfo=timezone.utc)
|
|
294
294
|
elif val.year == MINYEAR:
|
|
295
|
-
val = datetime.min
|
|
295
|
+
val = datetime.min.replace(tzinfo=timezone.utc)
|
|
296
296
|
else:
|
|
297
297
|
raise
|
|
298
298
|
return val.replace(tzinfo=None).isoformat(" ")
|
|
@@ -447,6 +447,21 @@ def test_show(capsys, test_session):
|
|
|
447
447
|
assert f"{i} {first_name[i]}" in normalized_output
|
|
448
448
|
|
|
449
449
|
|
|
450
|
+
def test_save(test_session):
|
|
451
|
+
chain = DataChain.from_values(key=["a", "b", "c"])
|
|
452
|
+
chain.save(
|
|
453
|
+
name="new_name",
|
|
454
|
+
version=1,
|
|
455
|
+
description="new description",
|
|
456
|
+
labels=["new_label", "old_label"],
|
|
457
|
+
)
|
|
458
|
+
|
|
459
|
+
ds = test_session.catalog.get_dataset("new_name")
|
|
460
|
+
assert ds.name == "new_name"
|
|
461
|
+
assert ds.description == "new description"
|
|
462
|
+
assert ds.labels == ["new_label", "old_label"]
|
|
463
|
+
|
|
464
|
+
|
|
450
465
|
def test_show_nested_empty(capsys, test_session):
|
|
451
466
|
files = [File(size=s, path=p) for p, s in zip(list("abcde"), range(5))]
|
|
452
467
|
DataChain.from_values(file=files, session=test_session).limit(0).show()
|
|
@@ -707,7 +722,7 @@ def test_udf_parallel_boostrap(test_session_tmpfile):
|
|
|
707
722
|
self.value = MyMapper.DEFAULT_VALUE
|
|
708
723
|
self._had_teardown = False
|
|
709
724
|
|
|
710
|
-
def process(self,
|
|
725
|
+
def process(self, key) -> int:
|
|
711
726
|
return self.value
|
|
712
727
|
|
|
713
728
|
def setup(self):
|
|
@@ -79,6 +79,55 @@ def sort_files(files):
|
|
|
79
79
|
return sorted(files, key=lambda f: (f.path, f.size))
|
|
80
80
|
|
|
81
81
|
|
|
82
|
+
def test_repr(test_session):
|
|
83
|
+
dc = DataChain.from_values(
|
|
84
|
+
sign1=features_nested, col1=["a", "b", "c"], session=test_session
|
|
85
|
+
)
|
|
86
|
+
assert (
|
|
87
|
+
repr(dc)
|
|
88
|
+
== """\
|
|
89
|
+
sign1: MyNested
|
|
90
|
+
label: str
|
|
91
|
+
fr: MyFr
|
|
92
|
+
nnn: str
|
|
93
|
+
count: int
|
|
94
|
+
col1: str
|
|
95
|
+
"""
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
# datachain without any columns
|
|
99
|
+
assert repr(dc.select_except("col1", "sign1")) == "Empty DataChain"
|
|
100
|
+
|
|
101
|
+
dc = dc.map(col2=lambda col1: col1 * 2)
|
|
102
|
+
assert (
|
|
103
|
+
repr(dc)
|
|
104
|
+
== """\
|
|
105
|
+
sign1: MyNested
|
|
106
|
+
label: str
|
|
107
|
+
fr: MyFr
|
|
108
|
+
nnn: str
|
|
109
|
+
count: int
|
|
110
|
+
col1: str
|
|
111
|
+
col2: str
|
|
112
|
+
"""
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
dc = dc.mutate(countplusone=dc.column("sign1.fr.count") + 1)
|
|
116
|
+
assert (
|
|
117
|
+
repr(dc)
|
|
118
|
+
== """\
|
|
119
|
+
sign1: MyNested
|
|
120
|
+
label: str
|
|
121
|
+
fr: MyFr
|
|
122
|
+
nnn: str
|
|
123
|
+
count: int
|
|
124
|
+
col1: str
|
|
125
|
+
col2: str
|
|
126
|
+
countplusone: int
|
|
127
|
+
"""
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
|
|
82
131
|
def test_pandas_conversion(test_session):
|
|
83
132
|
df = pd.DataFrame(DF_DATA)
|
|
84
133
|
df1 = DataChain.from_pandas(df, session=test_session)
|
|
@@ -15,7 +15,7 @@ def test_udf():
|
|
|
15
15
|
self.value = MyMapper.DEFAULT_VALUE
|
|
16
16
|
self._had_teardown = False
|
|
17
17
|
|
|
18
|
-
def process(self,
|
|
18
|
+
def process(self, key) -> int:
|
|
19
19
|
return self.value
|
|
20
20
|
|
|
21
21
|
def setup(self):
|
|
@@ -40,7 +40,7 @@ def test_no_bootstrap_for_callable():
|
|
|
40
40
|
self._had_bootstrap = False
|
|
41
41
|
self._had_teardown = False
|
|
42
42
|
|
|
43
|
-
def __call__(self,
|
|
43
|
+
def __call__(self, key):
|
|
44
44
|
return None
|
|
45
45
|
|
|
46
46
|
def bootstrap(self):
|