datachain 0.15.0__tar.gz → 0.16.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.15.0/src/datachain.egg-info → datachain-0.16.1}/PKG-INFO +1 -1
- {datachain-0.15.0 → datachain-0.16.1}/docs/examples.md +5 -5
- {datachain-0.15.0 → datachain-0.16.1}/docs/quick-start.md +3 -3
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/catalog/catalog.py +9 -9
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/__init__.py +1 -1
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/datasets.py +3 -3
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/show.py +2 -2
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/parser/__init__.py +2 -2
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/metastore.py +5 -5
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/dataset.py +8 -8
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/aggregate.py +3 -3
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/values_to_tuples.py +6 -8
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dataset_info.py +18 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/datachain.py +20 -13
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/datasets.py +9 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/records.py +16 -10
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/utils.py +2 -2
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/signal_schema.py +1 -10
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/udf.py +2 -1
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/dataset.py +15 -8
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/schema.py +1 -4
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/remote/studio.py +2 -2
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/studio.py +2 -2
- {datachain-0.15.0 → datachain-0.16.1/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.15.0 → datachain-0.16.1}/tests/conftest.py +7 -7
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_datachain.py +4 -4
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_datasets.py +7 -7
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_pull.py +1 -1
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_read_database.py +31 -17
- {datachain-0.15.0 → datachain-0.16.1}/tests/test_cli_studio.py +4 -4
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_datachain.py +35 -0
- {datachain-0.15.0 → datachain-0.16.1}/.cruft.json +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.gitattributes +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.github/codecov.yaml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.github/dependabot.yml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.github/workflows/release.yml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.github/workflows/tests.yml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.gitignore +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/.pre-commit-config.yaml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/LICENSE +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/README.rst +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/contributing.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/index.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/overrides/main.html +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/file.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/index.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/pose.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/segment.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/datachain.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/func.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/index.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/remotes.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/toolkit.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/torch.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/references/udf.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/docs/tutorials.md +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/mkdocs.yml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/noxfile.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/pyproject.toml +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/setup.cfg +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/__main__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/asyn.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cache.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/local.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/config.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/error.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/fs/reference.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/fs/utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/array.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/base.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/conditional.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/func.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/numeric.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/path.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/random.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/string.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/window.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/job.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/file.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/listing.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/video.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/listing.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/bbox.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/pose.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/segment.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/node.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/progress.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/py.typed +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/batch.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/params.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/session.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/udf.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/script_meta.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain/utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/data.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/examples/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/data/lena.jpg +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_client.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_data_storage.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_file.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_hf.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_image.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_listing.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_ls.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_query.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_session.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_toolkit.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_video.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_warehouse.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/test_atomicity.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/test_import_time.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/test_telemetry.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/model/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_client.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_config.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_func.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_query.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_session.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.15.0 → datachain-0.16.1}/tests/utils.py +0 -0
|
@@ -94,7 +94,7 @@ dc.DataModel.register(MistralModel)
|
|
|
94
94
|
chain = (
|
|
95
95
|
dc
|
|
96
96
|
.read_storage("gs://datachain-demo/chatbot-KiT/", type="text")
|
|
97
|
-
.filter(dc.Column("file.
|
|
97
|
+
.filter(dc.Column("file.path").glob("*.txt"))
|
|
98
98
|
.limit(5)
|
|
99
99
|
.settings(parallel=4, cache=True)
|
|
100
100
|
.map(
|
|
@@ -228,7 +228,7 @@ Here is an example from MS COCO “captions” JSON which employs separate secti
|
|
|
228
228
|
|
|
229
229
|
Note how complicated the setup is. Every image is referenced by name, and the metadata for this file is keyed by the “id” field. This same field is referenced later in the “annotations” array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
|
|
230
230
|
|
|
231
|
-
However,
|
|
231
|
+
However, DataChain can easily parse the entire COCO structure via several reading and merging operators:
|
|
232
232
|
|
|
233
233
|
```python
|
|
234
234
|
import datachain as dc
|
|
@@ -240,7 +240,7 @@ images = dc.read_storage(images_uri)
|
|
|
240
240
|
meta = dc.read_json(captions_uri, jmespath="images")
|
|
241
241
|
captions = dc.read_json(captions_uri, jmespath="annotations")
|
|
242
242
|
|
|
243
|
-
images_meta = images.merge(meta, on="file.
|
|
243
|
+
images_meta = images.merge(meta, on="file.path", right_on="images.file_name")
|
|
244
244
|
captioned_images = images_meta.merge(captions, on="images.id", right_on="annotations.image_id")
|
|
245
245
|
```
|
|
246
246
|
|
|
@@ -248,12 +248,12 @@ The resulting dataset has image entries as files decorated with all the metadata
|
|
|
248
248
|
|
|
249
249
|
```python
|
|
250
250
|
images_with_dogs = captioned_images.filter(dc.Column("annotations.caption").glob("*dog*"))
|
|
251
|
-
images_with_dogs.select("annotations", "file.
|
|
251
|
+
images_with_dogs.select("annotations", "file.path").show()
|
|
252
252
|
```
|
|
253
253
|
|
|
254
254
|
```
|
|
255
255
|
captions captions captions file
|
|
256
|
-
image_id id caption
|
|
256
|
+
image_id id caption path
|
|
257
257
|
0 17029 778902 a dog jumping to catch a frisbee in a yard 000000017029.jpg
|
|
258
258
|
1 17029 779838 A dog jumping to catch a red frisbee in a garden 000000017029.jpg
|
|
259
259
|
2 17029 781941 The dog is catching the Frisbee in mid air in ... 000000017029.jpg
|
|
@@ -184,7 +184,7 @@ chain = (
|
|
|
184
184
|
.save("response")
|
|
185
185
|
)
|
|
186
186
|
|
|
187
|
-
chain.select("file.
|
|
187
|
+
chain.select("file.path", "status", "response.usage").show(5)
|
|
188
188
|
|
|
189
189
|
success_rate = chain.filter(dc.Column("status") == "success").count() / chain.count()
|
|
190
190
|
print(f"{100*success_rate:.1f}% dialogs were successful")
|
|
@@ -194,7 +194,7 @@ Output:
|
|
|
194
194
|
|
|
195
195
|
``` shell
|
|
196
196
|
file status response response response
|
|
197
|
-
|
|
197
|
+
path usage usage usage
|
|
198
198
|
prompt_tokens total_tokens completion_tokens
|
|
199
199
|
0 1.txt success 547 548 1
|
|
200
200
|
1 10.txt failure 3576 3578 2
|
|
@@ -277,7 +277,7 @@ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
|
277
277
|
|
|
278
278
|
chain = (
|
|
279
279
|
dc.read_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
|
|
280
|
-
.map(label=lambda name: name.split(".")[0], params=["file.
|
|
280
|
+
.map(label=lambda name: name.split(".")[0], params=["file.path"])
|
|
281
281
|
.select("file", "label").to_pytorch(
|
|
282
282
|
transform=processor.image_processor,
|
|
283
283
|
tokenizer=processor.tokenizer,
|
|
@@ -776,7 +776,7 @@ class Catalog:
|
|
|
776
776
|
listing: Optional[bool] = False,
|
|
777
777
|
uuid: Optional[str] = None,
|
|
778
778
|
description: Optional[str] = None,
|
|
779
|
-
|
|
779
|
+
attrs: Optional[list[str]] = None,
|
|
780
780
|
) -> "DatasetRecord":
|
|
781
781
|
"""
|
|
782
782
|
Creates new dataset of a specific version.
|
|
@@ -794,16 +794,16 @@ class Catalog:
|
|
|
794
794
|
dataset = self.get_dataset(name)
|
|
795
795
|
default_version = dataset.next_version
|
|
796
796
|
|
|
797
|
-
if (description or
|
|
798
|
-
dataset.description != description or dataset.
|
|
797
|
+
if (description or attrs) and (
|
|
798
|
+
dataset.description != description or dataset.attrs != attrs
|
|
799
799
|
):
|
|
800
800
|
description = description or dataset.description
|
|
801
|
-
|
|
801
|
+
attrs = attrs or dataset.attrs
|
|
802
802
|
|
|
803
803
|
self.update_dataset(
|
|
804
804
|
dataset,
|
|
805
805
|
description=description,
|
|
806
|
-
|
|
806
|
+
attrs=attrs,
|
|
807
807
|
)
|
|
808
808
|
|
|
809
809
|
except DatasetNotFoundError:
|
|
@@ -817,7 +817,7 @@ class Catalog:
|
|
|
817
817
|
schema=schema,
|
|
818
818
|
ignore_if_exists=True,
|
|
819
819
|
description=description,
|
|
820
|
-
|
|
820
|
+
attrs=attrs,
|
|
821
821
|
)
|
|
822
822
|
|
|
823
823
|
version = version or default_version
|
|
@@ -1334,15 +1334,15 @@ class Catalog:
|
|
|
1334
1334
|
name: str,
|
|
1335
1335
|
new_name: Optional[str] = None,
|
|
1336
1336
|
description: Optional[str] = None,
|
|
1337
|
-
|
|
1337
|
+
attrs: Optional[list[str]] = None,
|
|
1338
1338
|
) -> DatasetRecord:
|
|
1339
1339
|
update_data = {}
|
|
1340
1340
|
if new_name:
|
|
1341
1341
|
update_data["name"] = new_name
|
|
1342
1342
|
if description is not None:
|
|
1343
1343
|
update_data["description"] = description
|
|
1344
|
-
if
|
|
1345
|
-
update_data["
|
|
1344
|
+
if attrs is not None:
|
|
1345
|
+
update_data["attrs"] = attrs # type: ignore[assignment]
|
|
1346
1346
|
|
|
1347
1347
|
dataset = self.get_dataset(name)
|
|
1348
1348
|
return self.update_dataset(dataset, **update_data)
|
|
@@ -154,7 +154,7 @@ def edit_dataset(
|
|
|
154
154
|
name: str,
|
|
155
155
|
new_name: Optional[str] = None,
|
|
156
156
|
description: Optional[str] = None,
|
|
157
|
-
|
|
157
|
+
attrs: Optional[list[str]] = None,
|
|
158
158
|
studio: bool = False,
|
|
159
159
|
local: bool = False,
|
|
160
160
|
all: bool = True,
|
|
@@ -167,9 +167,9 @@ def edit_dataset(
|
|
|
167
167
|
|
|
168
168
|
if all or local:
|
|
169
169
|
try:
|
|
170
|
-
catalog.edit_dataset(name, new_name, description,
|
|
170
|
+
catalog.edit_dataset(name, new_name, description, attrs)
|
|
171
171
|
except DatasetNotFoundError:
|
|
172
172
|
print("Dataset not found in local", file=sys.stderr)
|
|
173
173
|
|
|
174
174
|
if (all or studio) and token:
|
|
175
|
-
edit_studio_dataset(team, name, new_name, description,
|
|
175
|
+
edit_studio_dataset(team, name, new_name, description, attrs)
|
|
@@ -42,8 +42,8 @@ def show(
|
|
|
42
42
|
print("Name: ", name)
|
|
43
43
|
if dataset.description:
|
|
44
44
|
print("Description: ", dataset.description)
|
|
45
|
-
if dataset.
|
|
46
|
-
print("
|
|
45
|
+
if dataset.attrs:
|
|
46
|
+
print("Attributes: ", ",".join(dataset.attrs))
|
|
47
47
|
print("\n")
|
|
48
48
|
|
|
49
49
|
show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
|
|
@@ -217,9 +217,9 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
|
217
217
|
help="Dataset description",
|
|
218
218
|
)
|
|
219
219
|
parse_edit_dataset.add_argument(
|
|
220
|
-
"--
|
|
220
|
+
"--attrs",
|
|
221
221
|
nargs="+",
|
|
222
|
-
help="Dataset
|
|
222
|
+
help="Dataset attributes",
|
|
223
223
|
)
|
|
224
224
|
parse_edit_dataset.add_argument(
|
|
225
225
|
"--studio",
|
|
@@ -120,7 +120,7 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
120
120
|
schema: Optional[dict[str, Any]] = None,
|
|
121
121
|
ignore_if_exists: bool = False,
|
|
122
122
|
description: Optional[str] = None,
|
|
123
|
-
|
|
123
|
+
attrs: Optional[list[str]] = None,
|
|
124
124
|
) -> DatasetRecord:
|
|
125
125
|
"""Creates new dataset."""
|
|
126
126
|
|
|
@@ -326,7 +326,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
326
326
|
Column("id", Integer, primary_key=True),
|
|
327
327
|
Column("name", Text, nullable=False),
|
|
328
328
|
Column("description", Text),
|
|
329
|
-
Column("
|
|
329
|
+
Column("attrs", JSON, nullable=True),
|
|
330
330
|
Column("status", Integer, nullable=False),
|
|
331
331
|
Column("feature_schema", JSON, nullable=True),
|
|
332
332
|
Column("created_at", DateTime(timezone=True)),
|
|
@@ -521,7 +521,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
521
521
|
schema: Optional[dict[str, Any]] = None,
|
|
522
522
|
ignore_if_exists: bool = False,
|
|
523
523
|
description: Optional[str] = None,
|
|
524
|
-
|
|
524
|
+
attrs: Optional[list[str]] = None,
|
|
525
525
|
**kwargs, # TODO registered = True / False
|
|
526
526
|
) -> DatasetRecord:
|
|
527
527
|
"""Creates new dataset."""
|
|
@@ -538,7 +538,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
538
538
|
query_script=query_script,
|
|
539
539
|
schema=json.dumps(schema or {}),
|
|
540
540
|
description=description,
|
|
541
|
-
|
|
541
|
+
attrs=json.dumps(attrs or []),
|
|
542
542
|
)
|
|
543
543
|
if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
|
|
544
544
|
# SQLite and PostgreSQL both support 'on_conflict_do_nothing',
|
|
@@ -621,7 +621,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
621
621
|
dataset_values = {}
|
|
622
622
|
for field, value in kwargs.items():
|
|
623
623
|
if field in self._dataset_fields[1:]:
|
|
624
|
-
if field in ["
|
|
624
|
+
if field in ["attrs", "schema"]:
|
|
625
625
|
values[field] = json.dumps(value) if value else None
|
|
626
626
|
else:
|
|
627
627
|
values[field] = value
|
|
@@ -329,7 +329,7 @@ class DatasetRecord:
|
|
|
329
329
|
id: int
|
|
330
330
|
name: str
|
|
331
331
|
description: Optional[str]
|
|
332
|
-
|
|
332
|
+
attrs: list[str]
|
|
333
333
|
schema: dict[str, Union[SQLType, type[SQLType]]]
|
|
334
334
|
feature_schema: dict
|
|
335
335
|
versions: list[DatasetVersion]
|
|
@@ -357,7 +357,7 @@ class DatasetRecord:
|
|
|
357
357
|
id: int,
|
|
358
358
|
name: str,
|
|
359
359
|
description: Optional[str],
|
|
360
|
-
|
|
360
|
+
attrs: str,
|
|
361
361
|
status: int,
|
|
362
362
|
feature_schema: Optional[str],
|
|
363
363
|
created_at: datetime,
|
|
@@ -387,7 +387,7 @@ class DatasetRecord:
|
|
|
387
387
|
version_schema: str,
|
|
388
388
|
version_job_id: Optional[str] = None,
|
|
389
389
|
) -> "DatasetRecord":
|
|
390
|
-
|
|
390
|
+
attrs_lst: list[str] = json.loads(attrs) if attrs else []
|
|
391
391
|
schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
|
|
392
392
|
version_schema_dct: dict[str, str] = (
|
|
393
393
|
json.loads(version_schema) if version_schema else {}
|
|
@@ -418,7 +418,7 @@ class DatasetRecord:
|
|
|
418
418
|
id,
|
|
419
419
|
name,
|
|
420
420
|
description,
|
|
421
|
-
|
|
421
|
+
attrs_lst,
|
|
422
422
|
cls.parse_schema(schema_dct), # type: ignore[arg-type]
|
|
423
423
|
json.loads(feature_schema) if feature_schema else {},
|
|
424
424
|
[dataset_version],
|
|
@@ -562,7 +562,7 @@ class DatasetListRecord:
|
|
|
562
562
|
id: int
|
|
563
563
|
name: str
|
|
564
564
|
description: Optional[str]
|
|
565
|
-
|
|
565
|
+
attrs: list[str]
|
|
566
566
|
versions: list[DatasetListVersion]
|
|
567
567
|
created_at: Optional[datetime] = None
|
|
568
568
|
|
|
@@ -572,7 +572,7 @@ class DatasetListRecord:
|
|
|
572
572
|
id: int,
|
|
573
573
|
name: str,
|
|
574
574
|
description: Optional[str],
|
|
575
|
-
|
|
575
|
+
attrs: str,
|
|
576
576
|
created_at: datetime,
|
|
577
577
|
version_id: int,
|
|
578
578
|
version_uuid: str,
|
|
@@ -588,7 +588,7 @@ class DatasetListRecord:
|
|
|
588
588
|
version_query_script: Optional[str],
|
|
589
589
|
version_job_id: Optional[str] = None,
|
|
590
590
|
) -> "DatasetListRecord":
|
|
591
|
-
|
|
591
|
+
attrs_lst: list[str] = json.loads(attrs) if attrs else []
|
|
592
592
|
|
|
593
593
|
dataset_version = DatasetListVersion.parse(
|
|
594
594
|
version_id,
|
|
@@ -610,7 +610,7 @@ class DatasetListRecord:
|
|
|
610
610
|
id,
|
|
611
611
|
name,
|
|
612
612
|
description,
|
|
613
|
-
|
|
613
|
+
attrs_lst,
|
|
614
614
|
[dataset_version],
|
|
615
615
|
created_at,
|
|
616
616
|
)
|
|
@@ -165,7 +165,7 @@ def any_value(col: str) -> Func:
|
|
|
165
165
|
Example:
|
|
166
166
|
```py
|
|
167
167
|
dc.group_by(
|
|
168
|
-
file_example=func.any_value("file.
|
|
168
|
+
file_example=func.any_value("file.path"),
|
|
169
169
|
partition_by="signal.category",
|
|
170
170
|
)
|
|
171
171
|
```
|
|
@@ -227,7 +227,7 @@ def concat(col: str, separator="") -> Func:
|
|
|
227
227
|
Example:
|
|
228
228
|
```py
|
|
229
229
|
dc.group_by(
|
|
230
|
-
files=func.concat("file.
|
|
230
|
+
files=func.concat("file.path", separator=", "),
|
|
231
231
|
partition_by="signal.category",
|
|
232
232
|
)
|
|
233
233
|
```
|
|
@@ -343,7 +343,7 @@ def first(col: str) -> Func:
|
|
|
343
343
|
```py
|
|
344
344
|
window = func.window(partition_by="signal.category", order_by="created_at")
|
|
345
345
|
dc.mutate(
|
|
346
|
-
first_file=func.first("file.
|
|
346
|
+
first_file=func.first("file.path").over(window),
|
|
347
347
|
)
|
|
348
348
|
```
|
|
349
349
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import itertools
|
|
2
2
|
from collections.abc import Sequence
|
|
3
|
-
from typing import Any,
|
|
3
|
+
from typing import Any, Union
|
|
4
4
|
|
|
5
5
|
from datachain.lib.data_model import (
|
|
6
6
|
DataType,
|
|
@@ -71,14 +71,13 @@ def values_to_tuples( # noqa: C901, PLR0912
|
|
|
71
71
|
# If a non-None value appears early, it won't check the remaining items for
|
|
72
72
|
# `None` values.
|
|
73
73
|
try:
|
|
74
|
-
|
|
75
|
-
itertools.dropwhile(lambda
|
|
74
|
+
first_not_none_element = next(
|
|
75
|
+
itertools.dropwhile(lambda i: i is None, v)
|
|
76
76
|
)
|
|
77
77
|
except StopIteration:
|
|
78
|
-
|
|
79
|
-
|
|
78
|
+
# set default type to `str` if column is empty or all values are `None`
|
|
79
|
+
typ = str
|
|
80
80
|
else:
|
|
81
|
-
nullable = pos > 0
|
|
82
81
|
typ = type(first_not_none_element) # type: ignore[assignment]
|
|
83
82
|
if not is_chain_type(typ):
|
|
84
83
|
raise ValuesToTupleError(
|
|
@@ -88,8 +87,7 @@ def values_to_tuples( # noqa: C901, PLR0912
|
|
|
88
87
|
)
|
|
89
88
|
if isinstance(first_not_none_element, list):
|
|
90
89
|
typ = list[type(first_not_none_element[0])] # type: ignore[assignment, misc]
|
|
91
|
-
|
|
92
|
-
types_map[k] = Optional[typ] if nullable else typ # type: ignore[assignment]
|
|
90
|
+
types_map[k] = typ
|
|
93
91
|
|
|
94
92
|
if length < 0:
|
|
95
93
|
length = len_
|
|
@@ -32,11 +32,28 @@ class DatasetInfo(DataModel):
|
|
|
32
32
|
metrics: dict[str, Any] = Field(default={})
|
|
33
33
|
error_message: str = Field(default="")
|
|
34
34
|
error_stack: str = Field(default="")
|
|
35
|
+
attrs: list[str] = Field(default=[])
|
|
35
36
|
|
|
36
37
|
@property
|
|
37
38
|
def is_temp(self) -> bool:
|
|
38
39
|
return Session.is_temp_dataset(self.name)
|
|
39
40
|
|
|
41
|
+
def has_attr(self, attr: str) -> bool:
|
|
42
|
+
s = attr.split("=")
|
|
43
|
+
if len(s) == 1:
|
|
44
|
+
return attr in self.attrs
|
|
45
|
+
|
|
46
|
+
name = s[0]
|
|
47
|
+
value = s[1]
|
|
48
|
+
for a in self.attrs:
|
|
49
|
+
s = a.split("=")
|
|
50
|
+
if value == "*" and s[0] == name:
|
|
51
|
+
return True
|
|
52
|
+
if len(s) == 2 and s[0] == name and s[1] == value:
|
|
53
|
+
return True
|
|
54
|
+
|
|
55
|
+
return False
|
|
56
|
+
|
|
40
57
|
@staticmethod
|
|
41
58
|
def _validate_dict(
|
|
42
59
|
v: Optional[Union[str, dict]],
|
|
@@ -83,4 +100,5 @@ class DatasetInfo(DataModel):
|
|
|
83
100
|
metrics=job.metrics if job else {},
|
|
84
101
|
error_message=version.error_message,
|
|
85
102
|
error_stack=version.error_stack,
|
|
103
|
+
attrs=dataset.attrs,
|
|
86
104
|
)
|
|
@@ -459,7 +459,7 @@ class DataChain:
|
|
|
459
459
|
name: str,
|
|
460
460
|
version: Optional[int] = None,
|
|
461
461
|
description: Optional[str] = None,
|
|
462
|
-
|
|
462
|
+
attrs: Optional[list[str]] = None,
|
|
463
463
|
**kwargs,
|
|
464
464
|
) -> "Self":
|
|
465
465
|
"""Save to a Dataset. It returns the chain itself.
|
|
@@ -468,7 +468,8 @@ class DataChain:
|
|
|
468
468
|
name : dataset name.
|
|
469
469
|
version : version of a dataset. Default - the last version that exist.
|
|
470
470
|
description : description of a dataset.
|
|
471
|
-
|
|
471
|
+
attrs : attributes of a dataset. They can be without value, e.g "NLP",
|
|
472
|
+
or with a value, e.g "location=US".
|
|
472
473
|
"""
|
|
473
474
|
schema = self.signals_schema.clone_without_sys_signals().serialize()
|
|
474
475
|
return self._evolve(
|
|
@@ -476,7 +477,7 @@ class DataChain:
|
|
|
476
477
|
name=name,
|
|
477
478
|
version=version,
|
|
478
479
|
description=description,
|
|
479
|
-
|
|
480
|
+
attrs=attrs,
|
|
480
481
|
feature_schema=schema,
|
|
481
482
|
**kwargs,
|
|
482
483
|
)
|
|
@@ -755,7 +756,7 @@ class DataChain:
|
|
|
755
756
|
|
|
756
757
|
Example:
|
|
757
758
|
```py
|
|
758
|
-
dc.distinct("file.
|
|
759
|
+
dc.distinct("file.path")
|
|
759
760
|
```
|
|
760
761
|
"""
|
|
761
762
|
return self._evolve(
|
|
@@ -881,7 +882,7 @@ class DataChain:
|
|
|
881
882
|
```py
|
|
882
883
|
dc.mutate(
|
|
883
884
|
area=Column("image.height") * Column("image.width"),
|
|
884
|
-
extension=file_ext(Column("file.
|
|
885
|
+
extension=file_ext(Column("file.path")),
|
|
885
886
|
dist=cosine_distance(embedding_text, embedding_image)
|
|
886
887
|
)
|
|
887
888
|
```
|
|
@@ -1070,13 +1071,13 @@ class DataChain:
|
|
|
1070
1071
|
|
|
1071
1072
|
Iterating over all rows with selected columns:
|
|
1072
1073
|
```py
|
|
1073
|
-
for name, size in dc.collect("file.
|
|
1074
|
+
for name, size in dc.collect("file.path", "file.size"):
|
|
1074
1075
|
print(name, size)
|
|
1075
1076
|
```
|
|
1076
1077
|
|
|
1077
1078
|
Iterating over a single column:
|
|
1078
1079
|
```py
|
|
1079
|
-
for file in dc.collect("file.
|
|
1080
|
+
for file in dc.collect("file.path"):
|
|
1080
1081
|
print(file)
|
|
1081
1082
|
```
|
|
1082
1083
|
"""
|
|
@@ -1629,7 +1630,7 @@ class DataChain:
|
|
|
1629
1630
|
import datachain as dc
|
|
1630
1631
|
|
|
1631
1632
|
chain = dc.read_storage("s3://mybucket")
|
|
1632
|
-
chain = chain.filter(dc.C("file.
|
|
1633
|
+
chain = chain.filter(dc.C("file.path").glob("*.jsonl"))
|
|
1633
1634
|
chain = chain.parse_tabular(format="json")
|
|
1634
1635
|
```
|
|
1635
1636
|
"""
|
|
@@ -2088,25 +2089,31 @@ class DataChain:
|
|
|
2088
2089
|
|
|
2089
2090
|
Using glob to match patterns
|
|
2090
2091
|
```py
|
|
2091
|
-
dc.filter(C("file.
|
|
2092
|
+
dc.filter(C("file.path").glob("*.jpg"))
|
|
2093
|
+
```
|
|
2094
|
+
|
|
2095
|
+
Using in to match lists
|
|
2096
|
+
```py
|
|
2097
|
+
ids = [1,2,3]
|
|
2098
|
+
dc.filter(C("experiment_id").in_(ids))
|
|
2092
2099
|
```
|
|
2093
2100
|
|
|
2094
2101
|
Using `datachain.func`
|
|
2095
2102
|
```py
|
|
2096
2103
|
from datachain.func import string
|
|
2097
|
-
dc.filter(string.length(C("file.
|
|
2104
|
+
dc.filter(string.length(C("file.path")) > 5)
|
|
2098
2105
|
```
|
|
2099
2106
|
|
|
2100
2107
|
Combining filters with "or"
|
|
2101
2108
|
```py
|
|
2102
|
-
dc.filter(C("file.
|
|
2109
|
+
dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*))
|
|
2103
2110
|
```
|
|
2104
2111
|
|
|
2105
2112
|
Combining filters with "and"
|
|
2106
2113
|
```py
|
|
2107
2114
|
dc.filter(
|
|
2108
|
-
C("file.
|
|
2109
|
-
(string.length(C("file.
|
|
2115
|
+
C("file.path").glob("*.jpg) &
|
|
2116
|
+
(string.length(C("file.path")) > 5)
|
|
2110
2117
|
)
|
|
2111
2118
|
```
|
|
2112
2119
|
"""
|
|
@@ -102,6 +102,7 @@ def datasets(
|
|
|
102
102
|
column: Optional[str] = None,
|
|
103
103
|
include_listing: bool = False,
|
|
104
104
|
studio: bool = False,
|
|
105
|
+
attrs: Optional[list[str]] = None,
|
|
105
106
|
) -> "DataChain":
|
|
106
107
|
"""Generate chain with list of registered datasets.
|
|
107
108
|
|
|
@@ -114,6 +115,10 @@ def datasets(
|
|
|
114
115
|
include_listing: If True, includes listing datasets. Defaults to False.
|
|
115
116
|
studio: If True, returns datasets from Studio only,
|
|
116
117
|
otherwise returns all local datasets. Defaults to False.
|
|
118
|
+
attrs: Optional list of attributes to filter datasets on. It can be just
|
|
119
|
+
attribute without value e.g "NLP", or attribute with value
|
|
120
|
+
e.g "location=US". Attribute with value can also accept "*" to target
|
|
121
|
+
all that have specific name e.g "location=*"
|
|
117
122
|
|
|
118
123
|
Returns:
|
|
119
124
|
DataChain: A new DataChain instance containing dataset information.
|
|
@@ -139,6 +144,10 @@ def datasets(
|
|
|
139
144
|
]
|
|
140
145
|
datasets_values = [d for d in datasets_values if not d.is_temp]
|
|
141
146
|
|
|
147
|
+
if attrs:
|
|
148
|
+
for attr in attrs:
|
|
149
|
+
datasets_values = [d for d in datasets_values if d.has_attr(attr)]
|
|
150
|
+
|
|
142
151
|
if not column:
|
|
143
152
|
# flattening dataset fields
|
|
144
153
|
schema = {
|
|
@@ -4,12 +4,9 @@ from typing import TYPE_CHECKING, Optional, Union
|
|
|
4
4
|
import sqlalchemy
|
|
5
5
|
|
|
6
6
|
from datachain.lib.data_model import DataType
|
|
7
|
-
from datachain.lib.file import
|
|
8
|
-
File,
|
|
9
|
-
)
|
|
7
|
+
from datachain.lib.file import File
|
|
10
8
|
from datachain.lib.signal_schema import SignalSchema
|
|
11
9
|
from datachain.query import Session
|
|
12
|
-
from datachain.query.schema import Column
|
|
13
10
|
|
|
14
11
|
if TYPE_CHECKING:
|
|
15
12
|
from typing_extensions import ParamSpec
|
|
@@ -41,6 +38,9 @@ def read_records(
|
|
|
41
38
|
single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
|
|
42
39
|
```
|
|
43
40
|
"""
|
|
41
|
+
from datachain.query.dataset import adjust_outputs, get_col_types
|
|
42
|
+
from datachain.sql.types import SQLType
|
|
43
|
+
|
|
44
44
|
from .datasets import read_dataset
|
|
45
45
|
|
|
46
46
|
session = Session.get(session, in_memory=in_memory)
|
|
@@ -52,11 +52,10 @@ def read_records(
|
|
|
52
52
|
|
|
53
53
|
if schema:
|
|
54
54
|
signal_schema = SignalSchema(schema)
|
|
55
|
-
columns = [
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
columns.append(sqlalchemy.Column(c.name, c.type, **kw))
|
|
55
|
+
columns = [
|
|
56
|
+
sqlalchemy.Column(c.name, c.type) # type: ignore[union-attr]
|
|
57
|
+
for c in signal_schema.db_signals(as_columns=True)
|
|
58
|
+
]
|
|
60
59
|
else:
|
|
61
60
|
columns = [
|
|
62
61
|
sqlalchemy.Column(name, typ)
|
|
@@ -83,6 +82,13 @@ def read_records(
|
|
|
83
82
|
warehouse = catalog.warehouse
|
|
84
83
|
dr = warehouse.dataset_rows(dsr)
|
|
85
84
|
table = dr.get_table()
|
|
86
|
-
|
|
85
|
+
|
|
86
|
+
# Optimization: Compute row types once, rather than for every row.
|
|
87
|
+
col_types = get_col_types(
|
|
88
|
+
warehouse,
|
|
89
|
+
{c.name: c.type for c in columns if isinstance(c.type, SQLType)},
|
|
90
|
+
)
|
|
91
|
+
records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
|
|
92
|
+
warehouse.insert_rows(table, records)
|
|
87
93
|
warehouse.insert_rows_done(table)
|
|
88
94
|
return read_dataset(name=dsr.name, session=session, settings=settings)
|
|
@@ -31,8 +31,8 @@ def resolve_columns(
|
|
|
31
31
|
) -> "Callable[Concatenate[D, P], D]":
|
|
32
32
|
"""Decorator that resolvs input column names to their actual DB names. This is
|
|
33
33
|
specially important for nested columns as user works with them by using dot
|
|
34
|
-
notation e.g (file.
|
|
35
|
-
in DB, e.g
|
|
34
|
+
notation e.g (file.path) but are actually defined with default delimiter
|
|
35
|
+
in DB, e.g file__path.
|
|
36
36
|
If there are any sql functions in arguments, they will just be transferred as is
|
|
37
37
|
to a method.
|
|
38
38
|
"""
|
|
@@ -581,11 +581,7 @@ class SignalSchema:
|
|
|
581
581
|
signals = [
|
|
582
582
|
DEFAULT_DELIMITER.join(path)
|
|
583
583
|
if not as_columns
|
|
584
|
-
else Column(
|
|
585
|
-
DEFAULT_DELIMITER.join(path),
|
|
586
|
-
python_to_sql(_type),
|
|
587
|
-
nullable=is_optional(_type),
|
|
588
|
-
)
|
|
584
|
+
else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
|
|
589
585
|
for path, _type, has_subtree, _ in self.get_flat_tree(
|
|
590
586
|
include_hidden=include_hidden
|
|
591
587
|
)
|
|
@@ -994,8 +990,3 @@ class SignalSchema:
|
|
|
994
990
|
}
|
|
995
991
|
|
|
996
992
|
return SignalSchema.deserialize(schema)
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
def is_optional(type_: Any) -> bool:
|
|
1000
|
-
"""Check if a type is Optional."""
|
|
1001
|
-
return get_origin(type_) is Union and type(None) in get_args(type_)
|
|
@@ -474,8 +474,9 @@ class Generator(UDFBase):
|
|
|
474
474
|
remove_prefetched=bool(self.prefetch) and not cache,
|
|
475
475
|
)
|
|
476
476
|
with closing(prepared_inputs):
|
|
477
|
-
for row in
|
|
477
|
+
for row in prepared_inputs:
|
|
478
478
|
yield _process_row(row)
|
|
479
|
+
processed_cb.relative_update(1)
|
|
479
480
|
|
|
480
481
|
self.teardown()
|
|
481
482
|
|