datachain 0.16.0__tar.gz → 0.16.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.16.0/src/datachain.egg-info → datachain-0.16.1}/PKG-INFO +1 -1
- {datachain-0.16.0 → datachain-0.16.1}/docs/examples.md +5 -5
- {datachain-0.16.0 → datachain-0.16.1}/docs/quick-start.md +3 -3
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/aggregate.py +3 -3
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/values_to_tuples.py +6 -8
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/datachain.py +16 -10
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/records.py +16 -10
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/utils.py +2 -2
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/signal_schema.py +1 -10
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/dataset.py +13 -6
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/schema.py +1 -4
- {datachain-0.16.0 → datachain-0.16.1/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_read_database.py +31 -17
- {datachain-0.16.0 → datachain-0.16.1}/.cruft.json +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.gitattributes +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.github/codecov.yaml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.github/dependabot.yml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.github/workflows/release.yml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.github/workflows/tests.yml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.gitignore +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/.pre-commit-config.yaml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/LICENSE +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/README.rst +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/contributing.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/index.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/overrides/main.html +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/file.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/index.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/pose.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/segment.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/datachain.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/func.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/index.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/remotes.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/toolkit.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/torch.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/references/udf.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/docs/tutorials.md +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/mkdocs.yml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/noxfile.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/pyproject.toml +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/setup.cfg +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/__main__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/asyn.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cache.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/local.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/config.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/dataset.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/error.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/fs/reference.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/fs/utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/array.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/base.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/conditional.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/func.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/numeric.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/path.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/random.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/string.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/window.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/job.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/file.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/listing.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/udf.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/video.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/listing.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/bbox.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/pose.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/segment.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/node.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/progress.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/py.typed +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/batch.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/params.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/session.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/udf.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/script_meta.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/studio.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain/utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/conftest.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/data.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/examples/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/data/lena.jpg +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_client.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_data_storage.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_datachain.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_datasets.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_file.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_hf.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_image.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_listing.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_ls.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_pull.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_query.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_session.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_toolkit.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_video.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_warehouse.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/test_atomicity.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/test_cli_studio.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/test_import_time.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/test_telemetry.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/model/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_client.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_config.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_func.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_query.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_session.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.16.0 → datachain-0.16.1}/tests/utils.py +0 -0
|
@@ -94,7 +94,7 @@ dc.DataModel.register(MistralModel)
|
|
|
94
94
|
chain = (
|
|
95
95
|
dc
|
|
96
96
|
.read_storage("gs://datachain-demo/chatbot-KiT/", type="text")
|
|
97
|
-
.filter(dc.Column("file.
|
|
97
|
+
.filter(dc.Column("file.path").glob("*.txt"))
|
|
98
98
|
.limit(5)
|
|
99
99
|
.settings(parallel=4, cache=True)
|
|
100
100
|
.map(
|
|
@@ -228,7 +228,7 @@ Here is an example from MS COCO “captions” JSON which employs separate secti
|
|
|
228
228
|
|
|
229
229
|
Note how complicated the setup is. Every image is references by the name, and the metadata for this file is keyed by the “id” field. This same field is references later in the “annotations” array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
|
|
230
230
|
|
|
231
|
-
However,
|
|
231
|
+
However, DataChain can easily parse the entire COCO structure via several reading and merging operators:
|
|
232
232
|
|
|
233
233
|
```python
|
|
234
234
|
import datachain as dc
|
|
@@ -240,7 +240,7 @@ images = dc.read_storage(images_uri)
|
|
|
240
240
|
meta = dc.read_json(captions_uri, jmespath="images")
|
|
241
241
|
captions = dc.read_json(captions_uri, jmespath="annotations")
|
|
242
242
|
|
|
243
|
-
images_meta = images.merge(meta, on="file.
|
|
243
|
+
images_meta = images.merge(meta, on="file.path", right_on="images.file_name")
|
|
244
244
|
captioned_images = images_meta.merge(captions, on="images.id", right_on="annotations.image_id")
|
|
245
245
|
```
|
|
246
246
|
|
|
@@ -248,12 +248,12 @@ The resulting dataset has image entries as files decorated with all the metadata
|
|
|
248
248
|
|
|
249
249
|
```python
|
|
250
250
|
images_with_dogs = captioned_images.filter(dc.Column("annotations.caption").glob("*dog*"))
|
|
251
|
-
images_with_dogs.select("annotations", "file.
|
|
251
|
+
images_with_dogs.select("annotations", "file.path").show()
|
|
252
252
|
```
|
|
253
253
|
|
|
254
254
|
```
|
|
255
255
|
captions captions captions file
|
|
256
|
-
image_id id caption
|
|
256
|
+
image_id id caption path
|
|
257
257
|
0 17029 778902 a dog jumping to catch a frisbee in a yard 000000017029.jpg
|
|
258
258
|
1 17029 779838 A dog jumping to catch a red frisbee in a garden 000000017029.jpg
|
|
259
259
|
2 17029 781941 The dog is catching the Frisbee in mid air in ... 000000017029.jpg
|
|
@@ -184,7 +184,7 @@ chain = (
|
|
|
184
184
|
.save("response")
|
|
185
185
|
)
|
|
186
186
|
|
|
187
|
-
chain.select("file.
|
|
187
|
+
chain.select("file.path", "status", "response.usage").show(5)
|
|
188
188
|
|
|
189
189
|
success_rate = chain.filter(dc.Column("status") == "success").count() / chain.count()
|
|
190
190
|
print(f"{100*success_rate:.1f}% dialogs were successful")
|
|
@@ -194,7 +194,7 @@ Output:
|
|
|
194
194
|
|
|
195
195
|
``` shell
|
|
196
196
|
file status response response response
|
|
197
|
-
|
|
197
|
+
path usage usage usage
|
|
198
198
|
prompt_tokens total_tokens completion_tokens
|
|
199
199
|
0 1.txt success 547 548 1
|
|
200
200
|
1 10.txt failure 3576 3578 2
|
|
@@ -277,7 +277,7 @@ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
|
277
277
|
|
|
278
278
|
chain = (
|
|
279
279
|
dc.read_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
|
|
280
|
-
.map(label=lambda name: name.split(".")[0], params=["file.
|
|
280
|
+
.map(label=lambda name: name.split(".")[0], params=["file.path"])
|
|
281
281
|
.select("file", "label").to_pytorch(
|
|
282
282
|
transform=processor.image_processor,
|
|
283
283
|
tokenizer=processor.tokenizer,
|
|
@@ -165,7 +165,7 @@ def any_value(col: str) -> Func:
|
|
|
165
165
|
Example:
|
|
166
166
|
```py
|
|
167
167
|
dc.group_by(
|
|
168
|
-
file_example=func.any_value("file.
|
|
168
|
+
file_example=func.any_value("file.path"),
|
|
169
169
|
partition_by="signal.category",
|
|
170
170
|
)
|
|
171
171
|
```
|
|
@@ -227,7 +227,7 @@ def concat(col: str, separator="") -> Func:
|
|
|
227
227
|
Example:
|
|
228
228
|
```py
|
|
229
229
|
dc.group_by(
|
|
230
|
-
files=func.concat("file.
|
|
230
|
+
files=func.concat("file.path", separator=", "),
|
|
231
231
|
partition_by="signal.category",
|
|
232
232
|
)
|
|
233
233
|
```
|
|
@@ -343,7 +343,7 @@ def first(col: str) -> Func:
|
|
|
343
343
|
```py
|
|
344
344
|
window = func.window(partition_by="signal.category", order_by="created_at")
|
|
345
345
|
dc.mutate(
|
|
346
|
-
first_file=func.first("file.
|
|
346
|
+
first_file=func.first("file.path").over(window),
|
|
347
347
|
)
|
|
348
348
|
```
|
|
349
349
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import itertools
|
|
2
2
|
from collections.abc import Sequence
|
|
3
|
-
from typing import Any,
|
|
3
|
+
from typing import Any, Union
|
|
4
4
|
|
|
5
5
|
from datachain.lib.data_model import (
|
|
6
6
|
DataType,
|
|
@@ -71,14 +71,13 @@ def values_to_tuples( # noqa: C901, PLR0912
|
|
|
71
71
|
# If a non-None value appears early, it won't check the remaining items for
|
|
72
72
|
# `None` values.
|
|
73
73
|
try:
|
|
74
|
-
|
|
75
|
-
itertools.dropwhile(lambda
|
|
74
|
+
first_not_none_element = next(
|
|
75
|
+
itertools.dropwhile(lambda i: i is None, v)
|
|
76
76
|
)
|
|
77
77
|
except StopIteration:
|
|
78
|
-
|
|
79
|
-
|
|
78
|
+
# set default type to `str` if column is empty or all values are `None`
|
|
79
|
+
typ = str
|
|
80
80
|
else:
|
|
81
|
-
nullable = pos > 0
|
|
82
81
|
typ = type(first_not_none_element) # type: ignore[assignment]
|
|
83
82
|
if not is_chain_type(typ):
|
|
84
83
|
raise ValuesToTupleError(
|
|
@@ -88,8 +87,7 @@ def values_to_tuples( # noqa: C901, PLR0912
|
|
|
88
87
|
)
|
|
89
88
|
if isinstance(first_not_none_element, list):
|
|
90
89
|
typ = list[type(first_not_none_element[0])] # type: ignore[assignment, misc]
|
|
91
|
-
|
|
92
|
-
types_map[k] = Optional[typ] if nullable else typ # type: ignore[assignment]
|
|
90
|
+
types_map[k] = typ
|
|
93
91
|
|
|
94
92
|
if length < 0:
|
|
95
93
|
length = len_
|
|
@@ -756,7 +756,7 @@ class DataChain:
|
|
|
756
756
|
|
|
757
757
|
Example:
|
|
758
758
|
```py
|
|
759
|
-
dc.distinct("file.
|
|
759
|
+
dc.distinct("file.path")
|
|
760
760
|
```
|
|
761
761
|
"""
|
|
762
762
|
return self._evolve(
|
|
@@ -882,7 +882,7 @@ class DataChain:
|
|
|
882
882
|
```py
|
|
883
883
|
dc.mutate(
|
|
884
884
|
area=Column("image.height") * Column("image.width"),
|
|
885
|
-
extension=file_ext(Column("file.
|
|
885
|
+
extension=file_ext(Column("file.path")),
|
|
886
886
|
dist=cosine_distance(embedding_text, embedding_image)
|
|
887
887
|
)
|
|
888
888
|
```
|
|
@@ -1071,13 +1071,13 @@ class DataChain:
|
|
|
1071
1071
|
|
|
1072
1072
|
Iterating over all rows with selected columns:
|
|
1073
1073
|
```py
|
|
1074
|
-
for name, size in dc.collect("file.
|
|
1074
|
+
for name, size in dc.collect("file.path", "file.size"):
|
|
1075
1075
|
print(name, size)
|
|
1076
1076
|
```
|
|
1077
1077
|
|
|
1078
1078
|
Iterating over a single column:
|
|
1079
1079
|
```py
|
|
1080
|
-
for file in dc.collect("file.
|
|
1080
|
+
for file in dc.collect("file.path"):
|
|
1081
1081
|
print(file)
|
|
1082
1082
|
```
|
|
1083
1083
|
"""
|
|
@@ -1630,7 +1630,7 @@ class DataChain:
|
|
|
1630
1630
|
import datachain as dc
|
|
1631
1631
|
|
|
1632
1632
|
chain = dc.read_storage("s3://mybucket")
|
|
1633
|
-
chain = chain.filter(dc.C("file.
|
|
1633
|
+
chain = chain.filter(dc.C("file.path").glob("*.jsonl"))
|
|
1634
1634
|
chain = chain.parse_tabular(format="json")
|
|
1635
1635
|
```
|
|
1636
1636
|
"""
|
|
@@ -2089,25 +2089,31 @@ class DataChain:
|
|
|
2089
2089
|
|
|
2090
2090
|
Using glob to match patterns
|
|
2091
2091
|
```py
|
|
2092
|
-
dc.filter(C("file.
|
|
2092
|
+
dc.filter(C("file.path").glob("*.jpg"))
|
|
2093
|
+
```
|
|
2094
|
+
|
|
2095
|
+
Using in to match lists
|
|
2096
|
+
```py
|
|
2097
|
+
ids = [1,2,3]
|
|
2098
|
+
dc.filter(C("experiment_id").in_(ids))
|
|
2093
2099
|
```
|
|
2094
2100
|
|
|
2095
2101
|
Using `datachain.func`
|
|
2096
2102
|
```py
|
|
2097
2103
|
from datachain.func import string
|
|
2098
|
-
dc.filter(string.length(C("file.
|
|
2104
|
+
dc.filter(string.length(C("file.path")) > 5)
|
|
2099
2105
|
```
|
|
2100
2106
|
|
|
2101
2107
|
Combining filters with "or"
|
|
2102
2108
|
```py
|
|
2103
|
-
dc.filter(C("file.
|
|
2109
|
+
dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*"))
|
|
2104
2110
|
```
|
|
2105
2111
|
|
|
2106
2112
|
Combining filters with "and"
|
|
2107
2113
|
```py
|
|
2108
2114
|
dc.filter(
|
|
2109
|
-
C("file.
|
|
2110
|
-
(string.length(C("file.
|
|
2115
|
+
C("file.path").glob("*.jpg") &
|
|
2116
|
+
(string.length(C("file.path")) > 5)
|
|
2111
2117
|
)
|
|
2112
2118
|
```
|
|
2113
2119
|
"""
|
|
@@ -4,12 +4,9 @@ from typing import TYPE_CHECKING, Optional, Union
|
|
|
4
4
|
import sqlalchemy
|
|
5
5
|
|
|
6
6
|
from datachain.lib.data_model import DataType
|
|
7
|
-
from datachain.lib.file import
|
|
8
|
-
File,
|
|
9
|
-
)
|
|
7
|
+
from datachain.lib.file import File
|
|
10
8
|
from datachain.lib.signal_schema import SignalSchema
|
|
11
9
|
from datachain.query import Session
|
|
12
|
-
from datachain.query.schema import Column
|
|
13
10
|
|
|
14
11
|
if TYPE_CHECKING:
|
|
15
12
|
from typing_extensions import ParamSpec
|
|
@@ -41,6 +38,9 @@ def read_records(
|
|
|
41
38
|
single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
|
|
42
39
|
```
|
|
43
40
|
"""
|
|
41
|
+
from datachain.query.dataset import adjust_outputs, get_col_types
|
|
42
|
+
from datachain.sql.types import SQLType
|
|
43
|
+
|
|
44
44
|
from .datasets import read_dataset
|
|
45
45
|
|
|
46
46
|
session = Session.get(session, in_memory=in_memory)
|
|
@@ -52,11 +52,10 @@ def read_records(
|
|
|
52
52
|
|
|
53
53
|
if schema:
|
|
54
54
|
signal_schema = SignalSchema(schema)
|
|
55
|
-
columns = [
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
columns.append(sqlalchemy.Column(c.name, c.type, **kw))
|
|
55
|
+
columns = [
|
|
56
|
+
sqlalchemy.Column(c.name, c.type) # type: ignore[union-attr]
|
|
57
|
+
for c in signal_schema.db_signals(as_columns=True)
|
|
58
|
+
]
|
|
60
59
|
else:
|
|
61
60
|
columns = [
|
|
62
61
|
sqlalchemy.Column(name, typ)
|
|
@@ -83,6 +82,13 @@ def read_records(
|
|
|
83
82
|
warehouse = catalog.warehouse
|
|
84
83
|
dr = warehouse.dataset_rows(dsr)
|
|
85
84
|
table = dr.get_table()
|
|
86
|
-
|
|
85
|
+
|
|
86
|
+
# Optimization: Compute row types once, rather than for every row.
|
|
87
|
+
col_types = get_col_types(
|
|
88
|
+
warehouse,
|
|
89
|
+
{c.name: c.type for c in columns if isinstance(c.type, SQLType)},
|
|
90
|
+
)
|
|
91
|
+
records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
|
|
92
|
+
warehouse.insert_rows(table, records)
|
|
87
93
|
warehouse.insert_rows_done(table)
|
|
88
94
|
return read_dataset(name=dsr.name, session=session, settings=settings)
|
|
@@ -31,8 +31,8 @@ def resolve_columns(
|
|
|
31
31
|
) -> "Callable[Concatenate[D, P], D]":
|
|
32
32
|
"""Decorator that resolves input column names to their actual DB names. This is
|
|
33
33
|
especially important for nested columns as user works with them by using dot
|
|
34
|
-
notation e.g (file.
|
|
35
|
-
in DB, e.g
|
|
34
|
+
notation e.g (file.path) but are actually defined with default delimiter
|
|
35
|
+
in DB, e.g file__path.
|
|
36
36
|
If there are any sql functions in arguments, they will just be transferred as is
|
|
37
37
|
to a method.
|
|
38
38
|
"""
|
|
@@ -581,11 +581,7 @@ class SignalSchema:
|
|
|
581
581
|
signals = [
|
|
582
582
|
DEFAULT_DELIMITER.join(path)
|
|
583
583
|
if not as_columns
|
|
584
|
-
else Column(
|
|
585
|
-
DEFAULT_DELIMITER.join(path),
|
|
586
|
-
python_to_sql(_type),
|
|
587
|
-
nullable=is_optional(_type),
|
|
588
|
-
)
|
|
584
|
+
else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
|
|
589
585
|
for path, _type, has_subtree, _ in self.get_flat_tree(
|
|
590
586
|
include_hidden=include_hidden
|
|
591
587
|
)
|
|
@@ -994,8 +990,3 @@ class SignalSchema:
|
|
|
994
990
|
}
|
|
995
991
|
|
|
996
992
|
return SignalSchema.deserialize(schema)
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
def is_optional(type_: Any) -> bool:
|
|
1000
|
-
"""Check if a type is Optional."""
|
|
1001
|
-
return get_origin(type_) is Union and type(None) in get_args(type_)
|
|
@@ -57,6 +57,7 @@ from datachain.query.schema import C, UDFParamSpec, normalize_param
|
|
|
57
57
|
from datachain.query.session import Session
|
|
58
58
|
from datachain.query.udf import UdfInfo
|
|
59
59
|
from datachain.sql.functions.random import rand
|
|
60
|
+
from datachain.sql.types import SQLType
|
|
60
61
|
from datachain.utils import (
|
|
61
62
|
batched,
|
|
62
63
|
determine_processes,
|
|
@@ -67,6 +68,8 @@ from datachain.utils import (
|
|
|
67
68
|
)
|
|
68
69
|
|
|
69
70
|
if TYPE_CHECKING:
|
|
71
|
+
from collections.abc import Mapping
|
|
72
|
+
|
|
70
73
|
from sqlalchemy.sql.elements import ClauseElement
|
|
71
74
|
from sqlalchemy.sql.schema import Table
|
|
72
75
|
from sqlalchemy.sql.selectable import GenerativeSelect
|
|
@@ -273,7 +276,9 @@ class Subtract(DatasetDiffOperation):
|
|
|
273
276
|
|
|
274
277
|
|
|
275
278
|
def adjust_outputs(
|
|
276
|
-
warehouse: "AbstractWarehouse",
|
|
279
|
+
warehouse: "AbstractWarehouse",
|
|
280
|
+
row: dict[str, Any],
|
|
281
|
+
col_types: list[tuple[str, SQLType, type, str, Any]],
|
|
277
282
|
) -> dict[str, Any]:
|
|
278
283
|
"""
|
|
279
284
|
This function does a couple of things to prepare a row for inserting into the db:
|
|
@@ -289,7 +294,7 @@ def adjust_outputs(
|
|
|
289
294
|
col_python_type,
|
|
290
295
|
col_type_name,
|
|
291
296
|
default_value,
|
|
292
|
-
) in
|
|
297
|
+
) in col_types:
|
|
293
298
|
row_val = row.get(col_name)
|
|
294
299
|
|
|
295
300
|
# Fill None or missing values with defaults (get returns None if not in the row)
|
|
@@ -304,8 +309,10 @@ def adjust_outputs(
|
|
|
304
309
|
return row
|
|
305
310
|
|
|
306
311
|
|
|
307
|
-
def
|
|
308
|
-
""
|
|
312
|
+
def get_col_types(
|
|
313
|
+
warehouse: "AbstractWarehouse", output: "Mapping[str, Any]"
|
|
314
|
+
) -> list[tuple]:
|
|
315
|
+
"""Optimization: Precompute column types so these don't have to be computed
|
|
309
316
|
in the convert_type function for each row in a loop."""
|
|
310
317
|
dialect = warehouse.db.dialect
|
|
311
318
|
return [
|
|
@@ -317,7 +324,7 @@ def get_udf_col_types(warehouse: "AbstractWarehouse", udf: "UDFAdapter") -> list
|
|
|
317
324
|
type(col_type_inst).__name__,
|
|
318
325
|
col_type.default_value(dialect),
|
|
319
326
|
)
|
|
320
|
-
for col_name, col_type in
|
|
327
|
+
for col_name, col_type in output.items()
|
|
321
328
|
]
|
|
322
329
|
|
|
323
330
|
|
|
@@ -333,7 +340,7 @@ def process_udf_outputs(
|
|
|
333
340
|
|
|
334
341
|
rows: list[UDFResult] = []
|
|
335
342
|
# Optimization: Compute row types once, rather than for every row.
|
|
336
|
-
udf_col_types =
|
|
343
|
+
udf_col_types = get_col_types(warehouse, udf.output)
|
|
337
344
|
|
|
338
345
|
for udf_output in udf_results:
|
|
339
346
|
if not udf_output:
|
|
@@ -40,15 +40,12 @@ class ColumnMeta(type):
|
|
|
40
40
|
class Column(sa.ColumnClause, metaclass=ColumnMeta):
|
|
41
41
|
inherit_cache: Optional[bool] = True
|
|
42
42
|
|
|
43
|
-
def __init__(
|
|
44
|
-
self, text, type_=None, is_literal=False, nullable=None, _selectable=None
|
|
45
|
-
):
|
|
43
|
+
def __init__(self, text, type_=None, is_literal=False, _selectable=None):
|
|
46
44
|
"""Dataset column."""
|
|
47
45
|
self.name = ColumnMeta.to_db_name(text)
|
|
48
46
|
super().__init__(
|
|
49
47
|
self.name, type_=type_, is_literal=is_literal, _selectable=_selectable
|
|
50
48
|
)
|
|
51
|
-
self.nullable = nullable
|
|
52
49
|
|
|
53
50
|
def __getattr__(self, name: str):
|
|
54
51
|
return Column(self.name + DEFAULT_DELIMITER + name)
|
|
@@ -1,15 +1,15 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import os
|
|
2
3
|
import sqlite3
|
|
3
4
|
from contextlib import closing
|
|
4
|
-
from typing import Optional
|
|
5
5
|
|
|
6
6
|
import pytest
|
|
7
7
|
import sqlalchemy
|
|
8
8
|
from sqlalchemy.orm import Session
|
|
9
9
|
|
|
10
10
|
from datachain import read_database
|
|
11
|
+
from datachain.data_storage.sqlite import SQLiteWarehouse
|
|
11
12
|
from datachain.lib.dc import database
|
|
12
|
-
from tests.utils import skip_if_not_sqlite
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
@pytest.fixture
|
|
@@ -81,14 +81,7 @@ def test(sqlite3_connection, connection, test_session):
|
|
|
81
81
|
]
|
|
82
82
|
|
|
83
83
|
|
|
84
|
-
|
|
85
|
-
# nullable, setting `nullable=True` is not enough.
|
|
86
|
-
# https://github.com/xzkostyan/clickhouse-sqlalchemy/issues/189#issuecomment-1274736713
|
|
87
|
-
# Also, was not able to figure out how to read nullable columns back from clickhouse.
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
@skip_if_not_sqlite
|
|
91
|
-
def test_nullable(sqlite3_connection, test_session):
|
|
84
|
+
def test_nullable(sqlite3_connection, test_session, warehouse):
|
|
92
85
|
"""
|
|
93
86
|
Verify that a column containing a sequence of NULL values is handled correctly
|
|
94
87
|
when the number of leading NULLs is less than `infer_schema_length`.
|
|
@@ -101,14 +94,14 @@ def test_nullable(sqlite3_connection, test_session):
|
|
|
101
94
|
sqlite3_connection.commit()
|
|
102
95
|
|
|
103
96
|
chain = read_database("select * from tbl", sqlite3_connection, session=test_session)
|
|
104
|
-
assert chain.schema == {"id": int, "value":
|
|
97
|
+
assert chain.schema == {"id": int, "value": str}
|
|
98
|
+
default_value = None if isinstance(warehouse, SQLiteWarehouse) else ""
|
|
105
99
|
assert sorted(chain.to_records(), key=lambda r: r["id"]) == [
|
|
106
|
-
{"id": i, "value":
|
|
100
|
+
{"id": i, "value": default_value if i < 50 else str(i)} for i in range(1, 1000)
|
|
107
101
|
]
|
|
108
102
|
|
|
109
103
|
|
|
110
|
-
|
|
111
|
-
def test_all_null_values(sqlite3_connection, test_session):
|
|
104
|
+
def test_all_null_values(sqlite3_connection, test_session, warehouse):
|
|
112
105
|
sqlite3_connection.execute("CREATE TABLE tbl (id INTEGER PRIMARY KEY, num INTEGER)")
|
|
113
106
|
sqlite3_connection.executemany(
|
|
114
107
|
"INSERT INTO tbl(num) VALUES(?)", [(None,) for _ in range(1, 1000)]
|
|
@@ -117,9 +110,10 @@ def test_all_null_values(sqlite3_connection, test_session):
|
|
|
117
110
|
|
|
118
111
|
chain = read_database("select * from tbl", sqlite3_connection, session=test_session)
|
|
119
112
|
# if all values are null, the column type defaults to str
|
|
120
|
-
assert chain.schema == {"id": int, "num":
|
|
113
|
+
assert chain.schema == {"id": int, "num": str}
|
|
114
|
+
default_value = None if isinstance(warehouse, SQLiteWarehouse) else ""
|
|
121
115
|
assert sorted(chain.to_records(), key=lambda r: r["id"]) == [
|
|
122
|
-
{"id": i, "num":
|
|
116
|
+
{"id": i, "num": default_value} for i in range(1, 1000)
|
|
123
117
|
]
|
|
124
118
|
|
|
125
119
|
|
|
@@ -128,7 +122,7 @@ def test_empty(sqlite3_connection, test_session):
|
|
|
128
122
|
|
|
129
123
|
chain = read_database("select * from tbl", sqlite3_connection, session=test_session)
|
|
130
124
|
# if the table is empty, the column type defaults to str
|
|
131
|
-
assert chain.schema == {"id":
|
|
125
|
+
assert chain.schema == {"id": str, "value": str}
|
|
132
126
|
assert chain.to_records() == []
|
|
133
127
|
|
|
134
128
|
|
|
@@ -173,3 +167,23 @@ def test_schema_is_not_inferred_when_all_types_are_provided(
|
|
|
173
167
|
)
|
|
174
168
|
spy.assert_called_once_with(mocker.ANY, [], 100)
|
|
175
169
|
assert chain.schema == {"id": int, "value": int}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def test_json_type(sqlite3_connection, test_session):
|
|
173
|
+
sqlite3_connection.execute("CREATE TABLE tbl (id INTEGER PRIMARY KEY, value TEXT)")
|
|
174
|
+
sqlite3_connection.executemany(
|
|
175
|
+
"INSERT INTO tbl(value) VALUES(?)",
|
|
176
|
+
[(json.dumps({"i": i}),) for i in range(1, 10)],
|
|
177
|
+
)
|
|
178
|
+
sqlite3_connection.commit()
|
|
179
|
+
|
|
180
|
+
chain = read_database(
|
|
181
|
+
"select * from tbl",
|
|
182
|
+
sqlite3_connection,
|
|
183
|
+
output={"value": dict},
|
|
184
|
+
session=test_session,
|
|
185
|
+
)
|
|
186
|
+
assert chain.schema == {"id": int, "value": dict}
|
|
187
|
+
assert sorted(chain.to_records(), key=lambda r: r["id"]) == [
|
|
188
|
+
{"id": i, "value": {"i": i}} for i in range(1, 10)
|
|
189
|
+
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|