datachain 0.14.4__tar.gz → 0.14.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- {datachain-0.14.4 → datachain-0.14.5}/.pre-commit-config.yaml +1 -1
- {datachain-0.14.4/src/datachain.egg-info → datachain-0.14.5}/PKG-INFO +3 -3
- {datachain-0.14.4 → datachain-0.14.5}/README.rst +2 -2
- {datachain-0.14.4 → datachain-0.14.5}/docs/quick-start.md +4 -4
- {datachain-0.14.4 → datachain-0.14.5}/examples/get_started/json-csv-reader.py +2 -2
- {datachain-0.14.4 → datachain-0.14.5}/examples/multimodal/hf_pipeline.py +1 -1
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/catalog/catalog.py +3 -5
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/data_storage/schema.py +21 -23
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/data_storage/sqlite.py +1 -1
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/data_storage/warehouse.py +6 -8
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/csv.py +3 -3
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/datachain.py +10 -10
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/datasets.py +27 -10
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/hf.py +5 -5
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/json.py +7 -7
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/listings.py +3 -3
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/pandas.py +5 -5
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/parquet.py +3 -3
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/storage.py +6 -6
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/values.py +3 -3
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/listing.py +2 -2
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/signal_schema.py +24 -9
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/listing.py +4 -4
- {datachain-0.14.4 → datachain-0.14.5/src/datachain.egg-info}/PKG-INFO +3 -3
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_data_storage.py +1 -1
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_datachain.py +3 -3
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_datachain.py +36 -32
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_signal_schema.py +2 -1
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_listing.py +1 -1
- {datachain-0.14.4 → datachain-0.14.5}/.cruft.json +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.gitattributes +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.github/codecov.yaml +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.github/dependabot.yml +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.github/workflows/release.yml +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.github/workflows/tests.yml +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/.gitignore +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/LICENSE +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/assets/datachain.svg +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/contributing.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/examples.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/index.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/overrides/main.html +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/data-types/file.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/data-types/index.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/data-types/pose.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/data-types/segment.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/datachain.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/func.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/index.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/remotes.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/toolkit.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/torch.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/references/udf.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/docs/tutorials.md +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/multimodal/wds.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/mkdocs.yml +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/noxfile.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/pyproject.toml +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/setup.cfg +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/__main__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/asyn.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cache.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/cli/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/client/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/client/azure.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/client/gcs.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/client/hf.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/client/local.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/client/s3.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/config.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/dataset.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/error.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/fs/reference.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/fs/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/func/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/func/array.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/func/base.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/func/conditional.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/func/func.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/func/numeric.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/func/path.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/func/random.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/func/string.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/func/window.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/job.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/clip.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/file.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/hf.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/image.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/settings.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/tar.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/text.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/udf.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/video.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/model/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/model/bbox.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/model/pose.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/model/segment.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/model/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/node.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/progress.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/py.typed +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/query/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/query/batch.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/query/dataset.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/query/metrics.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/query/params.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/query/queue.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/query/schema.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/query/session.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/query/udf.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/query/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/remote/studio.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/script_meta.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/types.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/sql/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/studio.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/telemetry.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/conftest.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/data.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/examples/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/examples/test_examples.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/examples/wds_data.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/data/lena.jpg +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/model/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_catalog.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_client.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_datasets.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_file.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_hf.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_image.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_listing.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_ls.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_metrics.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_pull.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_pytorch.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_query.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_session.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_toolkit.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_video.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/func/test_warehouse.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/scripts/feature_class.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/test_atomicity.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/test_cli_e2e.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/test_cli_studio.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/test_import_time.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/test_query_e2e.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/test_telemetry.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/model/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_asyn.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_cache.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_catalog.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_client.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_config.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_dataset.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_func.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_metastore.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_query.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_query_params.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_serializer.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_session.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_utils.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.14.4 → datachain-0.14.5}/tests/utils.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.14.4
+Version: 0.14.5
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -171,7 +171,7 @@ high confidence scores.
 
 import datachain as dc
 
-meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json",
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
 images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
 images_id = images.map(id=lambda file: file.path.split('.')[-2])
@@ -213,7 +213,7 @@ Python code:
     return result.lower().startswith("success")
 
 chain = (
-    dc.read_storage("gs://datachain-demo/chatbot-KiT/",
+    dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
    .settings(parallel=4, cache=True)
    .map(is_success=eval_dialogue)
    .save("mistral_files")

{datachain-0.14.4 → datachain-0.14.5}/README.rst

@@ -60,7 +60,7 @@ high confidence scores.
 
 import datachain as dc
 
-meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json",
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
 images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
 images_id = images.map(id=lambda file: file.path.split('.')[-2])
@@ -102,7 +102,7 @@ Python code:
     return result.lower().startswith("success")
 
 chain = (
-    dc.read_storage("gs://datachain-demo/chatbot-KiT/",
+    dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
    .settings(parallel=4, cache=True)
    .map(is_success=eval_dialogue)
    .save("mistral_files")

{datachain-0.14.4 → datachain-0.14.5}/docs/quick-start.md

@@ -39,7 +39,7 @@ using JSON metadata:
 ``` py
 import datachain as dc
 
-meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json",
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
 images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
 images_id = images.map(id=lambda file: file.path.split('.')[-2])
@@ -78,7 +78,7 @@ def is_positive_dialogue_ending(file) -> bool:
 
 chain = (
     dc.read_storage("gs://datachain-demo/chatbot-KiT/",
-
+                    column="file", type="text", anon=True)
    .settings(parallel=8, cache=True)
    .map(is_positive=is_positive_dialogue_ending)
    .save("file_response")
@@ -132,7 +132,7 @@ def eval_dialogue(file: dc.File) -> bool:
     return result.lower().startswith("success")
 
 chain = (
-    dc.read_storage("gs://datachain-demo/chatbot-KiT/",
+    dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
    .map(is_success=eval_dialogue)
    .save("mistral_files")
 )
@@ -177,7 +177,7 @@ def eval_dialog(file: dc.File) -> ChatCompletionResponse:
                   {"role": "user", "content": file.read()}])
 
 chain = (
-    dc.read_storage("gs://datachain-demo/chatbot-KiT/",
+    dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
    .settings(parallel=4, cache=True)
    .map(response=eval_dialog)
    .map(status=lambda response: response.choices[0].message.content.lower()[:7])

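For orientation, a minimal sketch of what the renamed `column=` argument does in these examples (a sketch, assuming the public demo bucket is still readable anonymously; `print_schema` is only used here to inspect the result):

    import datachain as dc

    # Parsed JSON fields are nested under the "meta" column instead of a
    # name derived from the input format.
    meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
    meta.print_schema()
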
{datachain-0.14.4 → datachain-0.14.5}/examples/get_started/json-csv-reader.py

@@ -63,13 +63,13 @@ def main():
 
     # Static CSV with header schema test parsing 3.5K objects
     uri = "gs://datachain-demo/chatbot-csv/"
-    static_csv_ds = dc.read_csv(uri, output=ChatDialog,
+    static_csv_ds = dc.read_csv(uri, output=ChatDialog, column="chat", anon="True")
     static_csv_ds.print_schema()
     static_csv_ds.show()
 
     # Dynamic CSV with header schema test parsing 3/3M objects
     uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
-    dynamic_csv_ds = dc.read_csv(uri,
+    dynamic_csv_ds = dc.read_csv(uri, column="laion", nrows=3, anon="True")
     dynamic_csv_ds.print_schema()
     dynamic_csv_ds.show()
 

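A standalone sketch of the same `read_csv` signature, with a hypothetical local path and column name (assumes the 0.14.5 signature shown above):

    import datachain as dc

    # Parsed CSV fields are nested under the "chat" column; nrows limits parsing.
    ds = dc.read_csv("data/dialogs.csv", column="chat", nrows=100)
    ds.print_schema()
    ds.show()
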
{datachain-0.14.4 → datachain-0.14.5}/src/datachain/catalog/catalog.py

@@ -580,15 +580,13 @@ class Catalog:
         source: str,
         update=False,
         client_config=None,
-
+        column="file",
         skip_indexing=False,
     ) -> tuple[Optional["Listing"], "Client", str]:
         from datachain import read_storage
         from datachain.listing import Listing
 
-        read_storage(
-            source, session=self.session, update=update, object_name=object_name
-        ).exec()
+        read_storage(source, session=self.session, update=update, column=column).exec()
 
         list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
@@ -602,7 +600,7 @@ class Catalog:
             self.warehouse.clone(),
             client,
             dataset_name=list_ds_name,
-
+            column=column,
         )
 
         return lst, client, list_path

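A rough user-level equivalent of what `enlist_source` now runs internally (a sketch; `.exec()` just forces the listing, as in the code above):

    import datachain as dc

    # Re-list a bucket; file signals go under the default "file" column.
    dc.read_storage("gs://datachain-demo/dogs-and-cats/", update=True, anon=True).exec()
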
{datachain-0.14.4 → datachain-0.14.5}/src/datachain/data_storage/schema.py

@@ -30,8 +30,8 @@ if TYPE_CHECKING:
 DEFAULT_DELIMITER = "__"
 
 
-def col_name(name: str,
-    return f"{
+def col_name(name: str, column: str = "file") -> str:
+    return f"{column}{DEFAULT_DELIMITER}{name}"
 
 
 def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
@@ -84,19 +84,19 @@ def convert_rows_custom_column_types(
 
 
 class DirExpansion:
-    def __init__(self,
-        self.
+    def __init__(self, column: str):
+        self.column = column
 
-    def col_name(self, name: str,
-
-        return col_name(name,
+    def col_name(self, name: str, column: Optional[str] = None) -> str:
+        column = column or self.column
+        return col_name(name, column)
 
-    def c(self, query, name: str,
-        return getattr(query.c, self.col_name(name,
+    def c(self, query, name: str, column: Optional[str] = None) -> str:
+        return getattr(query.c, self.col_name(name, column=column))
 
     def base_select(self, q):
         return sa.select(
-            self.c(q, "id",
+            self.c(q, "id", column="sys"),
             false().label(self.col_name("is_dir")),
             self.c(q, "source"),
             self.c(q, "path"),
@@ -153,12 +153,12 @@ class DataTable:
         name: str,
         engine: "DatabaseEngine",
         column_types: Optional[dict[str, SQLType]] = None,
-
+        column: str = "file",
     ):
         self.name: str = name
         self.engine = engine
         self.column_types: dict[str, SQLType] = column_types or {}
-        self.
+        self.column = column
 
     @staticmethod
     def copy_column(
@@ -224,18 +224,16 @@ class DataTable:
     def columns(self) -> "ReadOnlyColumnCollection[str, sa.Column[Any]]":
         return self.table.columns
 
-    def col_name(self, name: str,
-
-        return col_name(name,
+    def col_name(self, name: str, column: Optional[str] = None) -> str:
+        column = column or self.column
+        return col_name(name, column)
 
-    def without_object(
-
-
-        object_name = object_name or self.object_name
-        return column_name.removeprefix(f"{object_name}{DEFAULT_DELIMITER}")
+    def without_object(self, column_name: str, column: Optional[str] = None) -> str:
+        column = column or self.column
+        return column_name.removeprefix(f"{column}{DEFAULT_DELIMITER}")
 
-    def c(self, name: str,
-        return getattr(self.columns, self.col_name(name,
+    def c(self, name: str, column: Optional[str] = None):
+        return getattr(self.columns, self.col_name(name, column=column))
 
     @property
     def table(self) -> "sa.Table":
@@ -275,7 +273,7 @@ class DataTable:
         ]
 
     def dir_expansion(self):
-        return DirExpansion(self.
+        return DirExpansion(self.column)
 
 
 PARTITION_COLUMN_ID = "partition_id"

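The flattened column naming these helpers produce can be checked in isolation; a self-contained sketch mirroring the renamed `col_name` helper above:

    DEFAULT_DELIMITER = "__"

    def col_name(name: str, column: str = "file") -> str:
        # "<column>__<signal>", e.g. file__path or sys__id
        return f"{column}{DEFAULT_DELIMITER}{name}"

    assert col_name("path") == "file__path"
    assert col_name("id", column="sys") == "sys__id"
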
{datachain-0.14.4 → datachain-0.14.5}/src/datachain/data_storage/sqlite.py

@@ -489,7 +489,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         self, dataset: DatasetRecord, version: int
     ) -> list[StorageURI]:
         dr = self.dataset_rows(dataset, version)
-        query = dr.select(dr.c("source",
+        query = dr.select(dr.c("source", column="file")).distinct()
         cur = self.db.cursor()
         cur.row_factory = sqlite3.Row  # type: ignore[assignment]
 

{datachain-0.14.4 → datachain-0.14.5}/src/datachain/data_storage/warehouse.py

@@ -179,7 +179,7 @@ class AbstractWarehouse(ABC, Serializable):
         self,
         dataset: DatasetRecord,
         version: Optional[int] = None,
-
+        column: str = "file",
     ):
         version = version or dataset.latest_version
 
@@ -188,7 +188,7 @@ class AbstractWarehouse(ABC, Serializable):
             table_name,
             self.db,
             dataset.get_schema(version),
-
+            column=column,
         )
 
     @property
@@ -487,7 +487,7 @@ class AbstractWarehouse(ABC, Serializable):
         dataset_rows: "DataTable",
         path_list: list[str],
         glob_name: str,
-
+        column="file",
     ) -> Iterator[Node]:
         """Finds all Nodes that correspond to GLOB like path pattern."""
         dr = dataset_rows
@@ -521,7 +521,7 @@ class AbstractWarehouse(ABC, Serializable):
         de = dr.dir_expansion()
         q = de.query(
             dr.select().where(dr.c("is_latest") == true()).subquery(),
-
+            column=dr.column,
         ).subquery()
         q = self.expand_query(de, q, dr)
 
@@ -597,12 +597,10 @@ class AbstractWarehouse(ABC, Serializable):
             with_default(dr.c("is_latest")),
             dr.c("last_modified"),
             with_default(dr.c("size")),
-            with_default(dr.c("rand",
+            with_default(dr.c("rand", column="sys")),
             dr.c("location"),
             de.c(q, "source"),
-        ).select_from(
-            q.outerjoin(dr.table, q.c.sys__id == dr.c("id", object_name="sys"))
-        )
+        ).select_from(q.outerjoin(dr.table, q.c.sys__id == dr.c("id", column="sys")))
 
     def get_node_by_path(self, dataset_rows: "DataTable", path: str) -> Node:
         """Gets node that corresponds to some path"""

{datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/csv.py

@@ -21,7 +21,7 @@ def read_csv(
     delimiter: Optional[str] = None,
     header: bool = True,
     output: OutputType = None,
-
+    column: str = "",
     model_name: str = "",
     source: bool = True,
     nrows=None,
@@ -42,7 +42,7 @@ def read_csv(
         output : Dictionary or feature class defining column names and their
             corresponding types. List of column names is also accepted, in which
             case types will be inferred.
-
+        column : Created column name.
         model_name : Generated model name.
         source : Whether to include info about the source file.
         nrows : Optional row limit.
@@ -119,7 +119,7 @@ def read_csv(
     )
     return chain.parse_tabular(
         output=output,
-
+        column=column,
         model_name=model_name,
         source=source,
         nrows=nrows,

{datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/datachain.py

@@ -357,7 +357,7 @@ class DataChain:
         self,
         col: str,
         model_name: Optional[str] = None,
-
+        column: Optional[str] = None,
         schema_sample_size: int = 1,
     ) -> "DataChain":
         """Explodes a column containing JSON objects (dict or str DataChain type) into
@@ -368,7 +368,7 @@ class DataChain:
             col: the name of the column containing JSON to be exploded.
             model_name: optional generated model name. By default generates the name
                 automatically.
-
+            column: optional generated column name. By default generates the
                 name automatically.
             schema_sample_size: the number of rows to use for inferring the schema of
                 the JSON (in case some fields are optional and it's not enough to
@@ -406,10 +406,10 @@ class DataChain:
             )
             return model.model_validate(json_dict)
 
-        if not
-
+        if not column:
+            column = f"{col}_expl"
 
-        return self.map(json_to_model, params=col, output={
+        return self.map(json_to_model, params=col, output={column: model})
 
     @classmethod
     def datasets(
@@ -1588,7 +1588,7 @@ class DataChain:
     def parse_tabular(
         self,
         output: OutputType = None,
-
+        column: str = "",
         model_name: str = "",
         source: bool = True,
         nrows: Optional[int] = None,
@@ -1600,7 +1600,7 @@ class DataChain:
             output : Dictionary or feature class defining column names and their
                 corresponding types. List of column names is also accepted, in which
                 case types will be inferred.
-
+            column : Generated column name.
             model_name : Generated model name.
             source : Whether to include info about the source file.
             nrows : Optional row limit.
@@ -1651,14 +1651,14 @@ class DataChain:
             raise DatasetPrepareError(self.name, e) from e
 
         if isinstance(output, dict):
-            model_name = model_name or
+            model_name = model_name or column or ""
             model = dict_to_data_model(model_name, output)
             output = model
         else:
             model = output  # type: ignore[assignment]
 
-        if
-            output = {
+        if column:
+            output = {column: model}  # type: ignore[dict-item]
         elif isinstance(output, type(BaseModel)):
             output = {
                 name: info.annotation  # type: ignore[misc]

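A sketch of `explode` with the renamed argument, using hypothetical inline values (by default the exploded column would be named `<col>_expl`, per the code above):

    import datachain as dc

    # Each row carries a JSON string in "payload"; explode it into a typed column.
    chain = dc.read_values(payload=['{"a": 1, "b": "x"}', '{"a": 2, "b": "y"}'])
    exploded = chain.explode("payload", column="rec")  # default name would be "payload_expl"
    exploded.print_schema()
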
{datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/datasets.py

@@ -1,7 +1,4 @@
-from typing import
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING, Optional, get_origin, get_type_hints
 
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
@@ -102,7 +99,7 @@ def datasets(
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     in_memory: bool = False,
-
+    column: Optional[str] = None,
     include_listing: bool = False,
     studio: bool = False,
 ) -> "DataChain":
@@ -112,7 +109,8 @@ def datasets(
         session: Optional session instance. If not provided, uses default session.
         settings: Optional dictionary of settings to configure the chain.
         in_memory: If True, creates an in-memory session. Defaults to False.
-
+        column: Name of the output column in the chain. Defaults to None which
+            means no top level column will be created.
         include_listing: If True, includes listing datasets. Defaults to False.
         studio: If True, returns datasets from Studio only,
             otherwise returns all local datasets. Defaults to False.
@@ -124,7 +122,7 @@ def datasets(
         ```py
         import datachain as dc
 
-        chain = dc.datasets()
+        chain = dc.datasets(column="dataset")
         for ds in chain.collect("dataset"):
             print(f"{ds.name}@v{ds.version}")
         ```
@@ -139,13 +137,32 @@ def datasets(
             include_listing=include_listing, studio=studio
         )
     ]
-
     datasets_values = [d for d in datasets_values if not d.is_temp]
 
+    if not column:
+        # flattening dataset fields
+        schema = {
+            k: get_origin(v) if get_origin(v) is dict else v
+            for k, v in get_type_hints(DatasetInfo).items()
+            if k in DatasetInfo.model_fields
+        }
+        data = {k: [] for k in DatasetInfo.model_fields}  # type: ignore[var-annotated]
+        for d in [d.model_dump() for d in datasets_values]:
+            for field, value in d.items():
+                data[field].append(value)
+
+        return read_values(
+            session=session,
+            settings=settings,
+            in_memory=in_memory,
+            output=schema,
+            **data,  # type: ignore[arg-type]
+        )
+
     return read_values(
         session=session,
         settings=settings,
         in_memory=in_memory,
-        output={
-        **{
+        output={column: DatasetInfo},
+        **{column: datasets_values},  # type: ignore[arg-type]
     )

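The caller-visible effect of the new `column` handling in `datasets()`, sketched from the docstring above (dataset names are whatever exists locally):

    import datachain as dc

    # Default: dataset fields are flattened into top-level columns.
    dc.datasets().show()

    # Opt back into a single nested column, as in the updated docstring example:
    chain = dc.datasets(column="dataset")
    for ds in chain.collect("dataset"):
        print(f"{ds.name}@v{ds.version}")
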
{datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/hf.py

@@ -23,7 +23,7 @@ def read_hf(
     *args,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
-
+    column: str = "",
     model_name: str = "",
     **kwargs,
 ) -> "DataChain":
@@ -34,7 +34,7 @@ def read_hf(
             or an instance of `datasets.Dataset`-like object.
         session : Session to use for the chain.
         settings : Settings to use for the chain.
-
+        column : Generated object column name.
         model_name : Generated model name.
         kwargs : Parameters to pass to datasets.load_dataset.
 
@@ -62,12 +62,12 @@ def read_hf(
     if len(ds_dict) > 1:
         output = {"split": str}
 
-    model_name = model_name or
+    model_name = model_name or column or ""
     hf_features = next(iter(ds_dict.values())).features
     output = output | get_output_schema(hf_features)
     model = dict_to_data_model(model_name, output)
-    if
-        output = {
+    if column:
+        output = {column: model}
 
     chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
     return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)

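A sketch of `read_hf` with the renamed argument (hypothetical Hugging Face dataset id; requires the `datasets` package):

    import datachain as dc

    # With column set, generated rows are nested under "row"; with the default
    # column="" they stay top-level.
    chain = dc.read_hf("beans", split="train", column="row")
    chain.print_schema()
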
{datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/json.py

@@ -28,7 +28,7 @@ def read_json(
     spec: Optional[DataType] = None,
     schema_from: Optional[str] = "auto",
     jmespath: Optional[str] = None,
-
+    column: Optional[str] = "",
     model_name: Optional[str] = None,
     format: Optional[str] = "json",
     nrows=None,
@@ -42,7 +42,7 @@ def read_json(
         type : read file as "binary", "text", or "image" data. Default is "text".
         spec : optional Data Model
         schema_from : path to sample to infer spec (if schema not provided)
-
+        column : generated column name
         model_name : optional generated model name
         format: "json", "jsonl"
         jmespath : optional JMESPATH expression to reduce JSON
@@ -70,13 +70,13 @@ def read_json(
         name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
         return s[:name_end]
 
-    if (not
-
-    if not
-
+    if (not column) and jmespath:
+        column = jmespath_to_name(jmespath)
+    if not column:
+        column = format
     chain = read_storage(uri=path, type=type, **kwargs)
     signal_dict = {
-
+        column: read_meta(
             schema_from=schema_from,
             format=format,
             spec=spec,

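The column-name fallback in `read_json` is small enough to sketch on its own; this mirrors the logic above (an explicit `column` wins, then a name derived from `jmespath`, then the format):

    import re
    from typing import Optional

    def jmespath_to_name(s: str) -> str:
        # first run of word characters in the JMESPATH expression
        m = re.search(r"\W", s)
        return s[: m.start()] if m else s

    def default_column(column: Optional[str], jmespath: Optional[str], format: str) -> str:
        if (not column) and jmespath:
            column = jmespath_to_name(jmespath)
        if not column:
            column = format
        return column

    assert default_column("", "items[0]", "json") == "items"
    assert default_column("", None, "jsonl") == "jsonl"
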
{datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/listings.py

@@ -19,7 +19,7 @@ if TYPE_CHECKING:
 def listings(
     session: Optional[Session] = None,
     in_memory: bool = False,
-
+    column: str = "listing",
     **kwargs,
 ) -> "DataChain":
     """Generate chain with list of cached listings.
@@ -38,6 +38,6 @@ def listings(
     return read_values(
         session=session,
         in_memory=in_memory,
-        output={
-        **{
+        output={column: ListingInfo},
+        **{column: catalog.listings()},  # type: ignore[arg-type]
     )

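Caller-side sketch of the new default (`column="listing"`), assuming `listings` is exported at the package top level like `datasets`:

    import datachain as dc

    # Cached listings are exposed under the "listing" column by default.
    for info in dc.listings().collect("listing"):
        print(info)
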
{datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/pandas.py

@@ -22,7 +22,7 @@ def read_pandas(  # type: ignore[override]
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     in_memory: bool = False,
-
+    column: str = "",
 ) -> "DataChain":
     """Generate chain from pandas data-frame.
 
@@ -39,18 +39,18 @@ def read_pandas(  # type: ignore[override]
 
     fr_map = {col.lower(): df[col].tolist() for col in df.columns}
 
-    for
-        if not
+    for c in fr_map:
+        if not c.isidentifier():
             raise DatasetPrepareError(
                 name,
-                f"import from pandas error - '{
+                f"import from pandas error - '{c}' cannot be a column name",
             )
 
     return read_values(
         name,
         session,
         settings=settings,
-
+        column=column,
         in_memory=in_memory,
         **fr_map,
     )

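A sketch of `read_pandas` with the renamed argument (hypothetical frame; requires pandas). Note the check above: lower-cased frame column names must be valid Python identifiers or `DatasetPrepareError` is raised:

    import pandas as pd
    import datachain as dc

    df = pd.DataFrame({"name": ["a", "b"], "size": [1, 2]})

    # Frame columns are nested under the "row" column.
    chain = dc.read_pandas(df, column="row")
    chain.print_schema()
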
{datachain-0.14.4 → datachain-0.14.5}/src/datachain/lib/dc/parquet.py

@@ -19,7 +19,7 @@ def read_parquet(
     path,
     partitioning: Any = "hive",
     output: Optional[dict[str, DataType]] = None,
-
+    column: str = "",
     model_name: str = "",
     source: bool = True,
     session: Optional[Session] = None,
@@ -33,7 +33,7 @@ def read_parquet(
         as `s3://`, `gs://`, `az://` or "file:///".
         partitioning : Any pyarrow partitioning schema.
         output : Dictionary defining column names and their corresponding types.
-
+        column : Created column name.
         model_name : Generated model name.
         source : Whether to include info about the source file.
         session : Session to use for the chain.
@@ -57,7 +57,7 @@ def read_parquet(
     chain = read_storage(path, session=session, settings=settings, **kwargs)
     return chain.parse_tabular(
         output=output,
-
+        column=column,
         model_name=model_name,
         source=source,
         format="parquet",