datachain 0.3.14__tar.gz → 0.3.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- {datachain-0.3.14/src/datachain.egg-info → datachain-0.3.16}/PKG-INFO +1 -1
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/catalog/catalog.py +18 -29
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/client/fsspec.py +9 -8
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/data_storage/sqlite.py +19 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/data_storage/warehouse.py +19 -3
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/dataset.py +1 -1
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/arrow.py +51 -16
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/dc.py +7 -2
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/file.py +76 -2
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/hf.py +23 -6
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/listing.py +7 -5
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/listing_info.py +2 -2
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/signal_schema.py +11 -2
- datachain-0.3.16/src/datachain/lib/tar.py +33 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/webdataset.py +3 -59
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/query/dataset.py +40 -25
- {datachain-0.3.14 → datachain-0.3.16/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_datachain.py +34 -2
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_dataset_query.py +117 -19
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_datasets.py +4 -3
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_listing.py +2 -1
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_arrow.py +24 -5
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_datachain.py +2 -2
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_datachain_bootstrap.py +38 -19
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_file.py +84 -1
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_hf.py +8 -8
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_signal_schema.py +16 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_client.py +32 -24
- {datachain-0.3.14 → datachain-0.3.16}/.cruft.json +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.gitattributes +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.github/codecov.yaml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.github/dependabot.yml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.github/workflows/release.yml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.github/workflows/tests.yml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.gitignore +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/.pre-commit-config.yaml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/LICENSE +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/README.rst +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/assets/datachain.svg +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/index.md +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/references/datachain.md +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/references/datatype.md +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/references/file.md +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/references/index.md +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/references/sql.md +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/references/torch.md +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/docs/references/udf.md +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/mkdocs.yml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/noxfile.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/pyproject.toml +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/setup.cfg +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/__main__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/asyn.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/cache.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/cli.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/client/hf.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/client/local.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/config.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/error.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/job.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/listing.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/node.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/progress.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/py.typed +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/query/builtins.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/query/params.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/query/schema.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/query/session.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/storage.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain/utils.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/conftest.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/data.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/examples/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_catalog.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_client.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_ls.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_metrics.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_pull.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/func/test_query.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_session.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_udf.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.14 → datachain-0.3.16}/tests/utils.py +0 -0
src/datachain/catalog/catalog.py

```diff
@@ -621,10 +621,6 @@ class Catalog:
         code_ast.body[-1:] = new_expressions
         return code_ast
 
-    def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
-        config = config or self.client_config
-        return Client.parse_url(uri, self.cache, **config)
-
     def get_client(self, uri: StorageURI, **config: Any) -> Client:
         """
         Return the client corresponding to the given source `uri`.
@@ -651,17 +647,16 @@ class Catalog:
         partial_path: Optional[str]
 
         client_config = client_config or self.client_config
-
+        uri, path = Client.parse_url(source)
+        client = Client.get_client(source, self.cache, **client_config)
         stem = os.path.basename(os.path.normpath(path))
         prefix = (
             posixpath.dirname(path)
             if glob.has_magic(stem) or client.fs.isfile(source)
             else path
         )
-        storage_dataset_name = Storage.dataset_name(
-            client.uri, posixpath.join(prefix, "")
-        )
-        source_metastore = self.metastore.clone(client.uri)
+        storage_dataset_name = Storage.dataset_name(uri, posixpath.join(prefix, ""))
+        source_metastore = self.metastore.clone(uri)
 
         columns = [
             Column("path", String),
@@ -675,15 +670,13 @@ class Catalog:
         ]
 
         if skip_indexing:
-            source_metastore.create_storage_if_not_registered(client.uri)
-            storage = source_metastore.get_storage(client.uri)
-            source_metastore.init_partial_id(client.uri)
-            partial_id = source_metastore.get_next_partial_id(client.uri)
+            source_metastore.create_storage_if_not_registered(uri)
+            storage = source_metastore.get_storage(uri)
+            source_metastore.init_partial_id(uri)
+            partial_id = source_metastore.get_next_partial_id(uri)
 
-            source_metastore = self.metastore.clone(
-                uri=client.uri, partial_id=partial_id
-            )
-            source_metastore.init(client.uri)
+            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
+            source_metastore.init(uri)
 
             source_warehouse = self.warehouse.clone()
             dataset = self.create_dataset(
@@ -701,20 +694,16 @@ class Catalog:
             in_progress,
             partial_id,
             partial_path,
-        ) = source_metastore.register_storage_for_indexing(
-            client.uri, force_update, prefix
-        )
+        ) = source_metastore.register_storage_for_indexing(uri, force_update, prefix)
         if in_progress:
             raise PendingIndexingError(f"Pending indexing operation: uri={storage.uri}")
 
         if not need_index:
             assert partial_id is not None
             assert partial_path is not None
-            source_metastore = self.metastore.clone(
-                uri=client.uri, partial_id=partial_id
-            )
+            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
             source_warehouse = self.warehouse.clone()
-            dataset = self.get_dataset(Storage.dataset_name(client.uri, partial_path))
+            dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
             lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
             logger.debug(
                 "Using cached listing %s. Valid till: %s",
@@ -731,11 +720,11 @@ class Catalog:
 
             return lst, path
 
-        source_metastore.init_partial_id(client.uri)
-        partial_id = source_metastore.get_next_partial_id(client.uri)
+        source_metastore.init_partial_id(uri)
+        partial_id = source_metastore.get_next_partial_id(uri)
 
-        source_metastore.init(client.uri)
-        source_metastore = self.metastore.clone(uri=client.uri, partial_id=partial_id)
+        source_metastore.init(uri)
+        source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
 
         source_warehouse = self.warehouse.clone()
 
@@ -1370,7 +1359,7 @@ class Catalog:
 
     def signed_url(self, source: str, path: str, client_config=None) -> str:
         client_config = client_config or self.client_config
-        client, _ = self.parse_url(source, **client_config)
+        client = Client.get_client(source, self.cache, **client_config)
        return client.url(path)
 
     def export_dataset_table(
```
src/datachain/client/fsspec.py

```diff
@@ -116,15 +116,16 @@ class Client(ABC):
         return DATA_SOURCE_URI_PATTERN.match(name) is not None
 
     @staticmethod
-    def parse_url(
-        source: str,
-        cache: DataChainCache,
-        **kwargs,
-    ) -> tuple["Client", str]:
+    def parse_url(source: str) -> tuple[StorageURI, str]:
+        cls = Client.get_implementation(source)
+        storage_name, rel_path = cls.split_url(source)
+        return cls.get_uri(storage_name), rel_path
+
+    @staticmethod
+    def get_client(source: str, cache: DataChainCache, **kwargs) -> "Client":
         cls = Client.get_implementation(source)
-        storage_url, rel_path = cls.split_url(source)
-        client = cls.from_name(storage_url, cache, kwargs)
-        return client, rel_path
+        storage_url, _ = cls.split_url(source)
+        return cls.from_name(storage_url, cache, kwargs)
 
     @classmethod
     def create_fs(cls, **kwargs) -> "AbstractFileSystem":
```
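The refactor splits the old all-in-one `parse_url` into a pure URI parser and an explicit client factory; the `Catalog` call sites above follow the same two-step pattern. A minimal sketch of the new API (hedged: assumes datachain 0.3.16, with `s3fs` installed for the `s3://` scheme; `parse_url` itself does no I/O):

```python
from datachain.client import Client

# Pure string parsing: no cache and no filesystem object is created.
uri, rel_path = Client.parse_url("s3://my-bucket/images/cat.jpg")
# uri -> StorageURI("s3://my-bucket"), rel_path -> "images/cat.jpg"

# Client construction is now a separate, explicit step:
# client = Client.get_client("s3://my-bucket/images/cat.jpg", cache)
# where `cache` is the DataChainCache normally supplied by the Catalog.
```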
src/datachain/data_storage/sqlite.py

```diff
@@ -40,7 +40,9 @@ if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
     from sqlalchemy.engine.base import Engine
     from sqlalchemy.schema import SchemaItem
+    from sqlalchemy.sql._typing import _FromClauseArgument, _OnClauseArgument
     from sqlalchemy.sql.elements import ColumnElement
+    from sqlalchemy.sql.selectable import Join
     from sqlalchemy.types import TypeEngine
 
     from datachain.lib.file import File
@@ -788,6 +790,23 @@ class SQLiteWarehouse(AbstractWarehouse):
             if progress_cb:
                 progress_cb(len(batch_ids))
 
+    def join(
+        self,
+        left: "_FromClauseArgument",
+        right: "_FromClauseArgument",
+        onclause: "_OnClauseArgument",
+        inner: bool = True,
+    ) -> "Join":
+        """
+        Join two tables together.
+        """
+        return sqlalchemy.join(
+            left,
+            right,
+            onclause,
+            isouter=not inner,
+        )
+
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
         Create a temporary table from a query for use in a UDF.
```
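The new `join` maps the warehouse-level `inner` flag onto SQLAlchemy's `isouter`. A self-contained sketch of the same mapping (illustrative tables, not datachain internals):

```python
import sqlalchemy as sa

meta = sa.MetaData()
a = sa.Table("a", meta, sa.Column("id", sa.Integer))
b = sa.Table("b", meta, sa.Column("a_id", sa.Integer))

on = a.c.id == b.c.a_id
inner_join = sa.join(a, b, on, isouter=False)  # inner=True
left_join = sa.join(a, b, on, isouter=True)    # inner=False
print(sa.select(a.c.id).select_from(left_join))  # ... FROM a LEFT OUTER JOIN b ...
```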
src/datachain/data_storage/warehouse.py

```diff
@@ -27,8 +27,12 @@ from datachain.storage import StorageURI
 from datachain.utils import sql_escape_like
 
 if TYPE_CHECKING:
-    from sqlalchemy.sql._typing import _ColumnsClauseArgument
-    from sqlalchemy.sql.selectable import Select
+    from sqlalchemy.sql._typing import (
+        _ColumnsClauseArgument,
+        _FromClauseArgument,
+        _OnClauseArgument,
+    )
+    from sqlalchemy.sql.selectable import Join, Select
     from sqlalchemy.types import TypeEngine
 
     from datachain.data_storage import AbstractIDGenerator, schema
@@ -894,6 +898,18 @@ class AbstractWarehouse(ABC, Serializable):
         Copy the results of a query into a table.
         """
 
+    @abstractmethod
+    def join(
+        self,
+        left: "_FromClauseArgument",
+        right: "_FromClauseArgument",
+        onclause: "_OnClauseArgument",
+        inner: bool = True,
+    ) -> "Join":
+        """
+        Join two tables together.
+        """
+
     @abstractmethod
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
@@ -922,7 +938,7 @@ class AbstractWarehouse(ABC, Serializable):
         are cleaned up as soon as they are no longer needed.
         """
         with tqdm(desc="Cleanup", unit=" tables") as pbar:
-            for name in names:
+            for name in set(names):
                 self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
                 pbar.update(1)
 
```
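The `set(names)` change makes temporary-table cleanup idempotent when the same name reaches it twice. A pure-Python illustration of the effect:

```python
names = ["udf_abc", "udf_def", "udf_abc"]  # duplicates are possible
for name in set(names):
    print(f"DROP TABLE IF EXISTS {name}")  # each table is dropped exactly once
```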
src/datachain/dataset.py

```diff
@@ -112,7 +112,7 @@ class DatasetDependency:
 
         if is_listing_dataset(dataset_name):
             dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
-            dependency_name = listing_uri_from_name(dataset_name)
+            dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))
 
         return cls(
             id,
```
src/datachain/lib/arrow.py

```diff
@@ -13,8 +13,10 @@ from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Generator
 
 if TYPE_CHECKING:
+    from datasets.features.features import Features
     from pydantic import BaseModel
 
+    from datachain.lib.data_model import DataType
     from datachain.lib.dc import DataChain
 
 
@@ -46,7 +48,10 @@ class ArrowGenerator(Generator):
         self.kwargs = kwargs
 
     def process(self, file: File):
-        if self.nrows:
+        if file._caching_enabled:
+            path = file.get_local_path(download=True)
+            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+        elif self.nrows:
             path = _nrows_file(file, self.nrows)
             ds = dataset(path, schema=self.input_schema, **self.kwargs)
         else:
@@ -54,6 +59,7 @@ class ArrowGenerator(Generator):
             ds = dataset(
                 path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
             )
+        hf_schema = _get_hf_schema(ds.schema)
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
             for record_batch in ds.to_batches():
@@ -62,9 +68,17 @@ class ArrowGenerator(Generator):
                 if self.output_schema:
                     fields = self.output_schema.model_fields
                     vals_dict = {}
-                    for (field, field_info), val in zip(fields.items(), vals):
-                        if ModelStore.is_pydantic(field_info.annotation):
-                            vals_dict[field] = field_info.annotation(**val)
+                    for i, ((field, field_info), val) in enumerate(
+                        zip(fields.items(), vals)
+                    ):
+                        anno = field_info.annotation
+                        if hf_schema:
+                            from datachain.lib.hf import convert_feature
+
+                            feat = list(hf_schema[0].values())[i]
+                            vals_dict[field] = convert_feature(val, feat, anno)
+                        elif ModelStore.is_pydantic(anno):
+                            vals_dict[field] = anno(**val)  # type: ignore[misc]
                         else:
                             vals_dict[field] = val
                     vals = [self.output_schema(**vals_dict)]
@@ -91,26 +105,36 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
             "Error generating output from Arrow schema - "
             f"Schema has {len(schema)} columns but got {len(col_names)} column names."
         )
-    default_column = 0
+    if not col_names:
+        col_names = schema.names
+    columns = _convert_col_names(col_names)  # type: ignore[arg-type]
+    hf_schema = _get_hf_schema(schema)
+    if hf_schema:
+        return {
+            column: hf_type for hf_type, column in zip(hf_schema[1].values(), columns)
+        }
     output = {}
-    for i, field in enumerate(schema):
-        if col_names:
-            column = col_names[i]
-        else:
-            column = field.name
-        column = column.lower()
-        column = re.sub("[^0-9a-z_]+", "", column)
-        if not column:
-            column = f"c{default_column}"
-            default_column += 1
+    for field, column in zip(schema, columns):
         dtype = arrow_type_mapper(field.type, column)  # type: ignore[assignment]
         if field.nullable and not ModelStore.is_pydantic(dtype):
            dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
-
     return output
 
 
+def _convert_col_names(col_names: Sequence[str]) -> list[str]:
+    default_column = 0
+    converted_col_names = []
+    for column in col_names:
+        column = column.lower()
+        column = re.sub("[^0-9a-z_]+", "", column)
+        if not column:
+            column = f"c{default_column}"
+            default_column += 1
+        converted_col_names.append(column)
+    return converted_col_names
+
+
 def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
@@ -156,3 +180,14 @@ def _nrows_file(file: File, nrows: int) -> str:
         writer.write(line)
         writer.write("\n")
     return tf.name
+
+
+def _get_hf_schema(
+    schema: "pa.Schema",
+) -> Optional[tuple["Features", dict[str, "DataType"]]]:
+    if schema.metadata and b"huggingface" in schema.metadata:
+        from datachain.lib.hf import get_output_schema, schema_from_arrow
+
+        features = schema_from_arrow(schema)
+        return features, get_output_schema(features)
+    return None
```
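`_get_hf_schema` keys off the `b"huggingface"` entry that the `datasets` library writes into Arrow/Parquet schema metadata. A small sketch of just the detection step (assumes only `pyarrow`; the metadata payload is a hand-written stand-in):

```python
import pyarrow as pa

plain = pa.schema([pa.field("text", pa.string())])
hf_like = plain.with_metadata({b"huggingface": b"{}"})  # stand-in payload

for schema in (plain, hf_like):
    is_hf = bool(schema.metadata and b"huggingface" in schema.metadata)
    print(is_hf)  # False, then True
```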
src/datachain/lib/dc.py

```diff
@@ -408,7 +408,11 @@ class DataChain(DatasetQuery):
                 in_memory=in_memory,
             )
             .gen(
-                list_bucket(list_uri, client_config=session.catalog.client_config),
+                list_bucket(
+                    list_uri,
+                    session.catalog.cache,
+                    client_config=session.catalog.client_config,
+                ),
                 output={f"{object_name}": File},
             )
             .save(list_dataset_name, listing=True)
@@ -1523,7 +1527,8 @@ class DataChain(DatasetQuery):
         output = {"split": str}
 
         model_name = model_name or object_name or ""
-        output = output | get_output_schema(next(iter(ds_dict.values())), model_name)
+        hf_features = next(iter(ds_dict.values())).features
+        output = output | get_output_schema(hf_features, model_name)
         model = dict_to_data_model(model_name, output)
         if object_name:
             output = {object_name: model}
```
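`from_hf` now derives the signal schema from the first split's `features` rather than from the dataset object itself. A sketch of that expression (assumes the `datasets` package is installed):

```python
from datasets import Dataset

ds_dict = {"train": Dataset.from_dict({"text": ["a", "b"], "label": [0, 1]})}
hf_features = next(iter(ds_dict.values())).features  # same expression as the diff
print(hf_features)  # {'text': Value(dtype='string'), 'label': Value(dtype='int64')}
```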
src/datachain/lib/file.py

```diff
@@ -1,5 +1,6 @@
 import io
 import json
+import logging
 import os
 import posixpath
 from abc import ABC, abstractmethod
@@ -15,6 +16,9 @@ from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from PIL import Image
 from pydantic import Field, field_validator
 
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
 from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
@@ -25,6 +29,8 @@ from datachain.utils import TIME_ZERO
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
 
+logger = logging.getLogger("datachain")
+
 # how to create file path when exporting
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
 
@@ -251,14 +257,18 @@ class File(DataModel):
         dump = self.model_dump()
         return UniqueId(*(dump[k] for k in self._unique_id_keys))
 
-    def get_local_path(self) -> Optional[str]:
+    def get_local_path(self, download: bool = False) -> Optional[str]:
         """Returns path to a file in a local cache.
         Return None if file is not cached. Throws an exception if cache is not setup."""
         if self._catalog is None:
             raise RuntimeError(
                 "cannot resolve local file path because catalog is not setup"
             )
-        return self._catalog.cache.get_path(self.get_uid())
+        uid = self.get_uid()
+        if download:
+            client = self._catalog.get_client(self.source)
+            client.download(uid, callback=self._download_cb)
+        return self._catalog.cache.get_path(uid)
 
     def get_file_suffix(self):
         """Returns last part of file name with `.`."""
@@ -313,6 +323,70 @@ class File(DataModel):
         """Returns `fsspec` filesystem for the file."""
         return self._catalog.get_client(self.source).fs
 
+    def resolve(self) -> "Self":
+        """
+        Resolve a File object by checking its existence and updating its metadata.
+
+        Returns:
+            File: The resolved File object with updated metadata.
+        """
+        if self._catalog is None:
+            raise RuntimeError("Cannot resolve file: catalog is not set")
+
+        try:
+            client = self._catalog.get_client(self.source)
+        except NotImplementedError as e:
+            raise RuntimeError(
+                f"Unsupported protocol for file source: {self.source}"
+            ) from e
+
+        try:
+            info = client.fs.info(client.get_full_path(self.path))
+            converted_info = client.info_to_file(info, self.source)
+            return type(self)(
+                path=self.path,
+                source=self.source,
+                size=converted_info.size,
+                etag=converted_info.etag,
+                version=converted_info.version,
+                is_latest=converted_info.is_latest,
+                last_modified=converted_info.last_modified,
+                location=self.location,
+            )
+        except (FileNotFoundError, PermissionError, OSError) as e:
+            logger.warning("File system error when resolving %s: %s", self.path, str(e))
+
+        return type(self)(
+            path=self.path,
+            source=self.source,
+            size=0,
+            etag="",
+            version="",
+            is_latest=True,
+            last_modified=TIME_ZERO,
+            location=self.location,
+        )
+
+
+def resolve(file: File) -> File:
+    """
+    Resolve a File object by checking its existence and updating its metadata.
+
+    This function is a wrapper around the File.resolve() method, designed to be
+    used as a mapper in DataChain operations.
+
+    Args:
+        file (File): The File object to resolve.
+
+    Returns:
+        File: The resolved File object with updated metadata.
+
+    Raises:
+        RuntimeError: If the file's catalog is not set or if
+            the file source protocol is unsupported.
+    """
+    return file.resolve()
+
 
 class TextFile(File):
     """`DataModel` for reading text files."""
```
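The module-level `resolve` wrapper exists so it can be dropped straight into a chain as a mapper. A hedged usage sketch (assumes datachain 0.3.16 and an accessible bucket):

```python
from datachain.lib.dc import DataChain
from datachain.lib.file import resolve

chain = (
    DataChain.from_storage("s3://my-bucket/images/")
    .map(file=resolve)  # re-checks existence; refreshes size/etag/version
)
```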
src/datachain/lib/hf.py

```diff
@@ -15,7 +15,7 @@ try:
         Value,
         load_dataset,
     )
-    from datasets.features.features import string_to_arrow
+    from datasets.features.features import Features, string_to_arrow
     from datasets.features.image import image_to_bytes
 
 except ImportError as exc:
@@ -36,6 +36,7 @@ from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
 from datachain.lib.udf import Generator
 
 if TYPE_CHECKING:
+    import pyarrow as pa
     from pydantic import BaseModel
 
 
@@ -71,6 +72,15 @@ class HFGenerator(Generator):
         *args,
         **kwargs,
     ):
+        """
+        Generator for chain from huggingface datasets.
+
+        Parameters:
+
+        ds : Path or name of the dataset to read from Hugging Face Hub,
+            or an instance of `datasets.Dataset`-like object.
+        output_schema : Pydantic model for validation.
+        """
         super().__init__()
         self.ds = ds
         self.output_schema = output_schema
@@ -92,7 +102,7 @@ class HFGenerator(Generator):
                     output_dict["split"] = split
                 for name, feat in ds.features.items():
                     anno = self.output_schema.model_fields[name].annotation
-                    output_dict[name] = _convert_feature(row[name], feat, anno)
+                    output_dict[name] = convert_feature(row[name], feat, anno)
                 yield self.output_schema(**output_dict)
                 pbar.update(1)
 
@@ -106,7 +116,7 @@ def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
     return {"": ds}
 
 
-def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
+def convert_feature(val: Any, feat: Any, anno: Any) -> Any:  # noqa: PLR0911
     if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D)):
         return val
     if isinstance(feat, ClassLabel):
@@ -117,20 +127,23 @@ def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
             for sname in val:
                 sfeat = feat.feature[sname]
                 sanno = anno.model_fields[sname].annotation
-                sdict[sname] = [_convert_feature(v, sfeat, sanno) for v in val[sname]]
+                sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
             return anno(**sdict)
         return val
     if isinstance(feat, Image):
+        if isinstance(val, dict):
+            return HFImage(img=val["bytes"])
         return HFImage(img=image_to_bytes(val))
     if isinstance(feat, Audio):
         return HFAudio(**val)
 
 
 def get_output_schema(
-    ds: Union[Dataset, IterableDataset], model_name: str = ""
+    features: Features, model_name: str = "", stream: bool = True
 ) -> dict[str, DataType]:
+    """Generate UDF output schema from huggingface datasets features."""
     fields_dict = {}
-    for name, val in ds.features.items():
+    for name, val in features.items():
         fields_dict[name] = _feature_to_chain_type(name, val)  # type: ignore[assignment]
     return fields_dict  # type: ignore[return-value]
 
@@ -165,3 +178,7 @@ def _feature_to_chain_type(name: str, val: Any) -> type:  # noqa: PLR0911
     if isinstance(val, Audio):
         return HFAudio
     raise TypeError(f"Unknown huggingface datasets type {type(val)}")
+
+
+def schema_from_arrow(schema: "pa.Schema"):
+    return Features.from_arrow_schema(schema)
```
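Together, `schema_from_arrow` and the widened `get_output_schema` turn an Arrow schema back into datachain signal types. A sketch assuming `datasets` and `pyarrow` are installed:

```python
import pyarrow as pa
from datachain.lib.hf import get_output_schema, schema_from_arrow

arrow_schema = pa.schema(
    [pa.field("text", pa.string()), pa.field("label", pa.int64())]
)
features = schema_from_arrow(arrow_schema)  # datasets.Features
print(get_output_schema(features))          # e.g. {'text': str, 'label': int}
```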
src/datachain/lib/listing.py

```diff
@@ -20,7 +20,7 @@ LISTING_TTL = 4 * 60 * 60  # cached listing lasts 4 hours
 LISTING_PREFIX = "lst__"  # listing datasets start with this name
 
 
-def list_bucket(uri: str, client_config=None) -> Callable:
+def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
     Function that returns another generator function that yields File objects
     from bucket where each File represents one bucket entry.
@@ -28,7 +28,8 @@ def list_bucket(uri: str, client_config=None) -> Callable:
 
     def list_func() -> Iterator[File]:
         config = client_config or {}
-        client, path = Client.parse_url(uri, None, **config)  # type: ignore[arg-type]
+        client = Client.get_client(uri, cache, **config)  # type: ignore[arg-type]
+        _, path = Client.parse_url(uri)
         for entries in iter_over_async(client.scandir(path.rstrip("/")), get_loop()):
             yield from entries
 
@@ -76,16 +77,17 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
-    client, path = Client.parse_url(uri, cache, **client_config)
+    client = Client.get_client(uri, cache, **client_config)
+    storage_uri, path = Client.parse_url(uri)
 
     # clean path without globs
     lst_uri_path = (
         posixpath.dirname(path) if uses_glob(path) or client.fs.isfile(uri) else path
     )
 
-    lst_uri = f"{client.uri}/{lst_uri_path.lstrip('/')}"
+    lst_uri = f"{storage_uri}/{lst_uri_path.lstrip('/')}"
     ds_name = (
-        f"{LISTING_PREFIX}{client.uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
+        f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )
 
     return ds_name, lst_uri, path
```
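`list_bucket` now takes the catalog cache explicitly so the generator can construct its own client. A hedged sketch of the new call shape, mirroring the `dc.py` call site above (assumes a datachain `Session` supplies the cache and client config):

```python
from datachain.lib.listing import list_bucket
from datachain.query.session import Session

session = Session.get()
list_func = list_bucket(
    "s3://my-bucket/images/",
    session.catalog.cache,  # new required positional argument
    client_config=session.catalog.client_config,
)
# files = list(list_func())  # would enumerate bucket entries as File objects
```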
src/datachain/lib/listing_info.py

```diff
@@ -13,8 +13,8 @@ class ListingInfo(DatasetInfo):
 
     @property
     def storage_uri(self) -> str:
-        client, _ = Client.parse_url(self.uri, None)  # type: ignore[arg-type]
-        return client.uri
+        uri, _ = Client.parse_url(self.uri)
+        return uri
 
     @property
     def expires(self) -> Optional[datetime]:
```
src/datachain/lib/signal_schema.py

```diff
@@ -386,11 +386,20 @@ class SignalSchema:
             else:
                 json, pos = unflatten_to_json_pos(fr, row, pos)  # type: ignore[union-attr]
                 obj = fr(**json)
-                if isinstance(obj, File):
-                    obj._set_stream(catalog, caching_enabled=cache)
+                SignalSchema._set_file_stream(obj, catalog, cache)
             res.append(obj)
         return res
 
+    @staticmethod
+    def _set_file_stream(
+        obj: BaseModel, catalog: "Catalog", cache: bool = False
+    ) -> None:
+        if isinstance(obj, File):
+            obj._set_stream(catalog, caching_enabled=cache)
+        for field, finfo in obj.model_fields.items():
+            if ModelStore.is_pydantic(finfo.annotation):
+                SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
+
     def db_signals(
         self, name: Optional[str] = None, as_columns=False
     ) -> Union[list[str], list[Column]]:
```
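The new `_set_file_stream` recurses into nested pydantic fields so that `File` objects embedded inside other models also receive a catalog stream. A pure-pydantic analogue of the traversal (stand-in types, no datachain required):

```python
from pydantic import BaseModel

class Inner(BaseModel):
    x: int = 0

class Outer(BaseModel):
    inner: Inner = Inner()

def visit(obj: BaseModel) -> None:
    print(type(obj).__name__)  # stand-in for obj._set_stream(...)
    for name, finfo in type(obj).model_fields.items():
        anno = finfo.annotation
        if isinstance(anno, type) and issubclass(anno, BaseModel):
            visit(getattr(obj, name))

visit(Outer())  # prints: Outer, Inner
```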
src/datachain/lib/tar.py (new file)

```diff
@@ -0,0 +1,33 @@
+import hashlib
+import tarfile
+from collections.abc import Iterator
+
+from datachain.lib.file import File, TarVFile
+
+
+def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
+    new_parent = parent.get_full_name()
+    etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
+    etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
+    return File(
+        source=parent.source,
+        path=f"{new_parent}/{info.name}",
+        version=parent.version,
+        size=info.size,
+        etag=etag,
+        location=[
+            {
+                "vtype": TarVFile.get_vtype(),
+                "parent": parent.model_dump_custom(),
+                "size": info.size,
+                "offset": info.offset_data,
+            }
+        ],
+    )
+
+
+def process_tar(file: File) -> Iterator[File]:
+    with file.open() as fd:
+        with tarfile.open(fileobj=fd) as tar:
+            for entry in tar.getmembers():
+                yield build_tar_member(file, entry)
```