datachain 0.3.8__tar.gz → 0.3.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- {datachain-0.3.8 → datachain-0.3.10}/.github/workflows/tests.yml +1 -1
- {datachain-0.3.8 → datachain-0.3.10}/.pre-commit-config.yaml +1 -1
- {datachain-0.3.8/src/datachain.egg-info → datachain-0.3.10}/PKG-INFO +14 -14
- {datachain-0.3.8 → datachain-0.3.10}/README.rst +11 -12
- {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/udfs/parallel.py +1 -1
- {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/udfs/simple.py +1 -1
- {datachain-0.3.8 → datachain-0.3.10}/examples/llm_and_nlp/unstructured-text.py +1 -1
- {datachain-0.3.8 → datachain-0.3.10}/examples/multimodal/wds_filtered.py +1 -3
- {datachain-0.3.8 → datachain-0.3.10}/pyproject.toml +3 -2
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/catalog/catalog.py +13 -91
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/cli.py +6 -38
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/fsspec.py +3 -0
- datachain-0.3.10/src/datachain/client/hf.py +47 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/metastore.py +2 -29
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/sqlite.py +3 -12
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/warehouse.py +20 -29
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/dataset.py +44 -32
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/arrow.py +22 -6
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/dataset_info.py +4 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/dc.py +149 -35
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/file.py +10 -33
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/hf.py +2 -1
- datachain-0.3.10/src/datachain/lib/listing.py +119 -0
- datachain-0.3.10/src/datachain/lib/listing_info.py +32 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/meta_formats.py +4 -4
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/signal_schema.py +5 -2
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/webdataset.py +1 -1
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/node.py +13 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/dataset.py +25 -87
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/metrics.py +8 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/utils.py +5 -0
- {datachain-0.3.8 → datachain-0.3.10/src/datachain.egg-info}/PKG-INFO +14 -14
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain.egg-info/SOURCES.txt +3 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain.egg-info/requires.txt +2 -1
- {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/test_datachain.py +4 -6
- {datachain-0.3.8 → datachain-0.3.10}/tests/conftest.py +4 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_catalog.py +49 -24
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_datachain.py +147 -11
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_dataset_query.py +20 -4
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_datasets.py +18 -13
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_feature_pickling.py +21 -16
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_ls.py +7 -4
- datachain-0.3.10/tests/func/test_metrics.py +14 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_query.py +15 -23
- {datachain-0.3.8 → datachain-0.3.10}/tests/scripts/feature_class.py +2 -2
- {datachain-0.3.8 → datachain-0.3.10}/tests/scripts/feature_class_parallel.py +1 -1
- {datachain-0.3.8 → datachain-0.3.10}/tests/scripts/feature_class_parallel_data_model.py +1 -1
- {datachain-0.3.8 → datachain-0.3.10}/tests/test_query_e2e.py +5 -4
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_arrow.py +38 -1
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_datachain.py +95 -4
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_signal_schema.py +20 -3
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_dataset.py +28 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_listing.py +86 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_storage.py +0 -34
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_utils.py +17 -0
- datachain-0.3.8/src/datachain/lib/listing.py +0 -111
- {datachain-0.3.8 → datachain-0.3.10}/.cruft.json +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/.gitattributes +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/.github/codecov.yaml +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/.github/dependabot.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/.github/workflows/release.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/.gitignore +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/LICENSE +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/docs/assets/datachain.png +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/docs/index.md +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/docs/references/datachain.md +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/docs/references/datatype.md +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/docs/references/file.md +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/docs/references/index.md +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/docs/references/sql.md +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/docs/references/torch.md +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/docs/references/udf.md +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/mkdocs.yml +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/noxfile.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/setup.cfg +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/__main__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/asyn.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/cache.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/local.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/config.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/error.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/job.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/listing.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/progress.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/py.typed +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/builtins.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/params.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/schema.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/session.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/storage.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/data.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/examples/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_client.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_listing.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_pull.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_client.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_session.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_udf.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.8 → datachain-0.3.10}/tests/utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.8
+Version: 0.3.10
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -43,6 +43,7 @@ Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<11,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
 Requires-Dist: psutil
+Requires-Dist: huggingface_hub
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -61,7 +62,7 @@ Provides-Extra: vector
 Requires-Dist: usearch; extra == "vector"
 Provides-Extra: hf
 Requires-Dist: numba>=0.60.0; extra == "hf"
-Requires-Dist: datasets[audio,vision]; extra == "hf"
+Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
 Provides-Extra: tests
 Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
 Requires-Dist: pytest<9,>=8; extra == "tests"
@@ -115,31 +116,30 @@ AI 🔗 DataChain
 
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
 
 Key Features
 ============
 
 📂 **Storage as a Source of Truth.**
-   - Process unstructured data without redundant copies
+   - Process unstructured data without redundant copies from S3, GCP, Azure, and local
     file systems.
-   - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-
+   - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+   - Unite files and metadata together into persistent, versioned, columnar datasets.
 
 🐍 **Python-friendly data pipelines.**
    - Operate on Python objects and object fields.
-   - Built-in parallelization and out-of-memory compute without
-     Spark jobs.
+   - Built-in parallelization and out-of-memory compute without SQL or Spark.
 
 🧠 **Data Enrichment and Processing.**
-   - Generate metadata
-   - Filter, join, and group by
-   - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+   - Generate metadata using local AI models and LLM APIs.
+   - Filter, join, and group by metadata. Search by vector embeddings.
+   - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
 
 🚀 **Efficiency.**
    - Parallelization, out-of-memory workloads and data caching.
    - Vectorized operations on Python object fields: sum, count, avg, etc.
-
+   - Optimized vector search.
 
 
 Quick Start
@@ -164,7 +164,7 @@ where each image has a matching JSON file like `cat.1009.json`:
    "inference": {"class": "dog", "confidence": 0.68}
 }
 
-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:
 
 
 .. code:: py
@@ -234,7 +234,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================
 
-LLMs can work as …
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai
 
README.rst
@@ -18,31 +18,30 @@ AI 🔗 DataChain
 
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
 
 Key Features
 ============
 
 📂 **Storage as a Source of Truth.**
-   - Process unstructured data without redundant copies
+   - Process unstructured data without redundant copies from S3, GCP, Azure, and local
     file systems.
-   - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-
+   - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+   - Unite files and metadata together into persistent, versioned, columnar datasets.
 
 🐍 **Python-friendly data pipelines.**
    - Operate on Python objects and object fields.
-   - Built-in parallelization and out-of-memory compute without
-     Spark jobs.
+   - Built-in parallelization and out-of-memory compute without SQL or Spark.
 
 🧠 **Data Enrichment and Processing.**
-   - Generate metadata
-   - Filter, join, and group by
-   - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+   - Generate metadata using local AI models and LLM APIs.
+   - Filter, join, and group by metadata. Search by vector embeddings.
+   - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
 
 🚀 **Efficiency.**
    - Parallelization, out-of-memory workloads and data caching.
    - Vectorized operations on Python object fields: sum, count, avg, etc.
-
+   - Optimized vector search.
 
 
 Quick Start
@@ -67,7 +66,7 @@ where each image has a matching JSON file like `cat.1009.json`:
    "inference": {"class": "dog", "confidence": 0.68}
 }
 
-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:
 
 
 .. code:: py
@@ -137,7 +136,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================
 
-LLMs can work as …
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai
 
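
The reworked feature list above maps directly onto the library's core calls. A minimal sketch of the kind of pipeline it describes, using the public API shown elsewhere in this diff (the bucket is the demo bucket from the README's Quick Start; the glob pattern and dataset name are placeholders):

    from datachain import C, DataChain

    # "Storage as a source of truth": list a bucket in place (no copies), filter
    # on file metadata, and persist the result as a versioned, columnar dataset.
    images = (
        DataChain.from_storage("gs://datachain-demo/dogs-and-cats/")
        .filter(C("file.path").glob("*.jpg"))
        .save("images")
    )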
examples/multimodal/wds_filtered.py
@@ -1,13 +1,11 @@
 import datachain.error
 from datachain import C, DataChain
-from datachain.lib.model_store import ModelStore
 from datachain.lib.webdataset import process_webdataset
-from datachain.lib.webdataset_laion import …
+from datachain.lib.webdataset_laion import WDSLaion
 from datachain.sql import literal
 from datachain.sql.functions import array, greatest, least, string
 
 name = "wds"
-ModelStore.register(LaionMeta)
 try:
     wds = DataChain.from_dataset(name=name)
 except datachain.error.DatasetNotFoundError:
pyproject.toml
@@ -45,7 +45,8 @@ dependencies = [
     "datamodel-code-generator>=0.25",
     "Pillow>=10.0.0,<11",
     "msgpack>=1.0.4,<2",
-    "psutil"
+    "psutil",
+    "huggingface_hub"
 ]
 
 [project.optional-dependencies]
@@ -71,7 +72,7 @@ vector = [
 ]
 hf = [
     "numba>=0.60.0",
-    "datasets[audio,vision]"
+    "datasets[audio,vision]>=2.21.0"
 ]
 tests = [
     "datachain[torch,remote,vector,hf]",
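
`huggingface_hub` becomes a core dependency (it backs the new `hf://` client added later in this diff), while the `hf` extra now pins `datasets>=2.21.0`. A quick, hedged sanity check that the new dependency is importable after `pip install datachain` (the repo path is a placeholder; anonymous access works for public repos):

    from huggingface_hub import HfFileSystem

    fs = HfFileSystem()  # anonymous by default; pass token= for private repos
    # List a few entries from a public dataset repo (placeholder path).
    print(fs.ls("datasets/nyu-mll/glue", detail=False)[:3])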
src/datachain/catalog/catalog.py
@@ -156,8 +156,6 @@ class QueryResult(NamedTuple):
     dataset: Optional[DatasetRecord]
     version: Optional[int]
     output: str
-    preview: Optional[list[dict]]
-    metrics: dict[str, Any]
 
 
 class DatasetRowsFetcher(NodesThreadPool):
@@ -1020,20 +1018,6 @@ class Catalog:
 
         return node_groups
 
-    def unlist_source(self, uri: StorageURI) -> None:
-        self.metastore.clone(uri=uri).mark_storage_not_indexed(uri)
-
-    def storage_stats(self, uri: StorageURI) -> Optional[DatasetStats]:
-        """
-        Returns tuple with storage stats: total number of rows and total dataset size.
-        """
-        partial_path = self.metastore.get_last_partial_path(uri)
-        if partial_path is None:
-            return None
-        dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
-
-        return self.dataset_stats(dataset.name, dataset.latest_version)
-
     def create_dataset(
         self,
         name: str,
@@ -1297,19 +1281,6 @@ class Catalog:
 
         return self.get_dataset(name)
 
-    def register_new_dataset(
-        self,
-        source_dataset: DatasetRecord,
-        source_version: int,
-        target_name: str,
-    ) -> DatasetRecord:
-        target_dataset = self.metastore.create_dataset(
-            target_name,
-            query_script=source_dataset.query_script,
-            schema=source_dataset.serialized_schema,
-        )
-        return self.register_dataset(source_dataset, source_version, target_dataset, 1)
-
     def register_dataset(
         self,
         dataset: DatasetRecord,
@@ -1422,17 +1393,18 @@ class Catalog:
 
         return direct_dependencies
 
-    def ls_datasets(self) -> Iterator[DatasetRecord]:
+    def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetRecord]:
         datasets = self.metastore.list_datasets()
         for d in datasets:
-            if not d.is_bucket_listing:
+            if not d.is_bucket_listing or include_listing:
                 yield d
 
     def list_datasets_versions(
         self,
+        include_listing: bool = False,
     ) -> Iterator[tuple[DatasetRecord, "DatasetVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
-        datasets = list(self.ls_datasets())
+        datasets = list(self.ls_datasets(include_listing=include_listing))
 
         # preselect dataset versions jobs from db to avoid multiple queries
         jobs_ids: set[str] = {
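
A short sketch of what the new `include_listing` flag changes for callers (hypothetical usage; the `catalog` setup is assumed, and "bucket listing" datasets are the internal datasets that cache storage listings):

    # Default behaviour is unchanged: internal bucket-listing datasets stay hidden.
    user_datasets = list(catalog.ls_datasets())

    # Opting in surfaces the listing datasets too, e.g. for the `datachain ls` path.
    all_datasets = list(catalog.ls_datasets(include_listing=True))
    listings = [d.name for d in all_datasets if d.is_bucket_listing]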
@@ -1560,17 +1532,8 @@ class Catalog:
         version = self.get_dataset(dataset_name).get_version(dataset_version)
 
         file_signals_values = {}
-        file_schemas = {}
-        # TODO: To remove after we properly fix deserialization
-        for signal, type_name in version.feature_schema.items():
-            from datachain.lib.model_store import ModelStore
-
-            type_name_parsed, v = ModelStore.parse_name_version(type_name)
-            fr = ModelStore.get(type_name_parsed, v)
-            if fr and issubclass(fr, File):
-                file_schemas[signal] = type_name
 
-        schema = SignalSchema.deserialize(file_schemas)
+        schema = SignalSchema.deserialize(version.feature_schema)
         for file_signals in schema.get_signals(File):
             prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
             file_signals_values[file_signals] = {
@@ -1641,15 +1604,6 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)
 
-    def ls_storage_uris(self) -> Iterator[str]:
-        yield from self.metastore.get_all_storage_uris()
-
-    def get_storage(self, uri: StorageURI) -> Storage:
-        return self.metastore.get_storage(uri)
-
-    def ls_storages(self) -> list[Storage]:
-        return self.metastore.list_storages()
-
     def pull_dataset(
         self,
         dataset_uri: str,
@@ -1883,10 +1837,6 @@ class Catalog:
         envs: Optional[Mapping[str, str]] = None,
         python_executable: Optional[str] = None,
         save: bool = False,
-        save_as: Optional[str] = None,
-        preview_limit: int = 10,
-        preview_offset: int = 0,
-        preview_columns: Optional[list[str]] = None,
         capture_output: bool = True,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
@@ -1914,9 +1864,8 @@ class Catalog:
             C.size > 1000
         )
         """
-        from datachain.query.dataset import ExecutionResult
 
-        feature_file = tempfile.NamedTemporaryFile(
+        feature_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
             dir=os.getcwd(), suffix=".py", delete=False
         )
         _, feature_module = os.path.split(feature_file.name)
@@ -1931,11 +1880,7 @@ class Catalog:
             feature_module,
             output_hook,
             params,
-            preview_columns,
-            preview_limit,
-            preview_offset,
             save,
-            save_as,
             job_id,
         )
     finally:
@@ -1964,25 +1909,18 @@ class Catalog:
         )
 
         try:
-            response = json.loads(response_text)
+            result = json.loads(response_text)
         except ValueError:
-            response = {}
-        exec_result = ExecutionResult(**response)
+            result = None
 
         dataset: Optional[DatasetRecord] = None
         version: Optional[int] = None
-        if save or save_as:
+        if save:
             dataset, version = self.save_result(
-                query_script, exec_result, output, version, job_id
+                query_script, result, output, version, job_id
             )
 
-        return QueryResult(
-            dataset=dataset,
-            version=version,
-            output=output,
-            preview=exec_result.preview,
-            metrics=exec_result.metrics,
-        )
+        return QueryResult(dataset=dataset, version=version, output=output)
 
     def run_query(
         self,
@@ -1994,11 +1932,7 @@ class Catalog:
         feature_module: str,
         output_hook: Callable[[str], None],
         params: Optional[dict[str, str]],
-        preview_columns: Optional[list[str]],
-        preview_limit: int,
-        preview_offset: int,
         save: bool,
-        save_as: Optional[str],
         job_id: Optional[str],
     ) -> tuple[list[str], subprocess.Popen, str]:
         try:
@@ -2013,10 +1947,6 @@ class Catalog:
             raise QueryScriptCompileError(
                 f"Query script failed to compile, reason: {exc}"
             ) from exc
-        if save_as and save_as.startswith(QUERY_DATASET_PREFIX):
-            raise ValueError(
-                f"Cannot use {QUERY_DATASET_PREFIX} prefix for dataset name"
-            )
         r, w = os.pipe()
         if os.name == "nt":
             import msvcrt
@@ -2039,15 +1969,7 @@ class Catalog:
             {
                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                 "PYTHONPATH": os.getcwd(),  # For local imports
-                "DATACHAIN_QUERY_PREVIEW_ARGS": json.dumps(
-                    {
-                        "limit": preview_limit,
-                        "offset": preview_offset,
-                        "columns": preview_columns,
-                    }
-                ),
                 "DATACHAIN_QUERY_SAVE": "1" if save else "",
-                "DATACHAIN_QUERY_SAVE_AS": save_as or "",
                 "PYTHONUNBUFFERED": "1",
                 "DATACHAIN_OUTPUT_FD": str(handle),
                 "DATACHAIN_JOB_ID": job_id or "",
@@ -2077,12 +1999,12 @@ class Catalog:
         return lines, proc, response_text
 
     def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result.dataset:
+        if not exec_result:
             raise QueryScriptDatasetNotFound(
                 "No dataset found after running Query script",
                 output=output,
             )
-        name, version = exec_result.dataset
+        name, version = exec_result
         # finding returning dataset
         try:
             dataset = self.get_dataset(name)
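
Net effect of the catalog changes: `Catalog.query` drops the preview/metrics plumbing and the `save_as` name; the query subprocess just reports the saved dataset back over the output fd, and `QueryResult` shrinks to `(dataset, version, output)`. A hedged sketch of the caller-facing surface after this diff (argument names taken from the hunks above):

    # Run a query script and, if `save` is set, resolve the resulting dataset.
    result = catalog.query(script_content, save=True, capture_output=True)
    if result.dataset is not None:
        print(result.dataset.name, result.version)
    print(result.output)  # captured output of the query subprocess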
src/datachain/cli.py
@@ -14,6 +14,7 @@ import shtab
 
 from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
+from datachain.lib.dc import DataChain
 from datachain.utils import DataChainDir
 
 if TYPE_CHECKING:
@@ -472,9 +473,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )
-    query_parser.add_argument(
-        "dataset_name", nargs="?", type=str, help="Save result dataset as"
-    )
     query_parser.add_argument(
         "--parallel",
         nargs="?",
@@ -487,7 +485,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
             "N defaults to the CPU count."
         ),
     )
-    add_show_args(query_parser)
     query_parser.add_argument(
         "-p",
         "--param",
@@ -619,18 +616,6 @@ def _ls_urls_flat(
         raise FileNotFoundError(f"No such file or directory: {source}")
 
 
-def ls_indexed_storages(catalog: "Catalog", long: bool = False) -> Iterator[str]:
-    from datachain.node import long_line_str
-
-    storage_uris = catalog.ls_storage_uris()
-    if long:
-        for uri in storage_uris:
-            # TODO: add Storage.created so it can be used here
-            yield long_line_str(uri, None, "")
-    else:
-        yield from storage_uris
-
-
 def ls_local(
     sources,
     long: bool = False,
@@ -661,8 +646,9 @@ def ls_local(
         for entry in entries:
             print(format_ls_entry(entry))
     else:
-        for uri in ls_indexed_storages(catalog, long=long):
-            print(format_ls_entry(uri))
+        chain = DataChain.listings()
+        for ls in chain.collect("listing"):
+            print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]
 
 
 def format_ls_entry(entry: str) -> str:
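
`datachain ls` with no sources now goes through the new listings API instead of the removed storage tables. The same enumeration from Python, mirroring the CLI code above (assuming 0.3.10):

    from datachain.lib.dc import DataChain

    # Each collected item describes one cached storage listing: the listed URI
    # plus the version of the internal listing dataset behind it.
    for ls in DataChain.listings().collect("listing"):
        print(f"{ls.uri}@v{ls.version}")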
@@ -813,16 +799,10 @@ def show(
 def query(
     catalog: "Catalog",
     script: str,
-    dataset_name: Optional[str] = None,
     parallel: Optional[int] = None,
-    limit: int = 10,
-    offset: int = 0,
-    columns: Optional[list[str]] = None,
-    no_collapse: bool = False,
     params: Optional[dict[str, str]] = None,
 ) -> None:
     from datachain.data_storage import JobQueryType, JobStatus
-    from datachain.utils import show_records
 
     with open(script, encoding="utf-8") as f:
         script_content = f.read()
@@ -843,13 +823,9 @@ def query(
     )
 
     try:
-        result = catalog.query(
+        catalog.query(
             script_content,
             python_executable=python_executable,
-            save_as=dataset_name,
-            preview_limit=limit,
-            preview_offset=offset,
-            preview_columns=columns,
             capture_output=False,
             params=params,
             job_id=job_id,
@@ -864,10 +840,7 @@ def query(
             error_stack=error_stack,
         )
         raise
-
-    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE, metrics=result.metrics)
-
-    show_records(result.preview, collapse_columns=not no_collapse)
+    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)
 
 
 def clear_cache(catalog: "Catalog"):
@@ -1042,12 +1015,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
         query(
             catalog,
             args.script,
-            dataset_name=args.dataset_name,
             parallel=args.parallel,
-            limit=args.limit,
-            offset=args.offset,
-            columns=args.columns,
-            no_collapse=args.no_collapse,
             params=args.param,
         )
     elif args.command == "apply-udf":
src/datachain/client/fsspec.py
@@ -87,6 +87,7 @@ class Client(ABC):
     def get_implementation(url: str) -> type["Client"]:
         from .azure import AzureClient
         from .gcs import GCSClient
+        from .hf import HfClient
         from .local import FileClient
         from .s3 import ClientS3
 
@@ -104,6 +105,8 @@ class Client(ABC):
             return AzureClient
         if protocol == FileClient.protocol:
             return FileClient
+        if protocol == HfClient.protocol:
+            return HfClient
 
         raise NotImplementedError(f"Unsupported protocol: {protocol}")
 
src/datachain/client/hf.py (new file)
@@ -0,0 +1,47 @@
+import os
+import posixpath
+from typing import Any, cast
+
+from huggingface_hub import HfFileSystem
+
+from datachain.lib.file import File
+from datachain.node import Entry
+
+from .fsspec import Client
+
+
+class HfClient(Client):
+    FS_CLASS = HfFileSystem
+    PREFIX = "hf://"
+    protocol = "hf"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HfFileSystem:
+        if os.environ.get("HF_TOKEN"):
+            kwargs["token"] = os.environ["HF_TOKEN"]
+
+        return cast(HfFileSystem, super().create_fs(**kwargs))
+
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
+        return Entry.from_file(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    async def ls_dir(self, path):
+        return self.fs.ls(path, detail=True)
+
+    def rel_path(self, path):
+        return posixpath.relpath(path, self.name)