datachain 0.3.18__tar.gz → 0.3.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.3.18 → datachain-0.3.20}/.github/workflows/benchmarks.yml +11 -8
- {datachain-0.3.18 → datachain-0.3.20}/.github/workflows/release.yml +6 -7
- {datachain-0.3.18 → datachain-0.3.20}/.github/workflows/tests-studio.yml +8 -5
- {datachain-0.3.18 → datachain-0.3.20}/.github/workflows/tests.yml +27 -18
- {datachain-0.3.18/src/datachain.egg-info → datachain-0.3.20}/PKG-INFO +1 -2
- {datachain-0.3.18 → datachain-0.3.20}/docs/references/file.md +2 -2
- {datachain-0.3.18 → datachain-0.3.20}/noxfile.py +2 -2
- {datachain-0.3.18 → datachain-0.3.20}/pyproject.toml +0 -1
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/__init__.py +5 -2
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/catalog/catalog.py +28 -128
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/cli.py +0 -1
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/dataset.py +7 -2
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/error.py +6 -4
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/arrow.py +8 -3
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/dc.py +2 -2
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/file.py +23 -5
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/query/dataset.py +0 -22
- {datachain-0.3.18 → datachain-0.3.20/src/datachain.egg-info}/PKG-INFO +1 -2
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain.egg-info/requires.txt +0 -1
- {datachain-0.3.18 → datachain-0.3.20}/tests/conftest.py +1 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/examples/test_examples.py +38 -30
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_catalog.py +22 -142
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_datachain.py +43 -1
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_dataset_query.py +6 -2
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_datasets.py +6 -2
- datachain-0.3.20/tests/func/test_query.py +112 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_arrow.py +8 -9
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_module_exports.py +2 -2
- datachain-0.3.20/tests/unit/test_query.py +65 -0
- datachain-0.3.18/tests/func/test_query.py +0 -182
- {datachain-0.3.18 → datachain-0.3.20}/.cruft.json +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/.gitattributes +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/.github/codecov.yaml +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/.github/dependabot.yml +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/.gitignore +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/.pre-commit-config.yaml +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/LICENSE +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/README.rst +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/docs/assets/datachain.svg +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/docs/index.md +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/docs/references/datachain.md +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/docs/references/datatype.md +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/docs/references/index.md +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/docs/references/sql.md +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/docs/references/torch.md +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/docs/references/udf.md +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/mkdocs.yml +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/setup.cfg +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/__main__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/asyn.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/cache.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/client/hf.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/client/local.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/config.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/job.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/hf.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/listing.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/tar.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/listing.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/node.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/progress.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/py.typed +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/query/params.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/query/schema.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/query/session.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/storage.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/telemetry.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain/utils.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/data.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/examples/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_client.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_listing.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_ls.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_metrics.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_pull.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/test_telemetry.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_client.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_session.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.18 → datachain-0.3.20}/tests/utils.py +0 -0
|
@@ -23,15 +23,18 @@ jobs:
|
|
|
23
23
|
uses: actions/setup-python@v5
|
|
24
24
|
with:
|
|
25
25
|
python-version: '3.12'
|
|
26
|
-
cache: 'pip'
|
|
27
26
|
|
|
28
|
-
- name:
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
27
|
+
- name: Setup uv
|
|
28
|
+
uses: astral-sh/setup-uv@v3
|
|
29
|
+
with:
|
|
30
|
+
enable-cache: true
|
|
31
|
+
cache-suffix: benchmarks
|
|
32
|
+
cache-dependency-glob: pyproject.toml
|
|
33
|
+
|
|
34
|
+
- name: Install nox and dvc
|
|
35
|
+
run: uv pip install dvc[gs] nox --system
|
|
33
36
|
|
|
34
|
-
-
|
|
35
|
-
|
|
37
|
+
- name: Pull dataset
|
|
38
|
+
run: dvc --cd tests/benchmarks/datasets pull
|
|
36
39
|
- name: Run benchmarks
|
|
37
40
|
run: nox -s bench
|
|
@@ -21,17 +21,16 @@ jobs:
|
|
|
21
21
|
with:
|
|
22
22
|
fetch-depth: 0
|
|
23
23
|
|
|
24
|
-
- name: Set up Python 3.
|
|
24
|
+
- name: Set up Python 3.12
|
|
25
25
|
uses: actions/setup-python@v5
|
|
26
26
|
with:
|
|
27
|
-
python-version: '3.
|
|
27
|
+
python-version: '3.12'
|
|
28
28
|
|
|
29
|
-
- name:
|
|
30
|
-
|
|
31
|
-
python -m pip install --upgrade 'nox[uv]'
|
|
32
|
-
nox --version
|
|
33
|
-
uv --version
|
|
29
|
+
- name: Setup uv
|
|
30
|
+
uses: astral-sh/setup-uv@v3
|
|
34
31
|
|
|
32
|
+
- name: Install nox
|
|
33
|
+
run: uv pip install nox --system
|
|
35
34
|
- name: Build package
|
|
36
35
|
run: nox -s build
|
|
37
36
|
|
|
@@ -82,12 +82,15 @@ jobs:
|
|
|
82
82
|
uses: actions/setup-python@v5
|
|
83
83
|
with:
|
|
84
84
|
python-version: ${{ matrix.pyv }}
|
|
85
|
-
cache: 'pip'
|
|
86
85
|
|
|
87
|
-
- name:
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
86
|
+
- name: Setup uv
|
|
87
|
+
uses: astral-sh/setup-uv@v3
|
|
88
|
+
with:
|
|
89
|
+
enable-cache: true
|
|
90
|
+
cache-suffix: studio
|
|
91
|
+
cache-dependency-glob: |
|
|
92
|
+
backend/datachain_server/pyproject.toml
|
|
93
|
+
backend/datachain/pyproject.toml
|
|
91
94
|
|
|
92
95
|
- name: Install dependencies
|
|
93
96
|
run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
|
|
@@ -26,13 +26,16 @@ jobs:
|
|
|
26
26
|
uses: actions/setup-python@v5
|
|
27
27
|
with:
|
|
28
28
|
python-version: '3.9'
|
|
29
|
-
cache: 'pip'
|
|
30
29
|
|
|
31
|
-
- name:
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
30
|
+
- name: Setup uv
|
|
31
|
+
uses: astral-sh/setup-uv@v3
|
|
32
|
+
with:
|
|
33
|
+
enable-cache: true
|
|
34
|
+
cache-suffix: lint
|
|
35
|
+
cache-dependency-glob: pyproject.toml
|
|
36
|
+
|
|
37
|
+
- name: Install nox
|
|
38
|
+
run: uv pip install nox --system
|
|
36
39
|
|
|
37
40
|
- name: Cache mypy
|
|
38
41
|
uses: actions/cache@v4
|
|
@@ -77,13 +80,16 @@ jobs:
|
|
|
77
80
|
uses: actions/setup-python@v5
|
|
78
81
|
with:
|
|
79
82
|
python-version: ${{ matrix.pyv }}
|
|
80
|
-
cache: 'pip'
|
|
81
83
|
|
|
82
|
-
- name:
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
84
|
+
- name: Setup uv
|
|
85
|
+
uses: astral-sh/setup-uv@v3
|
|
86
|
+
with:
|
|
87
|
+
enable-cache: true
|
|
88
|
+
cache-suffix: tests-${{ matrix.pyv }}
|
|
89
|
+
cache-dependency-glob: pyproject.toml
|
|
90
|
+
|
|
91
|
+
- name: Install nox
|
|
92
|
+
run: uv pip install nox --system
|
|
87
93
|
|
|
88
94
|
- name: Skip flaky azure, gs remotes on macOS
|
|
89
95
|
if: runner.os == 'macOS'
|
|
@@ -134,13 +140,16 @@ jobs:
|
|
|
134
140
|
uses: actions/setup-python@v5
|
|
135
141
|
with:
|
|
136
142
|
python-version: ${{ matrix.pyv }}
|
|
137
|
-
cache: 'pip'
|
|
138
143
|
|
|
139
|
-
- name:
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
+
- name: Setup uv
|
|
145
|
+
uses: astral-sh/setup-uv@v3
|
|
146
|
+
with:
|
|
147
|
+
enable-cache: true
|
|
148
|
+
cache-suffix: examples-${{ matrix.pyv }}
|
|
149
|
+
cache-dependency-glob: pyproject.toml
|
|
150
|
+
|
|
151
|
+
- name: Install nox
|
|
152
|
+
run: uv pip install nox --system
|
|
144
153
|
|
|
145
154
|
- name: Run examples
|
|
146
155
|
run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.3.18
|
|
3
|
+
Version: 0.3.20
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -34,7 +34,6 @@ Requires-Dist: dvc-objects<6,>=4
|
|
|
34
34
|
Requires-Dist: shtab<2,>=1.3.4
|
|
35
35
|
Requires-Dist: sqlalchemy>=2
|
|
36
36
|
Requires-Dist: multiprocess==0.70.16
|
|
37
|
-
Requires-Dist: dill==0.3.8
|
|
38
37
|
Requires-Dist: cloudpickle
|
|
39
38
|
Requires-Dist: orjson>=3.10.5
|
|
40
39
|
Requires-Dist: pydantic<3,>=2
|
|
@@ -7,6 +7,8 @@ automatically when creating a `DataChain` from files, like in
|
|
|
7
7
|
classes include various metadata fields about the underlying file as well as methods to
|
|
8
8
|
read from the files and otherwise work with the file contents.
|
|
9
9
|
|
|
10
|
+
::: datachain.lib.file.ArrowRow
|
|
11
|
+
|
|
10
12
|
::: datachain.lib.file.ExportPlacement
|
|
11
13
|
|
|
12
14
|
::: datachain.lib.file.File
|
|
@@ -15,8 +17,6 @@ read from the files and otherwise work with the file contents.
|
|
|
15
17
|
|
|
16
18
|
::: datachain.lib.file.ImageFile
|
|
17
19
|
|
|
18
|
-
::: datachain.lib.file.IndexedFile
|
|
19
|
-
|
|
20
20
|
::: datachain.lib.file.TarVFile
|
|
21
21
|
|
|
22
22
|
::: datachain.lib.file.TextFile
|
|
@@ -57,8 +57,8 @@ def lint(session: nox.Session) -> None:
|
|
|
57
57
|
|
|
58
58
|
@nox.session
|
|
59
59
|
def build(session: nox.Session) -> None:
|
|
60
|
-
session.install("
|
|
61
|
-
session.run("
|
|
60
|
+
session.install("twine", "uv")
|
|
61
|
+
session.run("uv", "build")
|
|
62
62
|
dists = glob.glob("dist/*")
|
|
63
63
|
session.run("twine", "check", *dists, silent=True)
|
|
64
64
|
|
|
@@ -1,21 +1,23 @@
|
|
|
1
1
|
from datachain.lib.data_model import DataModel, DataType, is_chain_type
|
|
2
2
|
from datachain.lib.dc import C, Column, DataChain, Sys
|
|
3
3
|
from datachain.lib.file import (
|
|
4
|
+
ArrowRow,
|
|
4
5
|
File,
|
|
5
6
|
FileError,
|
|
6
7
|
ImageFile,
|
|
7
|
-
IndexedFile,
|
|
8
8
|
TarVFile,
|
|
9
9
|
TextFile,
|
|
10
10
|
)
|
|
11
11
|
from datachain.lib.model_store import ModelStore
|
|
12
12
|
from datachain.lib.udf import Aggregator, Generator, Mapper
|
|
13
13
|
from datachain.lib.utils import AbstractUDF, DataChainError
|
|
14
|
+
from datachain.query import metrics, param
|
|
14
15
|
from datachain.query.session import Session
|
|
15
16
|
|
|
16
17
|
__all__ = [
|
|
17
18
|
"AbstractUDF",
|
|
18
19
|
"Aggregator",
|
|
20
|
+
"ArrowRow",
|
|
19
21
|
"C",
|
|
20
22
|
"Column",
|
|
21
23
|
"DataChain",
|
|
@@ -26,7 +28,6 @@ __all__ = [
|
|
|
26
28
|
"FileError",
|
|
27
29
|
"Generator",
|
|
28
30
|
"ImageFile",
|
|
29
|
-
"IndexedFile",
|
|
30
31
|
"Mapper",
|
|
31
32
|
"ModelStore",
|
|
32
33
|
"Session",
|
|
@@ -34,4 +35,6 @@ __all__ = [
|
|
|
34
35
|
"TarVFile",
|
|
35
36
|
"TextFile",
|
|
36
37
|
"is_chain_type",
|
|
38
|
+
"metrics",
|
|
39
|
+
"param",
|
|
37
40
|
]
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import ast
|
|
2
1
|
import glob
|
|
3
2
|
import io
|
|
4
3
|
import json
|
|
@@ -53,9 +52,9 @@ from datachain.error import (
|
|
|
53
52
|
DataChainError,
|
|
54
53
|
DatasetInvalidVersionError,
|
|
55
54
|
DatasetNotFoundError,
|
|
55
|
+
DatasetVersionNotFoundError,
|
|
56
56
|
PendingIndexingError,
|
|
57
57
|
QueryScriptCancelError,
|
|
58
|
-
QueryScriptCompileError,
|
|
59
58
|
QueryScriptRunError,
|
|
60
59
|
)
|
|
61
60
|
from datachain.listing import Listing
|
|
@@ -80,6 +79,7 @@ if TYPE_CHECKING:
|
|
|
80
79
|
)
|
|
81
80
|
from datachain.dataset import DatasetVersion
|
|
82
81
|
from datachain.job import Job
|
|
82
|
+
from datachain.lib.file import File
|
|
83
83
|
|
|
84
84
|
logger = logging.getLogger("datachain")
|
|
85
85
|
|
|
@@ -588,37 +588,6 @@ class Catalog:
|
|
|
588
588
|
def generate_query_dataset_name(cls) -> str:
|
|
589
589
|
return f"{QUERY_DATASET_PREFIX}_{uuid4().hex}"
|
|
590
590
|
|
|
591
|
-
def attach_query_wrapper(self, code_ast):
|
|
592
|
-
if code_ast.body:
|
|
593
|
-
last_expr = code_ast.body[-1]
|
|
594
|
-
if isinstance(last_expr, ast.Expr):
|
|
595
|
-
new_expressions = [
|
|
596
|
-
ast.Import(
|
|
597
|
-
names=[ast.alias(name="datachain.query.dataset", asname=None)]
|
|
598
|
-
),
|
|
599
|
-
ast.Expr(
|
|
600
|
-
value=ast.Call(
|
|
601
|
-
func=ast.Attribute(
|
|
602
|
-
value=ast.Attribute(
|
|
603
|
-
value=ast.Attribute(
|
|
604
|
-
value=ast.Name(id="datachain", ctx=ast.Load()),
|
|
605
|
-
attr="query",
|
|
606
|
-
ctx=ast.Load(),
|
|
607
|
-
),
|
|
608
|
-
attr="dataset",
|
|
609
|
-
ctx=ast.Load(),
|
|
610
|
-
),
|
|
611
|
-
attr="query_wrapper",
|
|
612
|
-
ctx=ast.Load(),
|
|
613
|
-
),
|
|
614
|
-
args=[last_expr],
|
|
615
|
-
keywords=[],
|
|
616
|
-
)
|
|
617
|
-
),
|
|
618
|
-
]
|
|
619
|
-
code_ast.body[-1:] = new_expressions
|
|
620
|
-
return code_ast
|
|
621
|
-
|
|
622
591
|
def get_client(self, uri: str, **config: Any) -> Client:
|
|
623
592
|
"""
|
|
624
593
|
Return the client corresponding to the given source `uri`.
|
|
@@ -1218,7 +1187,9 @@ class Catalog:
|
|
|
1218
1187
|
|
|
1219
1188
|
dataset_version = dataset.get_version(version)
|
|
1220
1189
|
if not dataset_version:
|
|
1221
|
-
raise
|
|
1190
|
+
raise DatasetVersionNotFoundError(
|
|
1191
|
+
f"Dataset {dataset.name} does not have version {version}"
|
|
1192
|
+
)
|
|
1222
1193
|
|
|
1223
1194
|
if not dataset_version.is_final_status():
|
|
1224
1195
|
raise ValueError("Cannot register dataset version in non final status")
|
|
@@ -1429,65 +1400,34 @@ class Catalog:
|
|
|
1429
1400
|
dataset = self.get_dataset(name)
|
|
1430
1401
|
return self.update_dataset(dataset, **update_data)
|
|
1431
1402
|
|
|
1432
|
-
def
|
|
1433
|
-
self, dataset_name: str, dataset_version: int, row: RowDict
|
|
1434
|
-
) ->
|
|
1403
|
+
def get_file_from_row(
|
|
1404
|
+
self, dataset_name: str, dataset_version: int, row: RowDict, signal_name: str
|
|
1405
|
+
) -> "File":
|
|
1435
1406
|
"""
|
|
1436
|
-
Function that returns file
|
|
1437
|
-
Note that signal names are without prefix, so if there was 'laion__file__source'
|
|
1438
|
-
in original row, result will have just 'source'
|
|
1439
|
-
Example output:
|
|
1440
|
-
{
|
|
1441
|
-
"source": "s3://ldb-public",
|
|
1442
|
-
"path": "animals/dogs/dog.jpg",
|
|
1443
|
-
...
|
|
1444
|
-
}
|
|
1407
|
+
Function that returns specific file signal from dataset row by name.
|
|
1445
1408
|
"""
|
|
1446
1409
|
from datachain.lib.file import File
|
|
1447
1410
|
from datachain.lib.signal_schema import DEFAULT_DELIMITER, SignalSchema
|
|
1448
1411
|
|
|
1449
1412
|
version = self.get_dataset(dataset_name).get_version(dataset_version)
|
|
1450
|
-
|
|
1451
|
-
file_signals_values = RowDict()
|
|
1452
|
-
|
|
1453
1413
|
schema = SignalSchema.deserialize(version.feature_schema)
|
|
1454
|
-
for file_signals in schema.get_signals(File):
|
|
1455
|
-
prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
|
|
1456
|
-
file_signals_values[file_signals] = {
|
|
1457
|
-
c_name.removeprefix(prefix): c_value
|
|
1458
|
-
for c_name, c_value in row.items()
|
|
1459
|
-
if c_name.startswith(prefix)
|
|
1460
|
-
and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
|
|
1461
|
-
}
|
|
1462
|
-
|
|
1463
|
-
if not file_signals_values:
|
|
1464
|
-
return None
|
|
1465
1414
|
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
def open_object(
|
|
1472
|
-
self,
|
|
1473
|
-
dataset_name: str,
|
|
1474
|
-
dataset_version: int,
|
|
1475
|
-
row: RowDict,
|
|
1476
|
-
use_cache: bool = True,
|
|
1477
|
-
**config: Any,
|
|
1478
|
-
):
|
|
1479
|
-
from datachain.lib.file import File
|
|
1415
|
+
if signal_name not in schema.get_signals(File):
|
|
1416
|
+
raise RuntimeError(
|
|
1417
|
+
f"File signal with path {signal_name} not found in ",
|
|
1418
|
+
f"dataset {dataset_name}@v{dataset_version} signals schema",
|
|
1419
|
+
)
|
|
1480
1420
|
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1421
|
+
prefix = signal_name.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
|
|
1422
|
+
file_signals = {
|
|
1423
|
+
c_name.removeprefix(prefix): c_value
|
|
1424
|
+
for c_name, c_value in row.items()
|
|
1425
|
+
if c_name.startswith(prefix)
|
|
1426
|
+
and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
|
|
1427
|
+
and c_name.removeprefix(prefix) in File.model_fields
|
|
1428
|
+
}
|
|
1484
1429
|
|
|
1485
|
-
|
|
1486
|
-
client = self.get_client(file_signals["source"], **config)
|
|
1487
|
-
return client.open_object(
|
|
1488
|
-
File._from_row(file_signals),
|
|
1489
|
-
use_cache=use_cache,
|
|
1490
|
-
)
|
|
1430
|
+
return File(**file_signals)
|
|
1491
1431
|
|
|
1492
1432
|
def ls(
|
|
1493
1433
|
self,
|
|
@@ -1581,7 +1521,7 @@ class Catalog:
|
|
|
1581
1521
|
|
|
1582
1522
|
try:
|
|
1583
1523
|
remote_dataset_version = remote_dataset.get_version(version)
|
|
1584
|
-
except (
|
|
1524
|
+
except (DatasetVersionNotFoundError, StopIteration) as exc:
|
|
1585
1525
|
raise DataChainError(
|
|
1586
1526
|
f"Dataset {remote_dataset_name} doesn't have version {version}"
|
|
1587
1527
|
" on server"
|
|
@@ -1722,64 +1662,24 @@ class Catalog:
|
|
|
1722
1662
|
query_script: str,
|
|
1723
1663
|
env: Optional[Mapping[str, str]] = None,
|
|
1724
1664
|
python_executable: str = sys.executable,
|
|
1725
|
-
|
|
1726
|
-
capture_output: bool = True,
|
|
1665
|
+
capture_output: bool = False,
|
|
1727
1666
|
output_hook: Callable[[str], None] = noop,
|
|
1728
1667
|
params: Optional[dict[str, str]] = None,
|
|
1729
1668
|
job_id: Optional[str] = None,
|
|
1730
|
-
_execute_last_expression: bool = False,
|
|
1731
1669
|
) -> None:
|
|
1732
|
-
"""
|
|
1733
|
-
Method to run custom user Python script to run a query and, as result,
|
|
1734
|
-
creates new dataset from the results of a query.
|
|
1735
|
-
Returns tuple of result dataset and script output.
|
|
1736
|
-
|
|
1737
|
-
Constraints on query script:
|
|
1738
|
-
1. datachain.query.DatasetQuery should be used in order to create query
|
|
1739
|
-
for a dataset
|
|
1740
|
-
2. There should not be any .save() call on DatasetQuery since the idea
|
|
1741
|
-
is to create only one dataset as the outcome of the script
|
|
1742
|
-
3. Last statement must be an instance of DatasetQuery
|
|
1743
|
-
|
|
1744
|
-
If save is set to True, we are creating new dataset with results
|
|
1745
|
-
from dataset query. If it's set to False, we will just print results
|
|
1746
|
-
without saving anything
|
|
1747
|
-
|
|
1748
|
-
Example of query script:
|
|
1749
|
-
from datachain.query import DatasetQuery, C
|
|
1750
|
-
DatasetQuery('s3://ldb-public/remote/datasets/mnist-tiny/').filter(
|
|
1751
|
-
C.size > 1000
|
|
1752
|
-
)
|
|
1753
|
-
"""
|
|
1754
|
-
if _execute_last_expression:
|
|
1755
|
-
try:
|
|
1756
|
-
code_ast = ast.parse(query_script)
|
|
1757
|
-
code_ast = self.attach_query_wrapper(code_ast)
|
|
1758
|
-
query_script_compiled = ast.unparse(code_ast)
|
|
1759
|
-
except Exception as exc:
|
|
1760
|
-
raise QueryScriptCompileError(
|
|
1761
|
-
f"Query script failed to compile, reason: {exc}"
|
|
1762
|
-
) from exc
|
|
1763
|
-
else:
|
|
1764
|
-
query_script_compiled = query_script
|
|
1765
|
-
assert not save
|
|
1766
|
-
|
|
1670
|
+
cmd = [python_executable, "-c", query_script]
|
|
1767
1671
|
env = dict(env or os.environ)
|
|
1768
1672
|
env.update(
|
|
1769
1673
|
{
|
|
1770
1674
|
"DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
|
|
1771
|
-
"PYTHONPATH": os.getcwd(), # For local imports
|
|
1772
|
-
"DATACHAIN_QUERY_SAVE": "1" if save else "",
|
|
1773
|
-
"PYTHONUNBUFFERED": "1",
|
|
1774
1675
|
"DATACHAIN_JOB_ID": job_id or "",
|
|
1775
1676
|
},
|
|
1776
1677
|
)
|
|
1777
|
-
popen_kwargs = {}
|
|
1678
|
+
popen_kwargs: dict[str, Any] = {}
|
|
1778
1679
|
if capture_output:
|
|
1779
1680
|
popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
|
|
1780
1681
|
|
|
1781
|
-
cmd = [python_executable, "-c", query_script_compiled]
|
|
1782
|
-
with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc: # type: ignore[call-overload] # noqa: S603
|
|
1682
|
+
with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc: # noqa: S603
|
|
1783
1683
|
if capture_output:
|
|
1784
1684
|
args = (proc.stdout, output_hook)
|
|
1785
1685
|
thread = Thread(target=_process_stream, args=args, daemon=True)
|
|
@@ -12,6 +12,7 @@ from typing import (
|
|
|
12
12
|
from urllib.parse import urlparse
|
|
13
13
|
|
|
14
14
|
from datachain.client import Client
|
|
15
|
+
from datachain.error import DatasetVersionNotFoundError
|
|
15
16
|
from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
|
|
16
17
|
|
|
17
18
|
if TYPE_CHECKING:
|
|
@@ -417,7 +418,9 @@ class DatasetRecord:
|
|
|
417
418
|
|
|
418
419
|
def get_version(self, version: int) -> DatasetVersion:
|
|
419
420
|
if not self.has_version(version):
|
|
420
|
-
raise
|
|
421
|
+
raise DatasetVersionNotFoundError(
|
|
422
|
+
f"Dataset {self.name} does not have version {version}"
|
|
423
|
+
)
|
|
421
424
|
return next(
|
|
422
425
|
v
|
|
423
426
|
for v in self.versions # type: ignore [union-attr]
|
|
@@ -435,7 +438,9 @@ class DatasetRecord:
|
|
|
435
438
|
Get identifier in the form my-dataset@v3
|
|
436
439
|
"""
|
|
437
440
|
if not self.has_version(version):
|
|
438
|
-
raise
|
|
441
|
+
raise DatasetVersionNotFoundError(
|
|
442
|
+
f"Dataset {self.name} doesn't have a version {version}"
|
|
443
|
+
)
|
|
439
444
|
return f"{self.name}@v{version}"
|
|
440
445
|
|
|
441
446
|
def uri(self, version: int) -> str:
|
|
@@ -10,6 +10,10 @@ class DatasetNotFoundError(NotFoundError):
|
|
|
10
10
|
pass
|
|
11
11
|
|
|
12
12
|
|
|
13
|
+
class DatasetVersionNotFoundError(NotFoundError):
|
|
14
|
+
pass
|
|
15
|
+
|
|
16
|
+
|
|
13
17
|
class DatasetInvalidVersionError(Exception):
|
|
14
18
|
pass
|
|
15
19
|
|
|
@@ -32,14 +36,12 @@ class QueryScriptRunError(Exception):
|
|
|
32
36
|
Attributes:
|
|
33
37
|
message Explanation of the error
|
|
34
38
|
return_code Code returned by the subprocess
|
|
35
|
-
output STDOUT + STDERR output of the subprocess
|
|
36
39
|
"""
|
|
37
40
|
|
|
38
|
-
def __init__(self, message: str, return_code: int = 0
|
|
41
|
+
def __init__(self, message: str, return_code: int = 0):
|
|
39
42
|
self.message = message
|
|
40
43
|
self.return_code = return_code
|
|
41
|
-
|
|
42
|
-
super().__init__(self.message)
|
|
44
|
+
super().__init__(message)
|
|
43
45
|
|
|
44
46
|
|
|
45
47
|
class QueryScriptCancelError(QueryScriptRunError):
|
|
@@ -4,11 +4,11 @@ from tempfile import NamedTemporaryFile
|
|
|
4
4
|
from typing import TYPE_CHECKING, Optional
|
|
5
5
|
|
|
6
6
|
import pyarrow as pa
|
|
7
|
-
from pyarrow.dataset import dataset
|
|
7
|
+
from pyarrow.dataset import CsvFileFormat, dataset
|
|
8
8
|
from tqdm import tqdm
|
|
9
9
|
|
|
10
10
|
from datachain.lib.data_model import dict_to_data_model
|
|
11
|
-
from datachain.lib.file import File, IndexedFile
|
|
11
|
+
from datachain.lib.file import ArrowRow, File
|
|
12
12
|
from datachain.lib.model_store import ModelStore
|
|
13
13
|
from datachain.lib.udf import Generator
|
|
14
14
|
|
|
@@ -84,7 +84,12 @@ class ArrowGenerator(Generator):
|
|
|
84
84
|
vals_dict[field] = val
|
|
85
85
|
vals = [self.output_schema(**vals_dict)]
|
|
86
86
|
if self.source:
|
|
87
|
-
|
|
87
|
+
kwargs: dict = self.kwargs
|
|
88
|
+
# Can't serialize CsvFileFormat; may lose formatting options.
|
|
89
|
+
if isinstance(kwargs.get("format"), CsvFileFormat):
|
|
90
|
+
kwargs["format"] = "csv"
|
|
91
|
+
arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
|
|
92
|
+
yield [arrow_file, *vals]
|
|
88
93
|
else:
|
|
89
94
|
yield vals
|
|
90
95
|
index += 1
|
|
@@ -26,8 +26,8 @@ from datachain.lib.convert.python_to_sql import python_to_sql
|
|
|
26
26
|
from datachain.lib.convert.values_to_tuples import values_to_tuples
|
|
27
27
|
from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
|
|
28
28
|
from datachain.lib.dataset_info import DatasetInfo
|
|
29
|
+
from datachain.lib.file import ArrowRow, File, get_file_type
|
|
29
30
|
from datachain.lib.file import ExportPlacement as FileExportPlacement
|
|
30
|
-
from datachain.lib.file import File, IndexedFile, get_file_type
|
|
31
31
|
from datachain.lib.listing import (
|
|
32
32
|
is_listing_dataset,
|
|
33
33
|
is_listing_expired,
|
|
@@ -1614,7 +1614,7 @@ class DataChain(DatasetQuery):
|
|
|
1614
1614
|
for name, info in output.model_fields.items()
|
|
1615
1615
|
}
|
|
1616
1616
|
if source:
|
|
1617
|
-
output = {"source": IndexedFile} | output # type: ignore[assignment,operator]
|
|
1617
|
+
output = {"source": ArrowRow} | output # type: ignore[assignment,operator]
|
|
1618
1618
|
return self.gen(
|
|
1619
1619
|
ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
|
|
1620
1620
|
)
|
|
@@ -17,6 +17,7 @@ from urllib.request import url2pathname
|
|
|
17
17
|
|
|
18
18
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
19
19
|
from PIL import Image
|
|
20
|
+
from pyarrow.dataset import dataset
|
|
20
21
|
from pydantic import Field, field_validator
|
|
21
22
|
|
|
22
23
|
if TYPE_CHECKING:
|
|
@@ -439,14 +440,31 @@ class ImageFile(File):
|
|
|
439
440
|
self.read().save(destination)
|
|
440
441
|
|
|
441
442
|
|
|
442
|
-
class IndexedFile(DataModel):
|
|
443
|
-
"""
|
|
444
|
-
|
|
445
|
-
Includes `file` and `index` signals.
|
|
446
|
-
"""
|
|
443
|
+
class ArrowRow(DataModel):
|
|
444
|
+
"""`DataModel` for reading row from Arrow-supported file."""
|
|
447
445
|
|
|
448
446
|
file: File
|
|
449
447
|
index: int
|
|
448
|
+
kwargs: dict
|
|
449
|
+
|
|
450
|
+
@contextmanager
|
|
451
|
+
def open(self):
|
|
452
|
+
"""Stream row contents from indexed file."""
|
|
453
|
+
if self.file._caching_enabled:
|
|
454
|
+
self.file.ensure_cached()
|
|
455
|
+
path = self.file.get_local_path()
|
|
456
|
+
ds = dataset(path, **self.kwargs)
|
|
457
|
+
|
|
458
|
+
else:
|
|
459
|
+
path = self.file.get_path()
|
|
460
|
+
ds = dataset(path, filesystem=self.file.get_fs(), **self.kwargs)
|
|
461
|
+
|
|
462
|
+
return ds.take([self.index]).to_reader()
|
|
463
|
+
|
|
464
|
+
def read(self):
|
|
465
|
+
"""Returns row contents as dict."""
|
|
466
|
+
with self.open() as record_batch:
|
|
467
|
+
return record_batch.to_pylist()[0]
|
|
450
468
|
|
|
451
469
|
|
|
452
470
|
def get_file_type(type_: Literal["binary", "text", "image"] = "binary") -> type[File]:
|
|
@@ -1604,25 +1604,3 @@ class DatasetQuery:
|
|
|
1604
1604
|
finally:
|
|
1605
1605
|
self.cleanup()
|
|
1606
1606
|
return self.__class__(name=name, version=version, catalog=self.catalog)
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
def query_wrapper(dataset_query: Any) -> Any:
|
|
1610
|
-
"""
|
|
1611
|
-
Wrapper function that wraps the last statement of user query script.
|
|
1612
|
-
Last statement MUST be instance of DatasetQuery, otherwise script exits with
|
|
1613
|
-
error code 10
|
|
1614
|
-
"""
|
|
1615
|
-
if not isinstance(dataset_query, DatasetQuery):
|
|
1616
|
-
return dataset_query
|
|
1617
|
-
|
|
1618
|
-
catalog = dataset_query.catalog
|
|
1619
|
-
save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
|
|
1620
|
-
|
|
1621
|
-
is_session_temp_dataset = dataset_query.name and dataset_query.name.startswith(
|
|
1622
|
-
dataset_query.session.get_temp_prefix()
|
|
1623
|
-
)
|
|
1624
|
-
|
|
1625
|
-
if save and (is_session_temp_dataset or not dataset_query.attached):
|
|
1626
|
-
name = catalog.generate_query_dataset_name()
|
|
1627
|
-
dataset_query = dataset_query.save(name)
|
|
1628
|
-
return dataset_query
|