datachain 0.3.16__tar.gz → 0.3.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.3.16 → datachain-0.3.18}/.pre-commit-config.yaml +1 -1
- {datachain-0.3.16/src/datachain.egg-info → datachain-0.3.18}/PKG-INFO +5 -3
- datachain-0.3.18/examples/llm_and_nlp/unstructured-embeddings-gen.py +76 -0
- datachain-0.3.16/examples/llm_and_nlp/unstructured-text.py → datachain-0.3.18/examples/llm_and_nlp/unstructured-summary-map.py +7 -3
- {datachain-0.3.16 → datachain-0.3.18}/examples/multimodal/hf_pipeline.py +7 -1
- {datachain-0.3.16 → datachain-0.3.18}/examples/multimodal/openai_image_desc_lib.py +0 -2
- {datachain-0.3.16 → datachain-0.3.18}/pyproject.toml +6 -4
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/cache.py +14 -55
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/catalog/catalog.py +21 -55
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/cli.py +7 -26
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/client/fsspec.py +29 -63
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/client/local.py +2 -3
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/data_storage/metastore.py +7 -66
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/data_storage/sqlite.py +5 -2
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/data_storage/warehouse.py +0 -22
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/arrow.py +2 -1
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/dc.py +5 -2
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/file.py +41 -23
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/listing.py +3 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/tar.py +2 -1
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/listing.py +4 -4
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/node.py +23 -9
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/nodes_fetcher.py +12 -5
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/nodes_thread_pool.py +1 -1
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/progress.py +2 -12
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/query/__init__.py +0 -2
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/query/dataset.py +26 -144
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/query/dispatch.py +2 -15
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/query/schema.py +36 -24
- datachain-0.3.18/src/datachain/query/udf.py +126 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/types.py +4 -2
- datachain-0.3.18/src/datachain/telemetry.py +37 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/utils.py +11 -40
- {datachain-0.3.16 → datachain-0.3.18/src/datachain.egg-info}/PKG-INFO +5 -3
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain.egg-info/SOURCES.txt +4 -3
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain.egg-info/requires.txt +4 -2
- {datachain-0.3.16 → datachain-0.3.18}/tests/conftest.py +20 -9
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_catalog.py +0 -116
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_datachain.py +627 -12
- datachain-0.3.18/tests/func/test_dataset_query.py +1195 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_datasets.py +102 -91
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_feature_pickling.py +0 -8
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_pull.py +23 -11
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_query.py +16 -10
- {datachain-0.3.16 → datachain-0.3.18}/tests/scripts/name_len_slow.py +9 -15
- {datachain-0.3.16 → datachain-0.3.18}/tests/test_cli_e2e.py +1 -0
- datachain-0.3.18/tests/test_telemetry.py +20 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_datachain.py +15 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_datachain_merge.py +98 -1
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_file.py +3 -26
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_cache.py +9 -4
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_data_storage.py +18 -11
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_utils.py +0 -25
- {datachain-0.3.16 → datachain-0.3.18}/tests/utils.py +22 -63
- datachain-0.3.16/src/datachain/query/builtins.py +0 -96
- datachain-0.3.16/src/datachain/query/udf.py +0 -272
- datachain-0.3.16/tests/func/test_dataset_query.py +0 -3580
- datachain-0.3.16/tests/unit/test_udf.py +0 -98
- {datachain-0.3.16 → datachain-0.3.18}/.cruft.json +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.gitattributes +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.github/codecov.yaml +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.github/dependabot.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.github/workflows/release.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.github/workflows/tests.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/.gitignore +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/LICENSE +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/README.rst +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/assets/datachain.svg +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/index.md +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/references/datachain.md +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/references/datatype.md +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/references/file.md +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/references/index.md +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/references/sql.md +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/references/torch.md +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/docs/references/udf.md +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/mkdocs.yml +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/noxfile.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/setup.cfg +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/__main__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/asyn.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/client/hf.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/config.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/dataset.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/error.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/job.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/hf.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/py.typed +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/query/params.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/query/session.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/storage.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/data.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/examples/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_client.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_listing.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_ls.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_metrics.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_client.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_session.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.16 → datachain-0.3.18}/tests/unit/test_warehouse.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.3.16
|
|
3
|
+
Version: 0.3.18
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -44,6 +44,7 @@ Requires-Dist: Pillow<11,>=10.0.0
|
|
|
44
44
|
Requires-Dist: msgpack<2,>=1.0.4
|
|
45
45
|
Requires-Dist: psutil
|
|
46
46
|
Requires-Dist: huggingface_hub
|
|
47
|
+
Requires-Dist: iterative-telemetry>=0.0.9
|
|
47
48
|
Provides-Extra: docs
|
|
48
49
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
49
50
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
@@ -69,7 +70,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
|
69
70
|
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
70
71
|
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
71
72
|
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
72
|
-
Requires-Dist: pytest-servers[all]>=0.5.
|
|
73
|
+
Requires-Dist: pytest-servers[all]>=0.5.7; extra == "tests"
|
|
73
74
|
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
74
75
|
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
75
76
|
Requires-Dist: virtualenv; extra == "tests"
|
|
@@ -91,9 +92,10 @@ Requires-Dist: datachain[tests]; extra == "examples"
|
|
|
91
92
|
Requires-Dist: numpy<2,>=1; extra == "examples"
|
|
92
93
|
Requires-Dist: defusedxml; extra == "examples"
|
|
93
94
|
Requires-Dist: accelerate; extra == "examples"
|
|
94
|
-
Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
95
|
+
Requires-Dist: unstructured[embed-huggingface,pdf]; extra == "examples"
|
|
95
96
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
96
97
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
98
|
+
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
97
99
|
|
|
98
100
|
================
|
|
99
101
|
|logo| DataChain
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""
|
|
2
|
+
To install the required dependencies:
|
|
3
|
+
|
|
4
|
+
pip install datachain[examples]
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from collections.abc import Iterator
|
|
9
|
+
|
|
10
|
+
from unstructured.cleaners.core import (
|
|
11
|
+
clean,
|
|
12
|
+
group_broken_paragraphs,
|
|
13
|
+
replace_unicode_quotes,
|
|
14
|
+
)
|
|
15
|
+
from unstructured.embed.huggingface import (
|
|
16
|
+
HuggingFaceEmbeddingConfig,
|
|
17
|
+
HuggingFaceEmbeddingEncoder,
|
|
18
|
+
)
|
|
19
|
+
from unstructured.partition.pdf import partition_pdf
|
|
20
|
+
|
|
21
|
+
from datachain import C, DataChain, DataModel, File
|
|
22
|
+
|
|
23
|
+
source = "gs://datachain-demo/neurips/1987/"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Define the output as a DataModel class
|
|
27
|
+
class Chunk(DataModel):
|
|
28
|
+
key: str
|
|
29
|
+
text: str
|
|
30
|
+
embeddings: list[float]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Define embedding encoder
|
|
34
|
+
|
|
35
|
+
embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Use signatures to define UDF input/output
|
|
39
|
+
# these can be pydantic model or regular Python types
|
|
40
|
+
def process_pdf(file: File) -> Iterator[Chunk]:
|
|
41
|
+
# Ingest the file
|
|
42
|
+
with file.open() as f:
|
|
43
|
+
chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
|
|
44
|
+
|
|
45
|
+
# Clean the chunks and add new columns
|
|
46
|
+
for chunk in chunks:
|
|
47
|
+
chunk.apply(
|
|
48
|
+
lambda text: clean(
|
|
49
|
+
text, bullets=True, extra_whitespace=True, trailing_punctuation=True
|
|
50
|
+
)
|
|
51
|
+
)
|
|
52
|
+
chunk.apply(replace_unicode_quotes)
|
|
53
|
+
chunk.apply(group_broken_paragraphs)
|
|
54
|
+
|
|
55
|
+
# create embeddings
|
|
56
|
+
chunks_embedded = embedding_encoder.embed_documents(chunks)
|
|
57
|
+
|
|
58
|
+
# Add new rows to DataChain
|
|
59
|
+
for chunk in chunks_embedded:
|
|
60
|
+
yield Chunk(
|
|
61
|
+
key=file.path,
|
|
62
|
+
text=chunk.text,
|
|
63
|
+
embeddings=chunk.embeddings,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
dc = (
|
|
68
|
+
DataChain.from_storage(source)
|
|
69
|
+
.settings(parallel=-1)
|
|
70
|
+
.filter(C.file.path.glob("*.pdf"))
|
|
71
|
+
.gen(document=process_pdf)
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
dc.save("embedded-documents")
|
|
75
|
+
|
|
76
|
+
DataChain.from_dataset("embedded-documents").show()
|
|
@@ -46,7 +46,8 @@ dependencies = [
|
|
|
46
46
|
"Pillow>=10.0.0,<11",
|
|
47
47
|
"msgpack>=1.0.4,<2",
|
|
48
48
|
"psutil",
|
|
49
|
-
"huggingface_hub"
|
|
49
|
+
"huggingface_hub",
|
|
50
|
+
"iterative-telemetry>=0.0.9"
|
|
50
51
|
]
|
|
51
52
|
|
|
52
53
|
[project.optional-dependencies]
|
|
@@ -80,7 +81,7 @@ tests = [
|
|
|
80
81
|
"pytest-sugar>=0.9.6",
|
|
81
82
|
"pytest-cov>=4.1.0",
|
|
82
83
|
"pytest-mock>=3.12.0",
|
|
83
|
-
"pytest-servers[all]>=0.5.
|
|
84
|
+
"pytest-servers[all]>=0.5.7",
|
|
84
85
|
"pytest-benchmark[histogram]",
|
|
85
86
|
"pytest-xdist>=3.3.1",
|
|
86
87
|
"virtualenv",
|
|
@@ -104,9 +105,10 @@ examples = [
|
|
|
104
105
|
"numpy>=1,<2",
|
|
105
106
|
"defusedxml",
|
|
106
107
|
"accelerate",
|
|
107
|
-
"unstructured[pdf]",
|
|
108
|
+
"unstructured[pdf, embed-huggingface]",
|
|
108
109
|
"pdfplumber==0.11.4",
|
|
109
|
-
"huggingface_hub[hf_transfer]"
|
|
110
|
+
"huggingface_hub[hf_transfer]",
|
|
111
|
+
"onnx==1.16.1"
|
|
110
112
|
]
|
|
111
113
|
|
|
112
114
|
[project.urls]
|
|
@@ -1,56 +1,15 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
import json
|
|
3
1
|
import os
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from functools import partial
|
|
6
2
|
from typing import TYPE_CHECKING, Optional
|
|
7
3
|
|
|
8
|
-
import attrs
|
|
9
4
|
from dvc_data.hashfile.db.local import LocalHashFileDB
|
|
10
5
|
from dvc_objects.fs.local import LocalFileSystem
|
|
11
6
|
from fsspec.callbacks import Callback, TqdmCallback
|
|
12
7
|
|
|
13
|
-
from datachain.utils import TIME_ZERO
|
|
14
|
-
|
|
15
8
|
from .progress import Tqdm
|
|
16
9
|
|
|
17
10
|
if TYPE_CHECKING:
|
|
18
11
|
from datachain.client import Client
|
|
19
|
-
from datachain.storage import StorageURI
|
|
20
|
-
|
|
21
|
-
sha256 = partial(hashlib.sha256, usedforsecurity=False)
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
@attrs.frozen
|
|
25
|
-
class UniqueId:
|
|
26
|
-
storage: "StorageURI"
|
|
27
|
-
path: str
|
|
28
|
-
size: int
|
|
29
|
-
etag: str
|
|
30
|
-
version: str = ""
|
|
31
|
-
is_latest: bool = True
|
|
32
|
-
location: Optional[str] = None
|
|
33
|
-
last_modified: datetime = TIME_ZERO
|
|
34
|
-
|
|
35
|
-
def get_parsed_location(self) -> Optional[dict]:
|
|
36
|
-
if not self.location:
|
|
37
|
-
return None
|
|
38
|
-
|
|
39
|
-
loc_stack = (
|
|
40
|
-
json.loads(self.location)
|
|
41
|
-
if isinstance(self.location, str)
|
|
42
|
-
else self.location
|
|
43
|
-
)
|
|
44
|
-
if len(loc_stack) > 1:
|
|
45
|
-
raise NotImplementedError("Nested v-objects are not supported yet.")
|
|
46
|
-
|
|
47
|
-
return loc_stack[0]
|
|
48
|
-
|
|
49
|
-
def get_hash(self) -> str:
|
|
50
|
-
fingerprint = f"{self.storage}/{self.path}/{self.version}/{self.etag}"
|
|
51
|
-
if self.location:
|
|
52
|
-
fingerprint += f"/{self.location}"
|
|
53
|
-
return sha256(fingerprint.encode()).hexdigest()
|
|
12
|
+
from datachain.lib.file import File
|
|
54
13
|
|
|
55
14
|
|
|
56
15
|
def try_scandir(path):
|
|
@@ -77,30 +36,30 @@ class DataChainCache:
|
|
|
77
36
|
def tmp_dir(self):
|
|
78
37
|
return self.odb.tmp_dir
|
|
79
38
|
|
|
80
|
-
def get_path(self,
|
|
81
|
-
if self.contains(
|
|
82
|
-
return self.path_from_checksum(
|
|
39
|
+
def get_path(self, file: "File") -> Optional[str]:
|
|
40
|
+
if self.contains(file):
|
|
41
|
+
return self.path_from_checksum(file.get_hash())
|
|
83
42
|
return None
|
|
84
43
|
|
|
85
|
-
def contains(self,
|
|
86
|
-
return self.odb.exists(
|
|
44
|
+
def contains(self, file: "File") -> bool:
|
|
45
|
+
return self.odb.exists(file.get_hash())
|
|
87
46
|
|
|
88
47
|
def path_from_checksum(self, checksum: str) -> str:
|
|
89
48
|
assert checksum
|
|
90
49
|
return self.odb.oid_to_path(checksum)
|
|
91
50
|
|
|
92
|
-
def remove(self,
|
|
93
|
-
self.odb.delete(
|
|
51
|
+
def remove(self, file: "File") -> None:
|
|
52
|
+
self.odb.delete(file.get_hash())
|
|
94
53
|
|
|
95
54
|
async def download(
|
|
96
|
-
self,
|
|
55
|
+
self, file: "File", client: "Client", callback: Optional[Callback] = None
|
|
97
56
|
) -> None:
|
|
98
|
-
from_path = f"{
|
|
57
|
+
from_path = f"{file.source}/{file.path}"
|
|
99
58
|
from dvc_objects.fs.utils import tmp_fname
|
|
100
59
|
|
|
101
60
|
odb_fs = self.odb.fs
|
|
102
61
|
tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
|
|
103
|
-
size =
|
|
62
|
+
size = file.size
|
|
104
63
|
if size < 0:
|
|
105
64
|
size = await client.get_size(from_path)
|
|
106
65
|
cb = callback or TqdmCallback(
|
|
@@ -115,13 +74,13 @@ class DataChainCache:
|
|
|
115
74
|
cb.close()
|
|
116
75
|
|
|
117
76
|
try:
|
|
118
|
-
oid =
|
|
77
|
+
oid = file.get_hash()
|
|
119
78
|
self.odb.add(tmp_info, self.odb.fs, oid)
|
|
120
79
|
finally:
|
|
121
80
|
os.unlink(tmp_info)
|
|
122
81
|
|
|
123
|
-
def store_data(self,
|
|
124
|
-
checksum =
|
|
82
|
+
def store_data(self, file: "File", contents: bytes) -> None:
|
|
83
|
+
checksum = file.get_hash()
|
|
125
84
|
dst = self.path_from_checksum(checksum)
|
|
126
85
|
if not os.path.exists(dst):
|
|
127
86
|
# Create the file only if it's not already in cache
|
|
@@ -34,7 +34,7 @@ import yaml
|
|
|
34
34
|
from sqlalchemy import Column
|
|
35
35
|
from tqdm import tqdm
|
|
36
36
|
|
|
37
|
-
from datachain.cache import DataChainCache, UniqueId
|
|
37
|
+
from datachain.cache import DataChainCache
|
|
38
38
|
from datachain.client import Client
|
|
39
39
|
from datachain.config import get_remote_config, read_config
|
|
40
40
|
from datachain.dataset import (
|
|
@@ -68,8 +68,6 @@ from datachain.utils import (
|
|
|
68
68
|
DataChainDir,
|
|
69
69
|
batched,
|
|
70
70
|
datachain_paths_join,
|
|
71
|
-
import_object,
|
|
72
|
-
parse_params_string,
|
|
73
71
|
)
|
|
74
72
|
|
|
75
73
|
from .datasource import DataSource
|
|
@@ -621,13 +619,13 @@ class Catalog:
|
|
|
621
619
|
code_ast.body[-1:] = new_expressions
|
|
622
620
|
return code_ast
|
|
623
621
|
|
|
624
|
-
def get_client(self, uri: StorageURI, **config: Any) -> Client:
|
|
622
|
+
def get_client(self, uri: str, **config: Any) -> Client:
|
|
625
623
|
"""
|
|
626
624
|
Return the client corresponding to the given source `uri`.
|
|
627
625
|
"""
|
|
628
626
|
config = config or self.client_config
|
|
629
627
|
cls = Client.get_implementation(uri)
|
|
630
|
-
return cls.from_source(uri, self.cache, **config)
|
|
628
|
+
return cls.from_source(StorageURI(uri), self.cache, **config)
|
|
631
629
|
|
|
632
630
|
def enlist_source(
|
|
633
631
|
self,
|
|
@@ -843,7 +841,7 @@ class Catalog:
|
|
|
843
841
|
from datachain.query import DatasetQuery
|
|
844
842
|
|
|
845
843
|
def _row_to_node(d: dict[str, Any]) -> Node:
|
|
846
|
-
del d["
|
|
844
|
+
del d["file__source"]
|
|
847
845
|
return Node.from_dict(d)
|
|
848
846
|
|
|
849
847
|
enlisted_sources: list[tuple[bool, bool, Any]] = []
|
|
@@ -1148,30 +1146,28 @@ class Catalog:
|
|
|
1148
1146
|
if not sources:
|
|
1149
1147
|
raise ValueError("Sources needs to be non empty list")
|
|
1150
1148
|
|
|
1151
|
-
from datachain.
|
|
1149
|
+
from datachain.lib.dc import DataChain
|
|
1150
|
+
from datachain.query.session import Session
|
|
1151
|
+
|
|
1152
|
+
session = Session.get(catalog=self, client_config=client_config)
|
|
1152
1153
|
|
|
1153
|
-
|
|
1154
|
+
chains = []
|
|
1154
1155
|
for source in sources:
|
|
1155
1156
|
if source.startswith(DATASET_PREFIX):
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
catalog=self,
|
|
1159
|
-
client_config=client_config,
|
|
1157
|
+
dc = DataChain.from_dataset(
|
|
1158
|
+
source[len(DATASET_PREFIX) :], session=session
|
|
1160
1159
|
)
|
|
1161
1160
|
else:
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
catalog=self,
|
|
1165
|
-
client_config=client_config,
|
|
1166
|
-
recursive=recursive,
|
|
1161
|
+
dc = DataChain.from_storage(
|
|
1162
|
+
source, session=session, recursive=recursive
|
|
1167
1163
|
)
|
|
1168
1164
|
|
|
1169
|
-
|
|
1165
|
+
chains.append(dc)
|
|
1170
1166
|
|
|
1171
1167
|
# create union of all dataset queries created from sources
|
|
1172
|
-
|
|
1168
|
+
dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
|
|
1173
1169
|
try:
|
|
1174
|
-
|
|
1170
|
+
dc.save(name)
|
|
1175
1171
|
except Exception as e: # noqa: BLE001
|
|
1176
1172
|
try:
|
|
1177
1173
|
ds = self.get_dataset(name)
|
|
@@ -1435,7 +1431,7 @@ class Catalog:
|
|
|
1435
1431
|
|
|
1436
1432
|
def get_file_signals(
|
|
1437
1433
|
self, dataset_name: str, dataset_version: int, row: RowDict
|
|
1438
|
-
) -> Optional[
|
|
1434
|
+
) -> Optional[RowDict]:
|
|
1439
1435
|
"""
|
|
1440
1436
|
Function that returns file signals from dataset row.
|
|
1441
1437
|
Note that signal names are without prefix, so if there was 'laion__file__source'
|
|
@@ -1452,7 +1448,7 @@ class Catalog:
|
|
|
1452
1448
|
|
|
1453
1449
|
version = self.get_dataset(dataset_name).get_version(dataset_version)
|
|
1454
1450
|
|
|
1455
|
-
file_signals_values =
|
|
1451
|
+
file_signals_values = RowDict()
|
|
1456
1452
|
|
|
1457
1453
|
schema = SignalSchema.deserialize(version.feature_schema)
|
|
1458
1454
|
for file_signals in schema.get_signals(File):
|
|
@@ -1480,6 +1476,8 @@ class Catalog:
|
|
|
1480
1476
|
use_cache: bool = True,
|
|
1481
1477
|
**config: Any,
|
|
1482
1478
|
):
|
|
1479
|
+
from datachain.lib.file import File
|
|
1480
|
+
|
|
1483
1481
|
file_signals = self.get_file_signals(dataset_name, dataset_version, row)
|
|
1484
1482
|
if not file_signals:
|
|
1485
1483
|
raise RuntimeError("Cannot open object without file signals")
|
|
@@ -1487,22 +1485,10 @@ class Catalog:
|
|
|
1487
1485
|
config = config or self.client_config
|
|
1488
1486
|
client = self.get_client(file_signals["source"], **config)
|
|
1489
1487
|
return client.open_object(
|
|
1490
|
-
|
|
1488
|
+
File._from_row(file_signals),
|
|
1491
1489
|
use_cache=use_cache,
|
|
1492
1490
|
)
|
|
1493
1491
|
|
|
1494
|
-
def _get_row_uid(self, row: RowDict) -> UniqueId:
|
|
1495
|
-
return UniqueId(
|
|
1496
|
-
row["source"],
|
|
1497
|
-
row["path"],
|
|
1498
|
-
row["size"],
|
|
1499
|
-
row["etag"],
|
|
1500
|
-
row["version"],
|
|
1501
|
-
row["is_latest"],
|
|
1502
|
-
row["location"],
|
|
1503
|
-
row["last_modified"],
|
|
1504
|
-
)
|
|
1505
|
-
|
|
1506
1492
|
def ls(
|
|
1507
1493
|
self,
|
|
1508
1494
|
sources: list[str],
|
|
@@ -1731,26 +1717,6 @@ class Catalog:
|
|
|
1731
1717
|
output, sources, client_config=client_config, recursive=recursive
|
|
1732
1718
|
)
|
|
1733
1719
|
|
|
1734
|
-
def apply_udf(
|
|
1735
|
-
self,
|
|
1736
|
-
udf_location: str,
|
|
1737
|
-
source: str,
|
|
1738
|
-
target_name: str,
|
|
1739
|
-
parallel: Optional[int] = None,
|
|
1740
|
-
params: Optional[str] = None,
|
|
1741
|
-
):
|
|
1742
|
-
from datachain.query import DatasetQuery
|
|
1743
|
-
|
|
1744
|
-
if source.startswith(DATASET_PREFIX):
|
|
1745
|
-
ds = DatasetQuery(name=source[len(DATASET_PREFIX) :], catalog=self)
|
|
1746
|
-
else:
|
|
1747
|
-
ds = DatasetQuery(path=source, catalog=self)
|
|
1748
|
-
udf = import_object(udf_location)
|
|
1749
|
-
if params:
|
|
1750
|
-
args, kwargs = parse_params_string(params)
|
|
1751
|
-
udf = udf(*args, **kwargs)
|
|
1752
|
-
ds.add_signals(udf, parallel=parallel).save(target_name)
|
|
1753
|
-
|
|
1754
1720
|
def query(
|
|
1755
1721
|
self,
|
|
1756
1722
|
query_script: str,
|
|
@@ -15,6 +15,7 @@ import shtab
|
|
|
15
15
|
from datachain import utils
|
|
16
16
|
from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
|
|
17
17
|
from datachain.lib.dc import DataChain
|
|
18
|
+
from datachain.telemetry import telemetry
|
|
18
19
|
from datachain.utils import DataChainDir
|
|
19
20
|
|
|
20
21
|
if TYPE_CHECKING:
|
|
@@ -494,27 +495,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
|
494
495
|
help="Query parameters",
|
|
495
496
|
)
|
|
496
497
|
|
|
497
|
-
apply_udf_parser = subp.add_parser(
|
|
498
|
-
"apply-udf", parents=[parent_parser], description="Apply UDF"
|
|
499
|
-
)
|
|
500
|
-
apply_udf_parser.add_argument("udf", type=str, help="UDF location")
|
|
501
|
-
apply_udf_parser.add_argument("source", type=str, help="Source storage or dataset")
|
|
502
|
-
apply_udf_parser.add_argument("target", type=str, help="Target dataset name")
|
|
503
|
-
apply_udf_parser.add_argument(
|
|
504
|
-
"--parallel",
|
|
505
|
-
nargs="?",
|
|
506
|
-
type=int,
|
|
507
|
-
const=-1,
|
|
508
|
-
default=None,
|
|
509
|
-
metavar="N",
|
|
510
|
-
help=(
|
|
511
|
-
"Use multiprocessing to run the UDF with N worker processes. "
|
|
512
|
-
"N defaults to the CPU count."
|
|
513
|
-
),
|
|
514
|
-
)
|
|
515
|
-
apply_udf_parser.add_argument(
|
|
516
|
-
"--udf-params", type=str, default=None, help="UDF class parameters"
|
|
517
|
-
)
|
|
518
498
|
subp.add_parser(
|
|
519
499
|
"clear-cache", parents=[parent_parser], description="Clear the local file cache"
|
|
520
500
|
)
|
|
@@ -893,6 +873,7 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
893
873
|
# This also sets this environment variable for any subprocesses
|
|
894
874
|
os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"
|
|
895
875
|
|
|
876
|
+
error = None
|
|
896
877
|
try:
|
|
897
878
|
catalog = get_catalog(client_config=client_config)
|
|
898
879
|
if args.command == "cp":
|
|
@@ -1016,10 +997,6 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
1016
997
|
parallel=args.parallel,
|
|
1017
998
|
params=args.param,
|
|
1018
999
|
)
|
|
1019
|
-
elif args.command == "apply-udf":
|
|
1020
|
-
catalog.apply_udf(
|
|
1021
|
-
args.udf, args.source, args.target, args.parallel, args.udf_params
|
|
1022
|
-
)
|
|
1023
1000
|
elif args.command == "clear-cache":
|
|
1024
1001
|
clear_cache(catalog)
|
|
1025
1002
|
elif args.command == "gc":
|
|
@@ -1028,14 +1005,16 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
1028
1005
|
print(f"invalid command: {args.command}", file=sys.stderr)
|
|
1029
1006
|
return 1
|
|
1030
1007
|
return 0
|
|
1031
|
-
except BrokenPipeError:
|
|
1008
|
+
except BrokenPipeError as exc:
|
|
1032
1009
|
# Python flushes standard streams on exit; redirect remaining output
|
|
1033
1010
|
# to devnull to avoid another BrokenPipeError at shutdown
|
|
1034
1011
|
# See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
|
|
1012
|
+
error = str(exc)
|
|
1035
1013
|
devnull = os.open(os.devnull, os.O_WRONLY)
|
|
1036
1014
|
os.dup2(devnull, sys.stdout.fileno())
|
|
1037
1015
|
return 141 # 128 + 13 (SIGPIPE)
|
|
1038
1016
|
except (KeyboardInterrupt, Exception) as exc:
|
|
1017
|
+
error = str(exc)
|
|
1039
1018
|
if isinstance(exc, KeyboardInterrupt):
|
|
1040
1019
|
msg = "Operation cancelled by the user"
|
|
1041
1020
|
else:
|
|
@@ -1053,3 +1032,5 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
1053
1032
|
|
|
1054
1033
|
pdb.post_mortem()
|
|
1055
1034
|
return 1
|
|
1035
|
+
finally:
|
|
1036
|
+
telemetry.send_cli_call(args.command, error=error)
|