datachain 0.5.0__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datachain-0.5.0 → datachain-0.5.1}/.pre-commit-config.yaml +1 -1
- {datachain-0.5.0/src/datachain.egg-info → datachain-0.5.1}/PKG-INFO +1 -1
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/catalog/catalog.py +8 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/metastore.py +20 -1
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/sqlite.py +24 -32
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/arrow.py +64 -19
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/dc.py +113 -10
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/udf.py +100 -78
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/udf_signature.py +8 -6
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/dataset.py +6 -6
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/dispatch.py +2 -2
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/session.py +42 -0
- {datachain-0.5.0 → datachain-0.5.1/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain.egg-info/SOURCES.txt +2 -1
- datachain-0.5.1/tests/scripts/feature_class_exception.py +24 -0
- datachain-0.5.1/tests/test_atomicity.py +58 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_datachain.py +169 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_id_generator.py +18 -0
- datachain-0.5.0/src/datachain/query/udf.py +0 -126
- {datachain-0.5.0 → datachain-0.5.1}/.cruft.json +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.gitattributes +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.github/codecov.yaml +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.github/dependabot.yml +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.github/workflows/release.yml +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.github/workflows/tests.yml +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/.gitignore +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/CONTRIBUTING.rst +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/LICENSE +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/README.rst +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/assets/flowchart.png +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/index.md +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/references/datachain.md +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/references/datatype.md +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/references/file.md +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/references/index.md +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/references/sql.md +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/references/torch.md +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/docs/references/udf.md +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/mkdocs.yml +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/noxfile.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/overrides/main.html +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/pyproject.toml +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/setup.cfg +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/__main__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/asyn.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/cache.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/cli.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/cli_utils.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/local.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/config.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/dataset.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/error.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/job.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/file.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/listing.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/utils.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/listing.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/node.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/progress.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/py.typed +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/batch.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/params.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/schema.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/storage.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain/utils.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/conftest.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/data.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/examples/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_client.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_datachain.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_datasets.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_listing.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_ls.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_pull.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_query.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/test_telemetry.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_client.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_query.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_session.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_storage.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.5.0 → datachain-0.5.1}/tests/utils.py +0 -0
{datachain-0.5.0 → datachain-0.5.1}/src/datachain/catalog/catalog.py

@@ -988,6 +988,14 @@ class Catalog:
         schema = {
             c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
         }
+
+        job_id = job_id or os.getenv("DATACHAIN_JOB_ID")
+        if not job_id:
+            from datachain.query.session import Session
+
+            session = Session.get(catalog=self)
+            job_id = session.job_id
+
         dataset = self.metastore.create_dataset_version(
             dataset,
             version,
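The added block gives dataset-version creation a job-id fallback chain: an explicit argument wins, then the `DATACHAIN_JOB_ID` environment variable, then the id of the current session. A minimal sketch of the same precedence pattern (the function name is illustrative, not part of datachain):

```python
import os
from typing import Optional


def resolve_job_id(job_id: Optional[str], session_job_id: str) -> str:
    # Precedence: explicit argument > environment variable > ambient session.
    return job_id or os.getenv("DATACHAIN_JOB_ID") or session_job_id


assert resolve_job_id("explicit", "from-session") == "explicit"
os.environ["DATACHAIN_JOB_ID"] = "from-env"
assert resolve_job_id(None, "from-session") == "from-env"
del os.environ["DATACHAIN_JOB_ID"]
assert resolve_job_id(None, "from-session") == "from-session"
```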
{datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/metastore.py

@@ -50,7 +50,6 @@ if TYPE_CHECKING:
     from datachain.data_storage import AbstractIDGenerator, schema
     from datachain.data_storage.db_engine import DatabaseEngine

-
 logger = logging.getLogger("datachain")


@@ -384,6 +383,11 @@ class AbstractMetastore(ABC, Serializable):
     ) -> None:
         """Set the status of the given job and dataset."""

+    @abstractmethod
+    def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
+        """Returns dataset names and versions for the job."""
+        raise NotImplementedError
+

 class AbstractDBMetastore(AbstractMetastore):
     """
@@ -1519,3 +1523,18 @@ class AbstractDBMetastore(AbstractMetastore):
             .values(status=dataset_status)
         )
         self.db.execute(query, conn=conn)  # type: ignore[attr-defined]
+
+    def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
+        """Returns dataset names and versions for the job."""
+        dv = self._datasets_versions
+        ds = self._datasets
+
+        join_condition = dv.c.dataset_id == ds.c.id
+
+        query = (
+            self._datasets_versions_select(ds.c.name, dv.c.version)
+            .select_from(dv.join(ds, join_condition))
+            .where(dv.c.job_id == job_id)
+        )
+
+        return list(self.db.execute(query))
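`get_job_dataset_versions` is a plain join-and-filter over the datasets and dataset-versions tables. A self-contained SQLAlchemy Core sketch of the same query shape, using toy tables rather than the metastore's real schema:

```python
import sqlalchemy as sa

metadata = sa.MetaData()
datasets = sa.Table(
    "datasets",
    metadata,
    sa.Column("id", sa.Integer, primary_key=True),
    sa.Column("name", sa.Text),
)
versions = sa.Table(
    "datasets_versions",
    metadata,
    sa.Column("id", sa.Integer, primary_key=True),
    sa.Column("dataset_id", sa.ForeignKey("datasets.id")),
    sa.Column("version", sa.Integer),
    sa.Column("job_id", sa.Text),
)

engine = sa.create_engine("sqlite://")
metadata.create_all(engine)
with engine.begin() as conn:
    conn.execute(datasets.insert(), [{"id": 1, "name": "cats"}])
    conn.execute(
        versions.insert(),
        [
            {"dataset_id": 1, "version": 1, "job_id": "job-1"},
            {"dataset_id": 1, "version": 2, "job_id": "job-2"},
        ],
    )
    # Join versions to datasets, project (name, version), filter by job id.
    query = (
        sa.select(datasets.c.name, versions.c.version)
        .select_from(versions.join(datasets, versions.c.dataset_id == datasets.c.id))
        .where(versions.c.job_id == "job-2")
    )
    assert [tuple(r) for r in conn.execute(query)] == [("cats", 2)]
```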
{datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/sqlite.py

@@ -15,6 +15,7 @@ from typing import (
 )

 import sqlalchemy
+from packaging import version
 from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
@@ -153,7 +154,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
             import sys

-            db.set_trace_callback(sys.stderr.write)
+            db.set_trace_callback(lambda stmt: print(stmt, file=sys.stderr))

         load_usearch_extension(db)

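The old callback wrote each statement straight to `sys.stderr` with no trailing newline, so traced queries ran together on one line; the new lambda routes them through `print`, which appends the newline. `sqlite3.Connection.set_trace_callback` hands the callback each executed statement as a plain string:

```python
import sqlite3
import sys

db = sqlite3.connect(":memory:")
# Each executed SQL statement is passed to the callback as a bare string;
# print() adds the newline that a raw write to stderr would omit.
db.set_trace_callback(lambda stmt: print(stmt, file=sys.stderr))
db.execute("CREATE TABLE t (x INTEGER)")
db.execute("INSERT INTO t VALUES (1)")
```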
@@ -345,45 +346,36 @@ class SQLiteIDGenerator(AbstractDBIDGenerator):
     def get_next_ids(self, uri: str, count: int) -> range:
         """Returns a range of IDs for the given URI."""

-
-
-
-
-        # leaving fallback to the current implementation for older versions of SQLite,
-        # which is still supported, for example, in Ubuntu 20.04 LTS (Focal Fossa),
-        # where SQLite version 3.31.1 is used.
-
-        # sqlite_version = version.parse(sqlite3.sqlite_version)
-        # if sqlite_version >= version.parse("3.35.0"):
-        #     # RETURNING is supported on SQLite 3.35.0 (2021-03-12) or newer
-        #     stmt = (
-        #         sqlite.insert(self._table)
-        #         .values(uri=uri, last_id=count)
-        #         .on_conflict_do_update(
-        #             index_elements=["uri"],
-        #             set_={"last_id": self._table.c.last_id + count},
-        #         )
-        #         .returning(self._table.c.last_id)
-        #     )
-        #     last_id = self._db.execute(stmt).fetchone()[0]
-        # else:
-        #     (fallback to the current implementation with a transaction)
-
-        # Transactions ensure no concurrency conflicts
-        with self._db.transaction() as conn:
-            # UPSERT syntax was added to SQLite with version 3.24.0 (2018-06-04).
-            stmt_ins = (
+        sqlite_version = version.parse(sqlite3.sqlite_version)
+        is_returning_supported = sqlite_version >= version.parse("3.35.0")
+        if is_returning_supported:
+            stmt = (
                 sqlite.insert(self._table)
                 .values(uri=uri, last_id=count)
                 .on_conflict_do_update(
                     index_elements=["uri"],
                     set_={"last_id": self._table.c.last_id + count},
                 )
+                .returning(self._table.c.last_id)
             )
-            self._db.execute(stmt_ins, conn=conn)
+            last_id = self._db.execute(stmt).fetchone()[0]
+        else:
+            # Older versions of SQLite are still the default under Ubuntu LTS,
+            # e.g. Ubuntu 20.04 LTS (Focal Fossa) uses 3.31.1
+            # Transactions ensure no concurrency conflicts
+            with self._db.transaction() as conn:
+                stmt_ins = (
+                    sqlite.insert(self._table)
+                    .values(uri=uri, last_id=count)
+                    .on_conflict_do_update(
+                        index_elements=["uri"],
+                        set_={"last_id": self._table.c.last_id + count},
+                    )
+                )
+                self._db.execute(stmt_ins, conn=conn)

-            stmt_sel = select(self._table.c.last_id).where(self._table.c.uri == uri)
-            last_id = self._db.execute(stmt_sel, conn=conn).fetchone()[0]
+                stmt_sel = select(self._table.c.last_id).where(self._table.c.uri == uri)
+                last_id = self._db.execute(stmt_sel, conn=conn).fetchone()[0]

         return range(last_id - count + 1, last_id + 1)

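This hunk activates the previously commented-out fast path: on SQLite 3.35.0 (2021-03-12) or newer, a single `INSERT ... ON CONFLICT DO UPDATE ... RETURNING` both bumps the counter and reads the new value atomically, while older builds (e.g. the 3.31.1 shipped with Ubuntu 20.04 LTS) keep the upsert-then-select fallback inside one transaction. The same version gate, sketched standalone in raw `sqlite3` (table and function names are illustrative):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE id_gen (uri TEXT PRIMARY KEY, last_id INTEGER)")


def get_next_ids(uri: str, count: int) -> range:
    # Version gate: RETURNING landed in SQLite 3.35.0 (2021-03-12).
    if sqlite3.sqlite_version_info >= (3, 35, 0):
        row = conn.execute(
            "INSERT INTO id_gen (uri, last_id) VALUES (?, ?) "
            "ON CONFLICT (uri) DO UPDATE SET last_id = last_id + ? "
            "RETURNING last_id",
            (uri, count, count),
        ).fetchone()
        last_id = row[0]
        conn.commit()
    else:
        # Fallback: upsert then read back, kept atomic by the transaction.
        # (UPSERT itself needs SQLite >= 3.24.0, 2018-06-04.)
        with conn:
            conn.execute(
                "INSERT INTO id_gen (uri, last_id) VALUES (?, ?) "
                "ON CONFLICT (uri) DO UPDATE SET last_id = last_id + ?",
                (uri, count, count),
            )
            last_id = conn.execute(
                "SELECT last_id FROM id_gen WHERE uri = ?", (uri,)
            ).fetchone()[0]
    return range(last_id - count + 1, last_id + 1)


assert list(get_next_ids("s3://bucket", 3)) == [1, 2, 3]
assert list(get_next_ids("s3://bucket", 2)) == [4, 5]
```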
{datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/arrow.py

@@ -1,8 +1,9 @@
 import re
 from collections.abc import Sequence
 from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Any, Optional

+import orjson
 import pyarrow as pa
 from pyarrow.dataset import CsvFileFormat, dataset
 from tqdm import tqdm
@@ -10,6 +11,7 @@ from tqdm import tqdm
 from datachain.lib.data_model import dict_to_data_model
 from datachain.lib.file import ArrowRow, File
 from datachain.lib.model_store import ModelStore
+from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import Generator

 if TYPE_CHECKING:
@@ -20,6 +22,9 @@ if TYPE_CHECKING:
     from datachain.lib.dc import DataChain


+DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"
+
+
 class ArrowGenerator(Generator):
     def __init__(
         self,
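Arrow/parquet schema metadata is a bytes-to-bytes mapping, which is why the new key is a `bytes` literal. A quick pyarrow check:

```python
import pyarrow as pa

table = pa.table({"x": [1, 2, 3]})
# Schema metadata maps bytes keys to bytes values.
table = table.replace_schema_metadata({b"DataChain SignalSchema": b"{}"})
assert table.schema.metadata[b"DataChain SignalSchema"] == b"{}"
```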
@@ -61,28 +66,35 @@ class ArrowGenerator(Generator):
             path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
         )
         hf_schema = _get_hf_schema(ds.schema)
+        use_datachain_schema = (
+            bool(ds.schema.metadata)
+            and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in ds.schema.metadata
+        )
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
             for record_batch in ds.to_batches():
                 for record in record_batch.to_pylist():
-                    vals = list(record.values())
-                    if self.output_schema:
-                        fields = self.output_schema.model_fields
-                        vals_dict = {}
-                        for i, ((field, field_info), val) in enumerate(
-                            zip(fields.items(), vals)
-                        ):
-                            anno = field_info.annotation
-                            if hf_schema:
-                                from datachain.lib.hf import convert_feature
-
-                                feat = list(hf_schema[0].values())[i]
-                                vals_dict[field] = convert_feature(val, feat, anno)
-                            elif ModelStore.is_pydantic(anno):
-                                vals_dict[field] = anno(**val)  # type: ignore[misc]
-                            else:
-                                vals_dict[field] = val
-                        vals = [self.output_schema(**vals_dict)]
+                    if use_datachain_schema and self.output_schema:
+                        vals = [_nested_model_instantiate(record, self.output_schema)]
+                    else:
+                        vals = list(record.values())
+                        if self.output_schema:
+                            fields = self.output_schema.model_fields
+                            vals_dict = {}
+                            for i, ((field, field_info), val) in enumerate(
+                                zip(fields.items(), vals)
+                            ):
+                                anno = field_info.annotation
+                                if hf_schema:
+                                    from datachain.lib.hf import convert_feature
+
+                                    feat = list(hf_schema[0].values())[i]
+                                    vals_dict[field] = convert_feature(val, feat, anno)
+                                elif ModelStore.is_pydantic(anno):
+                                    vals_dict[field] = anno(**val)  # type: ignore[misc]
+                                else:
+                                    vals_dict[field] = val
+                            vals = [self.output_schema(**vals_dict)]
                     if self.source:
                         kwargs: dict = self.kwargs
                         # Can't serialize CsvFileFormat; may lose formatting options.
@@ -113,6 +125,9 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = None
         )
     if not col_names:
         col_names = schema.names
+    signal_schema = _get_datachain_schema(schema)
+    if signal_schema:
+        return signal_schema.values
     columns = _convert_col_names(col_names)  # type: ignore[arg-type]
     hf_schema = _get_hf_schema(schema)
     if hf_schema:
@@ -197,3 +212,33 @@ def _get_hf_schema(
         features = schema_from_arrow(schema)
         return features, get_output_schema(features)
     return None
+
+
+def _get_datachain_schema(schema: "pa.Schema") -> Optional[SignalSchema]:
+    """Return a restored SignalSchema from parquet metadata, if any is found."""
+    if schema.metadata and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in schema.metadata:
+        serialized_signal_schema = orjson.loads(
+            schema.metadata[DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY]
+        )
+        return SignalSchema.deserialize(serialized_signal_schema)
+    return None
+
+
+def _nested_model_instantiate(
+    column_values: dict[str, Any], model: type["BaseModel"], prefix: str = ""
+) -> "BaseModel":
+    """Instantiate the given model and all sub-models/fields based on the provided
+    column values."""
+    vals_dict = {}
+    for field, field_info in model.model_fields.items():
+        anno = field_info.annotation
+        cur_path = f"{prefix}.{field}" if prefix else field
+        if ModelStore.is_pydantic(anno):
+            vals_dict[field] = _nested_model_instantiate(
+                column_values,
+                anno,  # type: ignore[arg-type]
+                prefix=cur_path,
+            )
+        elif cur_path in column_values:
+            vals_dict[field] = column_values[cur_path]
+    return model(**vals_dict)
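`_nested_model_instantiate` rebuilds nested models from flattened, dot-separated column names by walking the model's fields and extending the prefix on each recursion. A self-contained pydantic sketch of the same traversal (toy models, not datachain's):

```python
from pydantic import BaseModel


class FileInfo(BaseModel):
    path: str
    size: int


class Row(BaseModel):
    file: FileInfo
    label: str


def nested_instantiate(
    columns: dict, model: type[BaseModel], prefix: str = ""
) -> BaseModel:
    vals = {}
    for field, info in model.model_fields.items():
        cur_path = f"{prefix}.{field}" if prefix else field
        anno = info.annotation
        if isinstance(anno, type) and issubclass(anno, BaseModel):
            # Sub-model: recurse with the dotted prefix extended.
            vals[field] = nested_instantiate(columns, anno, prefix=cur_path)
        elif cur_path in columns:
            vals[field] = columns[cur_path]
    return model(**vals)


row = nested_instantiate(
    {"file.path": "a.jpg", "file.size": 123, "label": "cat"}, Row
)
assert isinstance(row, Row) and row.file.size == 123
```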
{datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/dc.py

@@ -16,6 +16,7 @@ from typing import (
     overload,
 )

+import orjson
 import pandas as pd
 import sqlalchemy
 from pydantic import BaseModel
@@ -58,7 +59,7 @@ from datachain.query.dataset import (
 from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
 from datachain.telemetry import telemetry
-from datachain.utils import inside_notebook
+from datachain.utils import batched_it, inside_notebook

 if TYPE_CHECKING:
     from typing_extensions import Concatenate, ParamSpec, Self
@@ -71,6 +72,10 @@ C = Column

 _T = TypeVar("_T")
 D = TypeVar("D", bound="DataChain")
+UDFObjT = TypeVar("UDFObjT", bound=UDFBase)
+
+
+DEFAULT_PARQUET_CHUNK_SIZE = 100_000


 def resolve_columns(
@@ -819,7 +824,7 @@ class DataChain:

     def gen(
         self,
-        func: Optional[Callable] = None,
+        func: Optional[Union[Callable, Generator]] = None,
         params: Union[None, str, Sequence[str]] = None,
         output: OutputType = None,
         **signal_map,
@@ -931,12 +936,12 @@

     def _udf_to_obj(
         self,
-        target_class: type[UDFBase],
-        func: Optional[Callable],
+        target_class: type[UDFObjT],
+        func: Optional[Union[Callable, UDFObjT]],
         params: Union[None, str, Sequence[str]],
         output: OutputType,
         signal_map,
-    ) -> UDFBase:
+    ) -> UDFObjT:
         is_generator = target_class.is_output_batched
         name = self.name or ""

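Typing `target_class` as `type[UDFObjT]` with `UDFObjT = TypeVar("UDFObjT", bound=UDFBase)` makes `_udf_to_obj` generic: the return type follows the argument, so a call with a concrete UDF subclass is typed as that subclass rather than the `UDFBase` upper bound. A minimal illustration of the pattern:

```python
from typing import TypeVar


class UDFBase: ...


class Mapper(UDFBase): ...


UDFObjT = TypeVar("UDFObjT", bound=UDFBase)


def make_udf(target_class: type[UDFObjT]) -> UDFObjT:
    # The return type tracks the argument: make_udf(Mapper) is typed Mapper.
    return target_class()


mapper: Mapper = make_udf(Mapper)  # passes type checking without a cast
```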
@@ -1103,6 +1108,29 @@
         rows = (row_factory(db_signals, r) for r in rows)
         yield from rows

+    def to_columnar_data_with_names(
+        self, chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE
+    ) -> tuple[list[str], Iterator[list[list[Any]]]]:
+        """Returns column names and the results as an iterator that provides chunks,
+        with each chunk containing a list of columns, where each column contains a
+        list of the row values for that column in that chunk. Useful for columnar data
+        formats, such as parquet or other OLAP databases.
+        """
+        headers, _ = self._effective_signals_schema.get_headers_with_length()
+        column_names = [".".join(filter(None, header)) for header in headers]
+
+        results_iter = self.collect_flatten()
+
+        def column_chunks() -> Iterator[list[list[Any]]]:
+            for chunk_iter in batched_it(results_iter, chunk_size):
+                columns: list[list[Any]] = [[] for _ in column_names]
+                for row in chunk_iter:
+                    for i, col in enumerate(columns):
+                        col.append(row[i])
+                yield columns
+
+        return column_names, column_chunks()
+
     @overload
     def results(self) -> list[tuple[Any, ...]]: ...

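`to_columnar_data_with_names` turns the flat row iterator into per-column value lists one chunk at a time, so an arbitrarily large result set never has to be materialized at once. The core transposition, sketched with `itertools.islice` standing in for datachain's `batched_it` helper:

```python
from collections.abc import Iterator
from itertools import islice
from typing import Any


def column_chunks(
    rows: Iterator[tuple], n_cols: int, chunk_size: int
) -> Iterator[list[list[Any]]]:
    while True:
        chunk = list(islice(rows, chunk_size))
        if not chunk:
            return
        # Transpose: one list per column, each holding that column's values.
        columns: list[list[Any]] = [[] for _ in range(n_cols)]
        for row in chunk:
            for i, col in enumerate(columns):
                col.append(row[i])
        yield columns


rows = iter([(1, "a"), (2, "b"), (3, "c")])
assert list(column_chunks(rows, 2, 2)) == [[[1, 2], ["a", "b"]], [[3], ["c"]]]
```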
@@ -1808,21 +1836,96 @@
         self,
         path: Union[str, os.PathLike[str], BinaryIO],
         partition_cols: Optional[Sequence[str]] = None,
+        chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
         **kwargs,
     ) -> None:
-        """Save chain to parquet file.
+        """Save chain to parquet file with SignalSchema metadata.

         Parameters:
             path : Path or a file-like binary object to save the file.
             partition_cols : Column names by which to partition the dataset.
+            chunk_size : The chunk size of results to read and convert to columnar
+                data, to avoid running out of memory.
         """
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+
+        from datachain.lib.arrow import DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY
+
         _partition_cols = list(partition_cols) if partition_cols else None
-        return self.to_pandas().to_parquet(
-            path,
-            partition_cols=_partition_cols,
-            **kwargs,
+        signal_schema_metadata = orjson.dumps(
+            self._effective_signals_schema.serialize()
         )

+        column_names, column_chunks = self.to_columnar_data_with_names(chunk_size)
+
+        parquet_schema = None
+        parquet_writer = None
+        first_chunk = True
+
+        for chunk in column_chunks:
+            # pyarrow infers the best parquet schema from the python types of
+            # the input data.
+            table = pa.Table.from_pydict(
+                dict(zip(column_names, chunk)),
+                schema=parquet_schema,
+            )
+
+            # Preserve any existing metadata, and add the DataChain SignalSchema.
+            existing_metadata = table.schema.metadata or {}
+            merged_metadata = {
+                **existing_metadata,
+                DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY: signal_schema_metadata,
+            }
+            table = table.replace_schema_metadata(merged_metadata)
+            parquet_schema = table.schema
+
+            if _partition_cols:
+                # Write to a partitioned parquet dataset.
+                pq.write_to_dataset(
+                    table,
+                    root_path=path,
+                    partition_cols=_partition_cols,
+                    **kwargs,
+                )
+            else:
+                if first_chunk:
+                    # Write to a single parquet file.
+                    parquet_writer = pq.ParquetWriter(path, parquet_schema, **kwargs)
+                    first_chunk = False
+
+                assert parquet_writer
+                parquet_writer.write_table(table)
+
+        if parquet_writer:
+            parquet_writer.close()
+
+    def to_csv(
+        self,
+        path: Union[str, os.PathLike[str]],
+        delimiter: str = ",",
+        **kwargs,
+    ) -> None:
+        """Save chain to a csv (comma-separated values) file.
+
+        Parameters:
+            path : Path to save the file.
+            delimiter : Delimiter to use for the resulting file.
+        """
+        import csv
+
+        headers, _ = self._effective_signals_schema.get_headers_with_length()
+        column_names = [".".join(filter(None, header)) for header in headers]
+
+        results_iter = self.collect_flatten()
+
+        with open(path, "w", newline="") as f:
+            writer = csv.writer(f, delimiter=delimiter, **kwargs)
+            writer.writerow(column_names)
+
+            for row in results_iter:
+                writer.writerow(row)
+
     @classmethod
     def from_records(
         cls,