datachain 0.6.2__tar.gz → 0.6.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.6.2 → datachain-0.6.3}/.github/workflows/tests-studio.yml +1 -1
- {datachain-0.6.2 → datachain-0.6.3}/.pre-commit-config.yaml +1 -1
- {datachain-0.6.2/src/datachain.egg-info → datachain-0.6.3}/PKG-INFO +2 -2
- {datachain-0.6.2 → datachain-0.6.3}/noxfile.py +1 -2
- {datachain-0.6.2 → datachain-0.6.3}/pyproject.toml +4 -5
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/arrow.py +2 -15
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/data_model.py +10 -2
- datachain-0.6.3/src/datachain/lib/utils.py +60 -0
- {datachain-0.6.2 → datachain-0.6.3/src/datachain.egg-info}/PKG-INFO +2 -2
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain.egg-info/requires.txt +1 -1
- datachain-0.6.3/tests/benchmarks/conftest.py +8 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/test_datachain.py +0 -3
- datachain-0.6.3/tests/benchmarks/test_ls.py +6 -0
- datachain-0.6.3/tests/benchmarks/test_version.py +7 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_arrow.py +11 -3
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_datachain.py +34 -1
- datachain-0.6.3/tests/unit/lib/test_utils.py +128 -0
- datachain-0.6.2/src/datachain/lib/utils.py +0 -30
- datachain-0.6.2/tests/benchmarks/conftest.py +0 -137
- datachain-0.6.2/tests/benchmarks/test_ls.py +0 -2
- datachain-0.6.2/tests/benchmarks/test_version.py +0 -2
- datachain-0.6.2/tests/unit/lib/test_utils.py +0 -58
- {datachain-0.6.2 → datachain-0.6.3}/.cruft.json +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/.gitattributes +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/.github/codecov.yaml +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/.github/dependabot.yml +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/.github/workflows/release.yml +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/.github/workflows/tests.yml +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/.gitignore +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/CONTRIBUTING.rst +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/LICENSE +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/README.rst +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/assets/datachain.svg +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/assets/flowchart.png +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/index.md +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/references/datachain.md +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/references/datatype.md +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/references/file.md +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/references/index.md +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/references/sql.md +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/references/torch.md +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/docs/references/udf.md +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/multimodal/wds.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/mkdocs.yml +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/overrides/main.html +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/setup.cfg +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/__main__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/asyn.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/cache.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/cli.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/cli_utils.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/azure.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/gcs.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/hf.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/local.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/s3.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/config.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/dataset.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/error.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/job.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/clip.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/dc.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/file.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/func/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/func/aggregate.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/func/func.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/hf.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/image.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/listing.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/settings.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/tar.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/text.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/udf.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/listing.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/node.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/progress.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/py.typed +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/batch.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/dataset.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/metrics.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/params.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/queue.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/schema.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/session.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/remote/studio.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/types.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/utils.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/storage.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/studio.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/telemetry.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain/utils.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/conftest.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/data.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/examples/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/examples/test_examples.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/examples/wds_data.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_catalog.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_client.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_datachain.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_datasets.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_listing.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_ls.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_metrics.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_pull.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_pytorch.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_query.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/scripts/feature_class.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/test_atomicity.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/test_cli_e2e.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/test_cli_studio.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/test_query_e2e.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/test_telemetry.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_asyn.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_cache.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_catalog.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_client.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_config.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_dataset.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_listing.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_metastore.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_query.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_query_params.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_serializer.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_session.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_storage.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_utils.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.6.2 → datachain-0.6.3}/tests/utils.py +0 -0
|
@@ -101,6 +101,6 @@ jobs:
|
|
|
101
101
|
pytest
|
|
102
102
|
--config-file=pyproject.toml -rs
|
|
103
103
|
--splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
|
|
104
|
-
-
|
|
104
|
+
--benchmark-skip
|
|
105
105
|
tests ../datachain/tests
|
|
106
106
|
working-directory: backend/datachain_server
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.6.2
|
|
3
|
+
Version: 0.6.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -82,7 +82,7 @@ Requires-Dist: requests-mock; extra == "tests"
|
|
|
82
82
|
Requires-Dist: scipy; extra == "tests"
|
|
83
83
|
Provides-Extra: dev
|
|
84
84
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
85
|
-
Requires-Dist: mypy==1.
|
|
85
|
+
Requires-Dist: mypy==1.13.0; extra == "dev"
|
|
86
86
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
87
87
|
Requires-Dist: types-pytz; extra == "dev"
|
|
88
88
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -94,7 +94,7 @@ tests = [
|
|
|
94
94
|
]
|
|
95
95
|
dev = [
|
|
96
96
|
"datachain[docs,tests]",
|
|
97
|
-
"mypy==1.
|
|
97
|
+
"mypy==1.13.0",
|
|
98
98
|
"types-python-dateutil",
|
|
99
99
|
"types-pytz",
|
|
100
100
|
"types-PyYAML",
|
|
@@ -127,9 +127,8 @@ namespaces = false
|
|
|
127
127
|
[tool.setuptools_scm]
|
|
128
128
|
|
|
129
129
|
[tool.pytest.ini_options]
|
|
130
|
-
addopts = "-rfEs -m 'not
|
|
130
|
+
addopts = "-rfEs -m 'not examples' --benchmark-skip"
|
|
131
131
|
markers = [
|
|
132
|
-
"benchmark: benchmarks.",
|
|
133
132
|
"e2e: End-to-end tests",
|
|
134
133
|
"examples: All examples",
|
|
135
134
|
"computer_vision: Computer vision examples",
|
|
@@ -214,6 +213,7 @@ ignore = [
|
|
|
214
213
|
select = [
|
|
215
214
|
"B", # flake8-bugbear
|
|
216
215
|
"C4", # flake8-comprehensions
|
|
216
|
+
"C420", # unnecessary-dict-comprehension-for-iterable
|
|
217
217
|
"C90", # mccabe
|
|
218
218
|
"W", # pycodestyle - Warning
|
|
219
219
|
"E", # pycodestyle - Error
|
|
@@ -252,11 +252,10 @@ select = [
|
|
|
252
252
|
"NPY", # numpy
|
|
253
253
|
"TRY004", # type-check-without-type-error
|
|
254
254
|
"TRY201", # verbose-raise
|
|
255
|
-
"
|
|
255
|
+
"TRY203", # useless-try-except
|
|
256
256
|
"TRY401", # verbose-log-message
|
|
257
257
|
"RUF022", # unsorted-dunder-all
|
|
258
258
|
"RUF023", # unsorted-dunder-slots
|
|
259
|
-
"RUF025", # unnecessary-dict-comprehension-for-iterable
|
|
260
259
|
"RUF027", # missing-f-string-syntax
|
|
261
260
|
"RUF030", # assert-with-print-message
|
|
262
261
|
"RUF101", # redirected-noqa
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import re
|
|
2
1
|
from collections.abc import Sequence
|
|
3
2
|
from tempfile import NamedTemporaryFile
|
|
4
3
|
from typing import TYPE_CHECKING, Any, Optional
|
|
@@ -13,6 +12,7 @@ from datachain.lib.file import ArrowRow, File
|
|
|
13
12
|
from datachain.lib.model_store import ModelStore
|
|
14
13
|
from datachain.lib.signal_schema import SignalSchema
|
|
15
14
|
from datachain.lib.udf import Generator
|
|
15
|
+
from datachain.lib.utils import normalize_col_names
|
|
16
16
|
|
|
17
17
|
if TYPE_CHECKING:
|
|
18
18
|
from datasets.features.features import Features
|
|
@@ -128,7 +128,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
|
|
|
128
128
|
signal_schema = _get_datachain_schema(schema)
|
|
129
129
|
if signal_schema:
|
|
130
130
|
return signal_schema.values
|
|
131
|
-
columns =
|
|
131
|
+
columns = list(normalize_col_names(col_names).keys()) # type: ignore[arg-type]
|
|
132
132
|
hf_schema = _get_hf_schema(schema)
|
|
133
133
|
if hf_schema:
|
|
134
134
|
return {
|
|
@@ -143,19 +143,6 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
|
|
|
143
143
|
return output
|
|
144
144
|
|
|
145
145
|
|
|
146
|
-
def _convert_col_names(col_names: Sequence[str]) -> list[str]:
|
|
147
|
-
default_column = 0
|
|
148
|
-
converted_col_names = []
|
|
149
|
-
for column in col_names:
|
|
150
|
-
column = column.lower()
|
|
151
|
-
column = re.sub("[^0-9a-z_]+", "", column)
|
|
152
|
-
if not column:
|
|
153
|
-
column = f"c{default_column}"
|
|
154
|
-
default_column += 1
|
|
155
|
-
converted_col_names.append(column)
|
|
156
|
-
return converted_col_names
|
|
157
|
-
|
|
158
|
-
|
|
159
146
|
def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa: PLR0911
|
|
160
147
|
"""Convert pyarrow types to basic types."""
|
|
161
148
|
from datetime import datetime
|
|
@@ -2,9 +2,10 @@ from collections.abc import Sequence
|
|
|
2
2
|
from datetime import datetime
|
|
3
3
|
from typing import ClassVar, Union, get_args, get_origin
|
|
4
4
|
|
|
5
|
-
from pydantic import BaseModel, create_model
|
|
5
|
+
from pydantic import BaseModel, Field, create_model
|
|
6
6
|
|
|
7
7
|
from datachain.lib.model_store import ModelStore
|
|
8
|
+
from datachain.lib.utils import normalize_col_names
|
|
8
9
|
|
|
9
10
|
StandardType = Union[
|
|
10
11
|
type[int],
|
|
@@ -60,7 +61,14 @@ def is_chain_type(t: type) -> bool:
|
|
|
60
61
|
|
|
61
62
|
|
|
62
63
|
def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
|
|
63
|
-
|
|
64
|
+
# Gets a map of normalized_name -> original_name
|
|
65
|
+
columns = normalize_col_names(list(data_dict.keys()))
|
|
66
|
+
# We reverse it for convenience to get original_name -> normalized_name
|
|
67
|
+
columns = {v: k for k, v in columns.items()}
|
|
68
|
+
|
|
69
|
+
fields = {
|
|
70
|
+
columns[name]: (anno, Field(alias=name)) for name, anno in data_dict.items()
|
|
71
|
+
}
|
|
64
72
|
return create_model(
|
|
65
73
|
name,
|
|
66
74
|
__base__=(DataModel,), # type: ignore[call-overload]
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class AbstractUDF(ABC):
|
|
7
|
+
@abstractmethod
|
|
8
|
+
def process(self, *args, **kwargs):
|
|
9
|
+
pass
|
|
10
|
+
|
|
11
|
+
@abstractmethod
|
|
12
|
+
def setup(self):
|
|
13
|
+
pass
|
|
14
|
+
|
|
15
|
+
@abstractmethod
|
|
16
|
+
def teardown(self):
|
|
17
|
+
pass
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DataChainError(Exception):
|
|
21
|
+
def __init__(self, message):
|
|
22
|
+
super().__init__(message)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DataChainParamsError(DataChainError):
|
|
26
|
+
def __init__(self, message):
|
|
27
|
+
super().__init__(message)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DataChainColumnError(DataChainParamsError):
|
|
31
|
+
def __init__(self, col_name, msg):
|
|
32
|
+
super().__init__(f"Error for column {col_name}: {msg}")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
|
|
36
|
+
gen_col_counter = 0
|
|
37
|
+
new_col_names = {}
|
|
38
|
+
org_col_names = set(col_names)
|
|
39
|
+
|
|
40
|
+
for org_column in col_names:
|
|
41
|
+
new_column = org_column.lower()
|
|
42
|
+
new_column = re.sub("[^0-9a-z]+", "_", new_column)
|
|
43
|
+
new_column = new_column.strip("_")
|
|
44
|
+
|
|
45
|
+
generated_column = new_column
|
|
46
|
+
|
|
47
|
+
while (
|
|
48
|
+
not generated_column.isidentifier()
|
|
49
|
+
or generated_column in new_col_names
|
|
50
|
+
or (generated_column != org_column and generated_column in org_col_names)
|
|
51
|
+
):
|
|
52
|
+
if new_column:
|
|
53
|
+
generated_column = f"c{gen_col_counter}_{new_column}"
|
|
54
|
+
else:
|
|
55
|
+
generated_column = f"c{gen_col_counter}"
|
|
56
|
+
gen_col_counter += 1
|
|
57
|
+
|
|
58
|
+
new_col_names[generated_column] = org_column
|
|
59
|
+
|
|
60
|
+
return new_col_names
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.6.2
|
|
3
|
+
Version: 0.6.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -82,7 +82,7 @@ Requires-Dist: requests-mock; extra == "tests"
|
|
|
82
82
|
Requires-Dist: scipy; extra == "tests"
|
|
83
83
|
Provides-Extra: dev
|
|
84
84
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
85
|
-
Requires-Dist: mypy==1.
|
|
85
|
+
Requires-Dist: mypy==1.13.0; extra == "dev"
|
|
86
86
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
87
87
|
Requires-Dist: types-pytz; extra == "dev"
|
|
88
88
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -1,10 +1,7 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
|
|
3
1
|
from datachain.lib.dc import DataChain
|
|
4
2
|
from datachain.lib.webdataset_laion import process_laion_meta
|
|
5
3
|
|
|
6
4
|
|
|
7
|
-
@pytest.mark.benchmark
|
|
8
5
|
def test_datachain(tmp_dir, test_session, datasets, benchmark):
|
|
9
6
|
def run_script(uri, **kwargs):
|
|
10
7
|
DataChain.from_storage(uri, session=test_session, **kwargs).gen(
|
|
@@ -168,13 +168,21 @@ def test_parquet_convert_column_names():
|
|
|
168
168
|
("dot.notation.col", pa.int32()),
|
|
169
169
|
("with-dashes", pa.int32()),
|
|
170
170
|
("with spaces", pa.int32()),
|
|
171
|
+
("with-multiple--dashes", pa.int32()),
|
|
172
|
+
("with__underscores", pa.int32()),
|
|
173
|
+
("__leading__underscores", pa.int32()),
|
|
174
|
+
("trailing__underscores__", pa.int32()),
|
|
171
175
|
]
|
|
172
176
|
)
|
|
173
177
|
assert list(schema_to_output(schema)) == [
|
|
174
178
|
"uppercasecol",
|
|
175
|
-
"
|
|
176
|
-
"
|
|
177
|
-
"
|
|
179
|
+
"dot_notation_col",
|
|
180
|
+
"with_dashes",
|
|
181
|
+
"with_spaces",
|
|
182
|
+
"with_multiple_dashes",
|
|
183
|
+
"with_underscores",
|
|
184
|
+
"leading_underscores",
|
|
185
|
+
"trailing_underscores",
|
|
178
186
|
]
|
|
179
187
|
|
|
180
188
|
|
|
@@ -36,6 +36,18 @@ DF_DATA = {
|
|
|
36
36
|
"city": ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix"],
|
|
37
37
|
}
|
|
38
38
|
|
|
39
|
+
DF_DATA_NESTED_NOT_NORMALIZED = {
|
|
40
|
+
"nAmE": [
|
|
41
|
+
{"first-SELECT": "Alice", "l--as@t": "Smith"},
|
|
42
|
+
{"l--as@t": "Jones", "first-SELECT": "Bob"},
|
|
43
|
+
{"first-SELECT": "Charlie", "l--as@t": "Brown"},
|
|
44
|
+
{"first-SELECT": "David", "l--as@t": "White"},
|
|
45
|
+
{"first-SELECT": "Eva", "l--as@t": "Black"},
|
|
46
|
+
],
|
|
47
|
+
"AgE": [25, 30, 35, 40, 45],
|
|
48
|
+
"citY": ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix"],
|
|
49
|
+
}
|
|
50
|
+
|
|
39
51
|
DF_OTHER_DATA = {
|
|
40
52
|
"last_name": ["Smith", "Jones"],
|
|
41
53
|
"country": ["USA", "Russia"],
|
|
@@ -272,7 +284,9 @@ def test_listings(test_session, tmp_dir):
|
|
|
272
284
|
assert listing.expires
|
|
273
285
|
assert listing.version == 1
|
|
274
286
|
assert listing.num_objects == 1
|
|
275
|
-
|
|
287
|
+
# Exact number is unreliable here since it depends on the PyArrow version
|
|
288
|
+
assert listing.size > 1000
|
|
289
|
+
assert listing.size < 5000
|
|
276
290
|
assert listing.status == 4
|
|
277
291
|
|
|
278
292
|
|
|
@@ -988,6 +1002,25 @@ def test_parse_tabular_format(tmp_dir, test_session):
|
|
|
988
1002
|
assert df1.equals(df)
|
|
989
1003
|
|
|
990
1004
|
|
|
1005
|
+
def test_parse_nested_json(tmp_dir, test_session):
|
|
1006
|
+
df = pd.DataFrame(DF_DATA_NESTED_NOT_NORMALIZED)
|
|
1007
|
+
path = tmp_dir / "test.jsonl"
|
|
1008
|
+
path.write_text(df.to_json(orient="records", lines=True))
|
|
1009
|
+
dc = DataChain.from_storage(path.as_uri(), session=test_session).parse_tabular(
|
|
1010
|
+
format="json"
|
|
1011
|
+
)
|
|
1012
|
+
# Field names are normalized, values are preserved
|
|
1013
|
+
# E.g. nAmE -> name, l--as@t -> l_as_t, etc
|
|
1014
|
+
df1 = dc.select("name", "age", "city").to_pandas()
|
|
1015
|
+
|
|
1016
|
+
assert df1["name"]["first_select"].to_list() == [
|
|
1017
|
+
d["first-SELECT"] for d in df["nAmE"].to_list()
|
|
1018
|
+
]
|
|
1019
|
+
assert df1["name"]["l_as_t"].to_list() == [
|
|
1020
|
+
d["l--as@t"] for d in df["nAmE"].to_list()
|
|
1021
|
+
]
|
|
1022
|
+
|
|
1023
|
+
|
|
991
1024
|
def test_parse_tabular_partitions(tmp_dir, test_session):
|
|
992
1025
|
df = pd.DataFrame(DF_DATA)
|
|
993
1026
|
path = tmp_dir / "test.parquet"
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
from collections.abc import Iterable, Mapping
|
|
2
|
+
from typing import Literal, Optional, Union
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
from datachain.lib.convert.python_to_sql import python_to_sql
|
|
8
|
+
from datachain.lib.utils import normalize_col_names
|
|
9
|
+
from datachain.sql.types import JSON, Array, String
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MyModel(BaseModel):
|
|
13
|
+
val1: str
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MyFeature(BaseModel):
|
|
17
|
+
val1: str
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@pytest.mark.parametrize(
|
|
21
|
+
"typ,expected",
|
|
22
|
+
(
|
|
23
|
+
(str, String),
|
|
24
|
+
(String, String),
|
|
25
|
+
(Literal["text"], String),
|
|
26
|
+
(dict[str, int], JSON),
|
|
27
|
+
(Mapping[str, int], JSON),
|
|
28
|
+
(Optional[str], String),
|
|
29
|
+
(Union[dict, list[dict]], JSON),
|
|
30
|
+
),
|
|
31
|
+
)
|
|
32
|
+
def test_convert_type_to_datachain(typ, expected):
|
|
33
|
+
assert python_to_sql(typ) == expected
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@pytest.mark.parametrize(
|
|
37
|
+
"typ,expected",
|
|
38
|
+
(
|
|
39
|
+
(list[str], Array(String())),
|
|
40
|
+
(Iterable[str], Array(String())),
|
|
41
|
+
(list[list[str]], Array(Array(String()))),
|
|
42
|
+
),
|
|
43
|
+
)
|
|
44
|
+
def test_convert_type_to_datachain_array(typ, expected):
|
|
45
|
+
assert python_to_sql(typ).to_dict() == expected.to_dict()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@pytest.mark.parametrize(
|
|
49
|
+
"typ",
|
|
50
|
+
(
|
|
51
|
+
Union[str, int],
|
|
52
|
+
list[Union[str, int]],
|
|
53
|
+
MyFeature,
|
|
54
|
+
MyModel,
|
|
55
|
+
),
|
|
56
|
+
)
|
|
57
|
+
def test_convert_type_to_datachain_error(typ):
|
|
58
|
+
with pytest.raises(TypeError):
|
|
59
|
+
python_to_sql(typ)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_normalize_column_names():
|
|
63
|
+
res = normalize_col_names(
|
|
64
|
+
[
|
|
65
|
+
"UpperCase",
|
|
66
|
+
"_underscore_start",
|
|
67
|
+
"double__underscore",
|
|
68
|
+
"1start_with_number",
|
|
69
|
+
"не_ascii_start",
|
|
70
|
+
" space_start",
|
|
71
|
+
"space_end ",
|
|
72
|
+
"dash-end-",
|
|
73
|
+
"-dash-start",
|
|
74
|
+
"--multiple--dash--",
|
|
75
|
+
"-_ mix_ -dash_ -",
|
|
76
|
+
"__2digit_after_uderscore",
|
|
77
|
+
"",
|
|
78
|
+
"_-_- _---_ _",
|
|
79
|
+
"_-_- _---_ _1",
|
|
80
|
+
]
|
|
81
|
+
)
|
|
82
|
+
assert list(res.keys()) == [
|
|
83
|
+
"uppercase",
|
|
84
|
+
"underscore_start",
|
|
85
|
+
"double_underscore",
|
|
86
|
+
"c0_1start_with_number",
|
|
87
|
+
"ascii_start",
|
|
88
|
+
"space_start",
|
|
89
|
+
"space_end",
|
|
90
|
+
"dash_end",
|
|
91
|
+
"dash_start",
|
|
92
|
+
"multiple_dash",
|
|
93
|
+
"mix_dash",
|
|
94
|
+
"c1_2digit_after_uderscore",
|
|
95
|
+
"c2",
|
|
96
|
+
"c3",
|
|
97
|
+
"c4_1",
|
|
98
|
+
]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def test_normalize_column_names_case_repeat():
|
|
102
|
+
res = normalize_col_names(["UpperCase", "UpPerCase"])
|
|
103
|
+
|
|
104
|
+
assert list(res.keys()) == ["uppercase", "c0_uppercase"]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def test_normalize_column_names_exists_after_normalize():
|
|
108
|
+
res = normalize_col_names(["1digit", "c0_1digit"])
|
|
109
|
+
|
|
110
|
+
assert list(res.keys()) == ["c1_1digit", "c0_1digit"]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def test_normalize_column_names_normalized_repeat():
|
|
114
|
+
res = normalize_col_names(["column", "_column"])
|
|
115
|
+
|
|
116
|
+
assert list(res.keys()) == ["column", "c0_column"]
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_normalize_column_names_normalized_case_repeat():
|
|
120
|
+
res = normalize_col_names(["CoLuMn", "_column"])
|
|
121
|
+
|
|
122
|
+
assert res == {"column": "CoLuMn", "c0_column": "_column"}
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def test_normalize_column_names_repeat_generated_after_normalize():
|
|
126
|
+
res = normalize_col_names(["c0_CoLuMn", "_column", "column"])
|
|
127
|
+
|
|
128
|
+
assert res == {"c0_column": "c0_CoLuMn", "c1_column": "_column", "column": "column"}
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class AbstractUDF(ABC):
|
|
5
|
-
@abstractmethod
|
|
6
|
-
def process(self, *args, **kwargs):
|
|
7
|
-
pass
|
|
8
|
-
|
|
9
|
-
@abstractmethod
|
|
10
|
-
def setup(self):
|
|
11
|
-
pass
|
|
12
|
-
|
|
13
|
-
@abstractmethod
|
|
14
|
-
def teardown(self):
|
|
15
|
-
pass
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class DataChainError(Exception):
|
|
19
|
-
def __init__(self, message):
|
|
20
|
-
super().__init__(message)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class DataChainParamsError(DataChainError):
|
|
24
|
-
def __init__(self, message):
|
|
25
|
-
super().__init__(message)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class DataChainColumnError(DataChainParamsError):
|
|
29
|
-
def __init__(self, col_name, msg):
|
|
30
|
-
super().__init__(f"Error for column {col_name}: {msg}")
|
|
@@ -1,137 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import shutil
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from subprocess import check_output
|
|
5
|
-
|
|
6
|
-
import pytest
|
|
7
|
-
import virtualenv
|
|
8
|
-
from dulwich.porcelain import clone
|
|
9
|
-
from packaging import version
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@pytest.fixture
|
|
13
|
-
def bucket():
|
|
14
|
-
return "s3://noaa-bathymetry-pds/"
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def pytest_generate_tests(metafunc):
|
|
18
|
-
str_revs = metafunc.config.getoption("--datachain-revs")
|
|
19
|
-
revs = str_revs.split(",") if str_revs else [None]
|
|
20
|
-
if "datachain_rev" in metafunc.fixturenames:
|
|
21
|
-
metafunc.parametrize("datachain_rev", revs, scope="session")
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class VirtualEnv:
|
|
25
|
-
def __init__(self, path) -> None:
|
|
26
|
-
self.path = path
|
|
27
|
-
self.bin = self.path / ("Scripts" if os.name == "nt" else "bin")
|
|
28
|
-
|
|
29
|
-
def create(self) -> None:
|
|
30
|
-
virtualenv.cli_run([os.fspath(self.path)])
|
|
31
|
-
|
|
32
|
-
def run(self, cmd: str, *args: str, env=None) -> None:
|
|
33
|
-
exe = self.which(cmd)
|
|
34
|
-
check_output([exe, *args], env=env) # noqa: S603
|
|
35
|
-
|
|
36
|
-
def which(self, cmd: str) -> str:
|
|
37
|
-
assert self.bin.exists()
|
|
38
|
-
return shutil.which(cmd, path=self.bin) or cmd
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@pytest.fixture(scope="session", name="make_datachain_venv")
|
|
42
|
-
def fixture_make_datachain_venv(tmp_path_factory):
|
|
43
|
-
def _make_datachain_venv(name):
|
|
44
|
-
venv_dir = tmp_path_factory.mktemp(f"datachain-venv-{name}")
|
|
45
|
-
venv = VirtualEnv(venv_dir)
|
|
46
|
-
venv.create()
|
|
47
|
-
return venv
|
|
48
|
-
|
|
49
|
-
return _make_datachain_venv
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
@pytest.fixture(scope="session", name="datachain_venvs")
|
|
53
|
-
def fixture_datachain_venvs():
|
|
54
|
-
return {}
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
@pytest.fixture(scope="session", name="datachain_git_repo")
|
|
58
|
-
def fixture_datachain_git_repo(tmp_path_factory, test_config):
|
|
59
|
-
url = test_config.datachain_git_repo
|
|
60
|
-
|
|
61
|
-
if os.path.isdir(url):
|
|
62
|
-
return url
|
|
63
|
-
|
|
64
|
-
tmp_path = os.fspath(tmp_path_factory.mktemp("datachain-git-repo"))
|
|
65
|
-
clone(url, tmp_path)
|
|
66
|
-
|
|
67
|
-
return tmp_path
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
@pytest.fixture(scope="session", name="datachain_bin")
|
|
71
|
-
def fixture_datachain_bin(
|
|
72
|
-
datachain_rev,
|
|
73
|
-
datachain_venvs,
|
|
74
|
-
make_datachain_venv,
|
|
75
|
-
datachain_git_repo,
|
|
76
|
-
test_config,
|
|
77
|
-
):
|
|
78
|
-
if datachain_rev:
|
|
79
|
-
venv = datachain_venvs.get(datachain_rev)
|
|
80
|
-
if not venv:
|
|
81
|
-
venv = make_datachain_venv(datachain_rev)
|
|
82
|
-
venv.run("pip", "install", "-U", "pip")
|
|
83
|
-
venv.run(
|
|
84
|
-
"pip", "install", f"git+file://{datachain_git_repo}@{datachain_rev}"
|
|
85
|
-
)
|
|
86
|
-
datachain_venvs[datachain_rev] = venv
|
|
87
|
-
datachain_bin = venv.which("datachain")
|
|
88
|
-
else:
|
|
89
|
-
datachain_bin = test_config.datachain_bin
|
|
90
|
-
|
|
91
|
-
def _datachain_bin(*args):
|
|
92
|
-
return check_output([datachain_bin, *args], text=True) # noqa: S603
|
|
93
|
-
|
|
94
|
-
actual = version.parse(_datachain_bin("--version"))
|
|
95
|
-
_datachain_bin.version = (actual.major, actual.minor, actual.micro)
|
|
96
|
-
|
|
97
|
-
return _datachain_bin
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
@pytest.fixture(scope="function", name="make_bench")
|
|
101
|
-
def fixture_make_bench(request):
|
|
102
|
-
def _make_bench(name):
|
|
103
|
-
import pytest_benchmark.plugin
|
|
104
|
-
|
|
105
|
-
# hack from https://github.com/ionelmc/pytest-benchmark/issues/166
|
|
106
|
-
bench = pytest_benchmark.plugin.benchmark.__pytest_wrapped__.obj(request)
|
|
107
|
-
|
|
108
|
-
suffix = f"-{name}"
|
|
109
|
-
|
|
110
|
-
def add_suffix(_name):
|
|
111
|
-
start, sep, end = _name.partition("[")
|
|
112
|
-
return start + suffix + sep + end
|
|
113
|
-
|
|
114
|
-
bench.name = add_suffix(bench.name)
|
|
115
|
-
bench.fullname = add_suffix(bench.fullname)
|
|
116
|
-
|
|
117
|
-
return bench
|
|
118
|
-
|
|
119
|
-
return _make_bench
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
@pytest.fixture(
|
|
123
|
-
scope="function", params=[pytest.param(None, marks=pytest.mark.benchmark)]
|
|
124
|
-
)
|
|
125
|
-
def bench_datachain(datachain_bin, make_bench):
|
|
126
|
-
def _bench_datachain(*args, **kwargs):
|
|
127
|
-
name = kwargs.pop("name", None)
|
|
128
|
-
name = f"-{name}" if name else ""
|
|
129
|
-
bench = make_bench(args[0] + name)
|
|
130
|
-
return bench.pedantic(datachain_bin, args=args, **kwargs)
|
|
131
|
-
|
|
132
|
-
return _bench_datachain
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
@pytest.fixture
|
|
136
|
-
def datasets():
|
|
137
|
-
return Path(__file__).parent / "datasets"
|