datachain 0.6.0__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.6.0/src/datachain.egg-info → datachain-0.6.1}/PKG-INFO +2 -2
- {datachain-0.6.0 → datachain-0.6.1}/pyproject.toml +1 -1
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/__init__.py +2 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/catalog/catalog.py +1 -9
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/sqlite.py +8 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/warehouse.py +0 -4
- datachain-0.6.1/src/datachain/lib/convert/sql_to_python.py +14 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/dc.py +64 -28
- datachain-0.6.1/src/datachain/lib/func/__init__.py +14 -0
- datachain-0.6.1/src/datachain/lib/func/aggregate.py +42 -0
- datachain-0.6.1/src/datachain/lib/func/func.py +64 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/signal_schema.py +9 -3
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/utils.py +5 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/__init__.py +1 -2
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/batch.py +0 -1
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/dataset.py +22 -43
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/schema.py +1 -61
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/session.py +33 -25
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/__init__.py +1 -1
- datachain-0.6.1/src/datachain/sql/functions/aggregate.py +47 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/array.py +0 -8
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/sqlite/base.py +20 -2
- {datachain-0.6.0 → datachain-0.6.1/src/datachain.egg-info}/PKG-INFO +2 -2
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain.egg-info/SOURCES.txt +4 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_datachain.py +61 -8
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_dataset_query.py +0 -34
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_datasets.py +33 -0
- datachain-0.6.1/tests/scripts/feature_class_exception.py +34 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/test_atomicity.py +10 -4
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_datachain.py +350 -19
- datachain-0.6.1/tests/unit/lib/test_sql_to_python.py +25 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/utils.py +20 -0
- datachain-0.6.0/src/datachain/lib/convert/sql_to_python.py +0 -18
- datachain-0.6.0/tests/scripts/feature_class_exception.py +0 -11
- datachain-0.6.0/tests/unit/lib/test_sql_to_python.py +0 -28
- {datachain-0.6.0 → datachain-0.6.1}/.cruft.json +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.gitattributes +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.github/codecov.yaml +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.github/dependabot.yml +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.github/workflows/release.yml +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.github/workflows/tests.yml +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.gitignore +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/.pre-commit-config.yaml +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/CONTRIBUTING.rst +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/LICENSE +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/README.rst +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/assets/flowchart.png +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/index.md +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/references/datachain.md +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/references/datatype.md +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/references/file.md +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/references/index.md +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/references/sql.md +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/references/torch.md +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/docs/references/udf.md +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/mkdocs.yml +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/noxfile.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/overrides/main.html +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/setup.cfg +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/__main__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/asyn.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/cache.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/cli.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/cli_utils.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/local.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/config.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/dataset.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/error.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/job.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/file.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/listing.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/udf.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/listing.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/node.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/progress.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/py.typed +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/params.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/storage.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain/utils.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/conftest.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/data.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/examples/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_client.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_listing.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_ls.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_pull.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_query.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/test_telemetry.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_client.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_query.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_session.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_storage.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_warehouse.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -81,7 +81,7 @@ Requires-Dist: requests-mock; extra == "tests"
|
|
|
81
81
|
Requires-Dist: scipy; extra == "tests"
|
|
82
82
|
Provides-Extra: dev
|
|
83
83
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
84
|
-
Requires-Dist: mypy==1.
|
|
84
|
+
Requires-Dist: mypy==1.12.0; extra == "dev"
|
|
85
85
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
86
86
|
Requires-Dist: types-pytz; extra == "dev"
|
|
87
87
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from datachain.lib import func
|
|
1
2
|
from datachain.lib.data_model import DataModel, DataType, is_chain_type
|
|
2
3
|
from datachain.lib.dc import C, Column, DataChain, Sys
|
|
3
4
|
from datachain.lib.file import (
|
|
@@ -34,6 +35,7 @@ __all__ = [
|
|
|
34
35
|
"Sys",
|
|
35
36
|
"TarVFile",
|
|
36
37
|
"TextFile",
|
|
38
|
+
"func",
|
|
37
39
|
"is_chain_type",
|
|
38
40
|
"metrics",
|
|
39
41
|
"param",
|
|
@@ -989,13 +989,6 @@ class Catalog:
|
|
|
989
989
|
c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
|
|
990
990
|
}
|
|
991
991
|
|
|
992
|
-
job_id = job_id or os.getenv("DATACHAIN_JOB_ID")
|
|
993
|
-
if not job_id:
|
|
994
|
-
from datachain.query.session import Session
|
|
995
|
-
|
|
996
|
-
session = Session.get(catalog=self)
|
|
997
|
-
job_id = session.job_id
|
|
998
|
-
|
|
999
992
|
dataset = self.metastore.create_dataset_version(
|
|
1000
993
|
dataset,
|
|
1001
994
|
version,
|
|
@@ -1218,6 +1211,7 @@ class Catalog:
|
|
|
1218
1211
|
preview=dataset_version.preview,
|
|
1219
1212
|
job_id=dataset_version.job_id,
|
|
1220
1213
|
)
|
|
1214
|
+
|
|
1221
1215
|
# to avoid re-creating rows table, we are just renaming it for a new version
|
|
1222
1216
|
# of target dataset
|
|
1223
1217
|
self.warehouse.rename_dataset_table(
|
|
@@ -1325,8 +1319,6 @@ class Catalog:
|
|
|
1325
1319
|
if offset:
|
|
1326
1320
|
q = q.offset(offset)
|
|
1327
1321
|
|
|
1328
|
-
q = q.order_by("sys__id")
|
|
1329
|
-
|
|
1330
1322
|
return q.to_db_records()
|
|
1331
1323
|
|
|
1332
1324
|
def signed_url(self, source: str, path: str, client_config=None) -> str:
|
|
@@ -763,6 +763,14 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
763
763
|
query: Select,
|
|
764
764
|
progress_cb: Optional[Callable[[int], None]] = None,
|
|
765
765
|
) -> None:
|
|
766
|
+
if len(query._group_by_clause) > 0:
|
|
767
|
+
select_q = query.with_only_columns(
|
|
768
|
+
*[c for c in query.selected_columns if c.name != "sys__id"]
|
|
769
|
+
)
|
|
770
|
+
q = table.insert().from_select(list(select_q.selected_columns), select_q)
|
|
771
|
+
self.db.execute(q)
|
|
772
|
+
return
|
|
773
|
+
|
|
766
774
|
if "sys__id" in query.selected_columns:
|
|
767
775
|
col_id = query.selected_columns.sys__id
|
|
768
776
|
else:
|
|
@@ -215,10 +215,6 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
215
215
|
limit = query._limit
|
|
216
216
|
paginated_query = query.limit(page_size)
|
|
217
217
|
|
|
218
|
-
if not paginated_query._order_by_clauses:
|
|
219
|
-
# default order by is order by `sys__id`
|
|
220
|
-
paginated_query = paginated_query.order_by(query.selected_columns.sys__id)
|
|
221
|
-
|
|
222
218
|
results = None
|
|
223
219
|
offset = 0
|
|
224
220
|
num_yielded = 0
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from decimal import Decimal
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from sqlalchemy import ColumnElement
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def sql_to_python(sql_exp: ColumnElement) -> Any:
|
|
8
|
+
try:
|
|
9
|
+
type_ = sql_exp.type.python_type
|
|
10
|
+
if type_ == Decimal:
|
|
11
|
+
type_ = float
|
|
12
|
+
except NotImplementedError:
|
|
13
|
+
type_ = str
|
|
14
|
+
return type_
|
|
@@ -29,6 +29,7 @@ from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
|
|
|
29
29
|
from datachain.lib.dataset_info import DatasetInfo
|
|
30
30
|
from datachain.lib.file import ArrowRow, File, get_file_type
|
|
31
31
|
from datachain.lib.file import ExportPlacement as FileExportPlacement
|
|
32
|
+
from datachain.lib.func import Func
|
|
32
33
|
from datachain.lib.listing import (
|
|
33
34
|
is_listing_dataset,
|
|
34
35
|
is_listing_expired,
|
|
@@ -42,21 +43,12 @@ from datachain.lib.meta_formats import read_meta, read_schema
|
|
|
42
43
|
from datachain.lib.model_store import ModelStore
|
|
43
44
|
from datachain.lib.settings import Settings
|
|
44
45
|
from datachain.lib.signal_schema import SignalSchema
|
|
45
|
-
from datachain.lib.udf import (
|
|
46
|
-
Aggregator,
|
|
47
|
-
BatchMapper,
|
|
48
|
-
Generator,
|
|
49
|
-
Mapper,
|
|
50
|
-
UDFBase,
|
|
51
|
-
)
|
|
46
|
+
from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
|
|
52
47
|
from datachain.lib.udf_signature import UdfSignature
|
|
53
|
-
from datachain.lib.utils import DataChainParamsError
|
|
48
|
+
from datachain.lib.utils import DataChainColumnError, DataChainParamsError
|
|
54
49
|
from datachain.query import Session
|
|
55
|
-
from datachain.query.dataset import (
|
|
56
|
-
|
|
57
|
-
PartitionByType,
|
|
58
|
-
)
|
|
59
|
-
from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
|
|
50
|
+
from datachain.query.dataset import DatasetQuery, PartitionByType
|
|
51
|
+
from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
|
|
60
52
|
from datachain.sql.functions import path as pathfunc
|
|
61
53
|
from datachain.telemetry import telemetry
|
|
62
54
|
from datachain.utils import batched_it, inside_notebook
|
|
@@ -149,11 +141,6 @@ class DatasetMergeError(DataChainParamsError): # noqa: D101
|
|
|
149
141
|
super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
|
|
150
142
|
|
|
151
143
|
|
|
152
|
-
class DataChainColumnError(DataChainParamsError): # noqa: D101
|
|
153
|
-
def __init__(self, col_name, msg): # noqa: D107
|
|
154
|
-
super().__init__(f"Error for column {col_name}: {msg}")
|
|
155
|
-
|
|
156
|
-
|
|
157
144
|
OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]
|
|
158
145
|
|
|
159
146
|
|
|
@@ -982,10 +969,9 @@ class DataChain:
|
|
|
982
969
|
row is left in the result set.
|
|
983
970
|
|
|
984
971
|
Example:
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
```
|
|
972
|
+
```py
|
|
973
|
+
dc.distinct("file.parent", "file.name")
|
|
974
|
+
```
|
|
989
975
|
"""
|
|
990
976
|
return self._evolve(
|
|
991
977
|
query=self._query.distinct(
|
|
@@ -1011,6 +997,60 @@ class DataChain:
|
|
|
1011
997
|
query=self._query.select(*columns), signal_schema=new_schema
|
|
1012
998
|
)
|
|
1013
999
|
|
|
1000
|
+
def group_by(
|
|
1001
|
+
self,
|
|
1002
|
+
*,
|
|
1003
|
+
partition_by: Union[str, Sequence[str]],
|
|
1004
|
+
**kwargs: Func,
|
|
1005
|
+
) -> "Self":
|
|
1006
|
+
"""Group rows by specified set of signals and return new signals
|
|
1007
|
+
with aggregated values.
|
|
1008
|
+
|
|
1009
|
+
Example:
|
|
1010
|
+
```py
|
|
1011
|
+
chain = chain.group_by(
|
|
1012
|
+
cnt=func.count(),
|
|
1013
|
+
partition_by=("file_source", "file_ext"),
|
|
1014
|
+
)
|
|
1015
|
+
```
|
|
1016
|
+
"""
|
|
1017
|
+
if isinstance(partition_by, str):
|
|
1018
|
+
partition_by = [partition_by]
|
|
1019
|
+
if not partition_by:
|
|
1020
|
+
raise ValueError("At least one column should be provided for partition_by")
|
|
1021
|
+
|
|
1022
|
+
if not kwargs:
|
|
1023
|
+
raise ValueError("At least one column should be provided for group_by")
|
|
1024
|
+
for col_name, func in kwargs.items():
|
|
1025
|
+
if not isinstance(func, Func):
|
|
1026
|
+
raise DataChainColumnError(
|
|
1027
|
+
col_name,
|
|
1028
|
+
f"Column {col_name} has type {type(func)} but expected Func object",
|
|
1029
|
+
)
|
|
1030
|
+
|
|
1031
|
+
partition_by_columns: list[Column] = []
|
|
1032
|
+
signal_columns: list[Column] = []
|
|
1033
|
+
schema_fields: dict[str, DataType] = {}
|
|
1034
|
+
|
|
1035
|
+
# validate partition_by columns and add them to the schema
|
|
1036
|
+
for col_name in partition_by:
|
|
1037
|
+
col_db_name = ColumnMeta.to_db_name(col_name)
|
|
1038
|
+
col_type = self.signals_schema.get_column_type(col_db_name)
|
|
1039
|
+
col = Column(col_db_name, python_to_sql(col_type))
|
|
1040
|
+
partition_by_columns.append(col)
|
|
1041
|
+
schema_fields[col_db_name] = col_type
|
|
1042
|
+
|
|
1043
|
+
# validate signal columns and add them to the schema
|
|
1044
|
+
for col_name, func in kwargs.items():
|
|
1045
|
+
col = func.get_column(self.signals_schema, label=col_name)
|
|
1046
|
+
signal_columns.append(col)
|
|
1047
|
+
schema_fields[col_name] = func.get_result_type(self.signals_schema)
|
|
1048
|
+
|
|
1049
|
+
return self._evolve(
|
|
1050
|
+
query=self._query.group_by(signal_columns, partition_by_columns),
|
|
1051
|
+
signal_schema=SignalSchema(schema_fields),
|
|
1052
|
+
)
|
|
1053
|
+
|
|
1014
1054
|
def mutate(self, **kwargs) -> "Self":
|
|
1015
1055
|
"""Create new signals based on existing signals.
|
|
1016
1056
|
|
|
@@ -1477,12 +1517,6 @@ class DataChain:
|
|
|
1477
1517
|
fr_map = {col.lower(): df[col].tolist() for col in df.columns}
|
|
1478
1518
|
|
|
1479
1519
|
for column in fr_map:
|
|
1480
|
-
if column in DatasetRow.schema:
|
|
1481
|
-
raise DatasetPrepareError(
|
|
1482
|
-
name,
|
|
1483
|
-
f"import from pandas error - column '{column}' conflicts with"
|
|
1484
|
-
" default schema",
|
|
1485
|
-
)
|
|
1486
1520
|
if not column.isidentifier():
|
|
1487
1521
|
raise DatasetPrepareError(
|
|
1488
1522
|
name,
|
|
@@ -1994,6 +2028,8 @@ class DataChain:
|
|
|
1994
2028
|
),
|
|
1995
2029
|
)
|
|
1996
2030
|
|
|
2031
|
+
session.add_dataset_version(dsr, dsr.latest_version)
|
|
2032
|
+
|
|
1997
2033
|
if isinstance(to_insert, dict):
|
|
1998
2034
|
to_insert = [to_insert]
|
|
1999
2035
|
elif not to_insert:
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import func as sa_func
|
|
4
|
+
|
|
5
|
+
from datachain.sql import functions as dc_func
|
|
6
|
+
|
|
7
|
+
from .func import Func
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def count(col: Optional[str] = None) -> Func:
|
|
11
|
+
return Func(inner=sa_func.count, col=col, result_type=int)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def sum(col: str) -> Func:
|
|
15
|
+
return Func(inner=sa_func.sum, col=col)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def avg(col: str) -> Func:
|
|
19
|
+
return Func(inner=dc_func.aggregate.avg, col=col)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def min(col: str) -> Func:
|
|
23
|
+
return Func(inner=sa_func.min, col=col)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def max(col: str) -> Func:
|
|
27
|
+
return Func(inner=sa_func.max, col=col)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def any_value(col: str) -> Func:
|
|
31
|
+
return Func(inner=dc_func.aggregate.any_value, col=col)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def collect(col: str) -> Func:
|
|
35
|
+
return Func(inner=dc_func.aggregate.collect, col=col, is_array=True)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def concat(col: str, separator="") -> Func:
|
|
39
|
+
def inner(arg):
|
|
40
|
+
return dc_func.aggregate.group_concat(arg, separator)
|
|
41
|
+
|
|
42
|
+
return Func(inner=inner, col=col, result_type=str)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Callable, Optional
|
|
2
|
+
|
|
3
|
+
from datachain.lib.convert.python_to_sql import python_to_sql
|
|
4
|
+
from datachain.lib.utils import DataChainColumnError
|
|
5
|
+
from datachain.query.schema import Column, ColumnMeta
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from datachain import DataType
|
|
9
|
+
from datachain.lib.signal_schema import SignalSchema
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Func:
    """Deferred application of a SQL function to an optional column.

    A ``Func`` stores a callable (``inner``) together with the name of the
    column it should be applied to. The actual SQL expression is only built
    in :meth:`get_column`, once a signal schema is available to resolve the
    column's type.
    """

    def __init__(
        self,
        inner: Callable,
        col: Optional[str] = None,
        result_type: Optional["DataType"] = None,
        is_array: bool = False,
    ) -> None:
        """Store the function and the column it applies to.

        Args:
            inner: Callable that builds the SQL expression. It is called with
                a single ``Column`` when ``col`` is set, and with no arguments
                otherwise.
            col: Name of the column to apply ``inner`` to, if any.
            result_type: Explicit result type; when set it takes precedence
                over the type inferred from the column.
            is_array: When true, the inferred type is ``list[<column type>]``
                rather than the column type itself.
        """
        self.inner = inner
        self.col = col
        self.result_type = result_type
        self.is_array = is_array

    @property
    def db_col(self) -> Optional[str]:
        """Database name of ``col`` via ``ColumnMeta.to_db_name``, or ``None``."""
        return ColumnMeta.to_db_name(self.col) if self.col else None

    def db_col_type(self, signals_schema: "SignalSchema") -> Optional["DataType"]:
        """Return the type inferred from the column, or ``None`` without a column.

        The column type is looked up in ``signals_schema``; it is wrapped in
        ``list[...]`` when ``is_array`` is set.
        """
        if not self.db_col:
            return None
        col_type: type = signals_schema.get_column_type(self.db_col)
        return list[col_type] if self.is_array else col_type  # type: ignore[valid-type]

    def get_result_type(self, signals_schema: "SignalSchema") -> "DataType":
        """Return the result type of this function.

        An explicit ``result_type`` wins; otherwise the type inferred from the
        column is used.

        Raises:
            DataChainColumnError: If neither an explicit result type nor a
                column is available to infer the type from.
        """
        # The column is resolved against the schema first, so a lookup failure
        # surfaces even when an explicit result type is configured.
        col_type = self.db_col_type(signals_schema)

        if self.result_type:
            return self.result_type

        if col_type:
            return col_type

        raise DataChainColumnError(
            str(self.inner),
            "Column name is required to infer result type",
        )

    def get_column(
        self, signals_schema: "SignalSchema", label: Optional[str] = None
    ) -> Column:
        """Build the SQL expression for this function.

        Args:
            signals_schema: Schema used to resolve the column's type.
            label: Optional label applied to the resulting expression.

        Returns:
            The expression returned by ``inner``, labelled when ``label`` is
            given.
        """
        if self.col:
            col_type = self.get_result_type(signals_schema)
            col = Column(self.db_col, python_to_sql(col_type))
            func_col = self.inner(col)
        else:
            # No column configured: ``inner`` takes no arguments.
            func_col = self.inner()

        if label:
            func_col = func_col.label(label)

        return func_col
|
|
@@ -400,6 +400,12 @@ class SignalSchema:
|
|
|
400
400
|
if ModelStore.is_pydantic(finfo.annotation):
|
|
401
401
|
SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
|
|
402
402
|
|
|
403
|
+
def get_column_type(self, col_name: str) -> DataType:
    """Return the type of the leaf signal whose flattened name is ``col_name``.

    Entries of ``get_flat_tree()`` that have a subtree are ignored; the path
    of each remaining entry is joined with ``DEFAULT_DELIMITER`` and compared
    with ``col_name``.

    Raises:
        SignalResolvingError: If no leaf entry matches ``col_name``.
    """
    for parts, leaf_type, is_branch, _depth in self.get_flat_tree():
        if is_branch:
            continue
        if DEFAULT_DELIMITER.join(parts) == col_name:
            return leaf_type
    raise SignalResolvingError([col_name], "is not found")
|
|
408
|
+
|
|
403
409
|
def db_signals(
|
|
404
410
|
self, name: Optional[str] = None, as_columns=False
|
|
405
411
|
) -> Union[list[str], list[Column]]:
|
|
@@ -490,7 +496,7 @@ class SignalSchema:
|
|
|
490
496
|
new_values[name] = args_map[name]
|
|
491
497
|
else:
|
|
492
498
|
# adding new signal
|
|
493
|
-
new_values
|
|
499
|
+
new_values[name] = sql_to_python(value)
|
|
494
500
|
|
|
495
501
|
return SignalSchema(new_values)
|
|
496
502
|
|
|
@@ -534,12 +540,12 @@ class SignalSchema:
|
|
|
534
540
|
for name, val in values.items()
|
|
535
541
|
}
|
|
536
542
|
|
|
537
|
-
def get_flat_tree(self) -> Iterator[tuple[list[str],
|
|
543
|
+
def get_flat_tree(self) -> Iterator[tuple[list[str], DataType, bool, int]]:
|
|
538
544
|
yield from self._get_flat_tree(self.tree, [], 0)
|
|
539
545
|
|
|
540
546
|
def _get_flat_tree(
|
|
541
547
|
self, tree: dict, prefix: list[str], depth: int
|
|
542
|
-
) -> Iterator[tuple[list[str],
|
|
548
|
+
) -> Iterator[tuple[list[str], DataType, bool, int]]:
|
|
543
549
|
for name, (type_, substree) in tree.items():
|
|
544
550
|
suffix = name.split(".")
|
|
545
551
|
new_prefix = prefix + suffix
|
|
@@ -23,3 +23,8 @@ class DataChainError(Exception):
|
|
|
23
23
|
class DataChainParamsError(DataChainError):
|
|
24
24
|
def __init__(self, message):
|
|
25
25
|
super().__init__(message)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DataChainColumnError(DataChainParamsError):
    """Parameter error that names the column it refers to."""

    def __init__(self, col_name, msg):
        """Build the message from the column name and the problem description.

        Args:
            col_name: Identifier of the column; interpolated into the message.
            msg: Description of what is wrong with the column.
        """
        message = f"Error for column {col_name}: {msg}"
        super().__init__(message)
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
from .dataset import DatasetQuery
|
|
2
2
|
from .params import param
|
|
3
|
-
from .schema import C,
|
|
3
|
+
from .schema import C, LocalFilename, Object, Stream
|
|
4
4
|
from .session import Session
|
|
5
5
|
|
|
6
6
|
__all__ = [
|
|
7
7
|
"C",
|
|
8
8
|
"DatasetQuery",
|
|
9
|
-
"DatasetRow",
|
|
10
9
|
"LocalFilename",
|
|
11
10
|
"Object",
|
|
12
11
|
"Session",
|
|
@@ -591,10 +591,6 @@ class UDFSignal(UDFStep):
|
|
|
591
591
|
return query, []
|
|
592
592
|
table = self.catalog.warehouse.create_pre_udf_table(query)
|
|
593
593
|
q: Select = sqlalchemy.select(*table.c)
|
|
594
|
-
if query._order_by_clauses:
|
|
595
|
-
# we are adding ordering only if it's explicitly added by user in
|
|
596
|
-
# query part before adding signals
|
|
597
|
-
q = q.order_by(table.c.sys__id)
|
|
598
594
|
return q, [table]
|
|
599
595
|
|
|
600
596
|
def create_result_query(
|
|
@@ -630,11 +626,6 @@ class UDFSignal(UDFStep):
|
|
|
630
626
|
else:
|
|
631
627
|
res = sqlalchemy.select(*cols1).select_from(subq)
|
|
632
628
|
|
|
633
|
-
if query._order_by_clauses:
|
|
634
|
-
# if ordering is used in query part before adding signals, we
|
|
635
|
-
# will have it as order by id from select from pre-created udf table
|
|
636
|
-
res = res.order_by(subq.c.sys__id)
|
|
637
|
-
|
|
638
629
|
if self.partition_by is not None:
|
|
639
630
|
subquery = res.subquery()
|
|
640
631
|
res = sqlalchemy.select(*subquery.c).select_from(subquery)
|
|
@@ -666,13 +657,6 @@ class RowGenerator(UDFStep):
|
|
|
666
657
|
def create_result_query(
|
|
667
658
|
self, udf_table, query: Select
|
|
668
659
|
) -> tuple[QueryGeneratorFunc, list["sqlalchemy.Column"]]:
|
|
669
|
-
if not query._order_by_clauses:
|
|
670
|
-
# if we are not selecting all rows in UDF, we need to ensure that
|
|
671
|
-
# we get the same rows as we got as inputs of UDF since selecting
|
|
672
|
-
# without ordering can be non deterministic in some databases
|
|
673
|
-
c = query.selected_columns
|
|
674
|
-
query = query.order_by(c.sys__id)
|
|
675
|
-
|
|
676
660
|
udf_table_query = udf_table.select().subquery()
|
|
677
661
|
udf_table_cols: list[sqlalchemy.Label[Any]] = [
|
|
678
662
|
label(c.name, c) for c in udf_table_query.columns
|
|
@@ -957,24 +941,24 @@ class SQLJoin(Step):
|
|
|
957
941
|
|
|
958
942
|
|
|
959
943
|
@frozen
|
|
960
|
-
class
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
cols: PartitionByType
|
|
944
|
+
class SQLGroupBy(SQLClause):
|
|
945
|
+
cols: Sequence[Union[str, ColumnElement]]
|
|
946
|
+
group_by: Sequence[Union[str, ColumnElement]]
|
|
964
947
|
|
|
965
|
-
def
|
|
966
|
-
|
|
948
|
+
def apply_sql_clause(self, query) -> Select:
|
|
949
|
+
if not self.cols:
|
|
950
|
+
raise ValueError("No columns to select")
|
|
951
|
+
if not self.group_by:
|
|
952
|
+
raise ValueError("No columns to group by")
|
|
967
953
|
|
|
968
|
-
|
|
969
|
-
self, query_generator: QueryGenerator, temp_tables: list[str]
|
|
970
|
-
) -> StepResult:
|
|
971
|
-
query = query_generator.select()
|
|
972
|
-
grouped_query = query.group_by(*self.cols)
|
|
954
|
+
subquery = query.subquery()
|
|
973
955
|
|
|
974
|
-
|
|
975
|
-
|
|
956
|
+
cols = [
|
|
957
|
+
subquery.c[str(c)] if isinstance(c, (str, C)) else c
|
|
958
|
+
for c in [*self.group_by, *self.cols]
|
|
959
|
+
]
|
|
976
960
|
|
|
977
|
-
return
|
|
961
|
+
return sqlalchemy.select(*cols).select_from(subquery).group_by(*self.group_by)
|
|
978
962
|
|
|
979
963
|
|
|
980
964
|
def _validate_columns(
|
|
@@ -1130,25 +1114,14 @@ class DatasetQuery:
|
|
|
1130
1114
|
query.steps = query.steps[-1:] + query.steps[:-1]
|
|
1131
1115
|
|
|
1132
1116
|
result = query.starting_step.apply()
|
|
1133
|
-
group_by = None
|
|
1134
1117
|
self.dependencies.update(result.dependencies)
|
|
1135
1118
|
|
|
1136
1119
|
for step in query.steps:
|
|
1137
|
-
if isinstance(step, GroupBy):
|
|
1138
|
-
if group_by is not None:
|
|
1139
|
-
raise TypeError("only one group_by allowed")
|
|
1140
|
-
group_by = step
|
|
1141
|
-
continue
|
|
1142
|
-
|
|
1143
1120
|
result = step.apply(
|
|
1144
1121
|
result.query_generator, self.temp_table_names
|
|
1145
1122
|
) # a chain of steps linked by results
|
|
1146
1123
|
self.dependencies.update(result.dependencies)
|
|
1147
1124
|
|
|
1148
|
-
if group_by:
|
|
1149
|
-
result = group_by.apply(result.query_generator, self.temp_table_names)
|
|
1150
|
-
self.dependencies.update(result.dependencies)
|
|
1151
|
-
|
|
1152
1125
|
return result.query_generator
|
|
1153
1126
|
|
|
1154
1127
|
@staticmethod
|
|
@@ -1410,9 +1383,13 @@ class DatasetQuery:
|
|
|
1410
1383
|
return query.as_scalar()
|
|
1411
1384
|
|
|
1412
1385
|
@detach
|
|
1413
|
-
def group_by(
|
|
1386
|
+
def group_by(
|
|
1387
|
+
self,
|
|
1388
|
+
cols: Sequence[ColumnElement],
|
|
1389
|
+
group_by: Sequence[ColumnElement],
|
|
1390
|
+
) -> "Self":
|
|
1414
1391
|
query = self.clone()
|
|
1415
|
-
query.steps.append(
|
|
1392
|
+
query.steps.append(SQLGroupBy(cols, group_by))
|
|
1416
1393
|
return query
|
|
1417
1394
|
|
|
1418
1395
|
@detach
|
|
@@ -1591,6 +1568,8 @@ class DatasetQuery:
|
|
|
1591
1568
|
)
|
|
1592
1569
|
version = version or dataset.latest_version
|
|
1593
1570
|
|
|
1571
|
+
self.session.add_dataset_version(dataset=dataset, version=version)
|
|
1572
|
+
|
|
1594
1573
|
dr = self.catalog.warehouse.dataset_rows(dataset)
|
|
1595
1574
|
|
|
1596
1575
|
self.catalog.warehouse.copy_table(dr.get_table(), query.select())
|
|
@@ -1,16 +1,13 @@
|
|
|
1
1
|
import functools
|
|
2
|
-
import json
|
|
3
2
|
from abc import ABC, abstractmethod
|
|
4
|
-
from datetime import datetime, timezone
|
|
5
3
|
from fnmatch import fnmatch
|
|
6
|
-
from typing import TYPE_CHECKING, Any, Callable,
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
|
|
7
5
|
|
|
8
6
|
import attrs
|
|
9
7
|
import sqlalchemy as sa
|
|
10
8
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
11
9
|
|
|
12
10
|
from datachain.lib.file import File
|
|
13
|
-
from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String
|
|
14
11
|
|
|
15
12
|
if TYPE_CHECKING:
|
|
16
13
|
from datachain.catalog import Catalog
|
|
@@ -228,61 +225,4 @@ def normalize_param(param: UDFParamSpec) -> UDFParameter:
|
|
|
228
225
|
raise TypeError(f"Invalid UDF parameter: {param}")
|
|
229
226
|
|
|
230
227
|
|
|
231
|
-
class DatasetRow:
|
|
232
|
-
schema: ClassVar[dict[str, type[SQLType]]] = {
|
|
233
|
-
"source": String,
|
|
234
|
-
"path": String,
|
|
235
|
-
"size": Int64,
|
|
236
|
-
"location": JSON,
|
|
237
|
-
"is_latest": Boolean,
|
|
238
|
-
"last_modified": DateTime,
|
|
239
|
-
"version": String,
|
|
240
|
-
"etag": String,
|
|
241
|
-
}
|
|
242
|
-
|
|
243
|
-
@staticmethod
|
|
244
|
-
def create(
|
|
245
|
-
path: str,
|
|
246
|
-
source: str = "",
|
|
247
|
-
size: int = 0,
|
|
248
|
-
location: Optional[dict[str, Any]] = None,
|
|
249
|
-
is_latest: bool = True,
|
|
250
|
-
last_modified: Optional[datetime] = None,
|
|
251
|
-
version: str = "",
|
|
252
|
-
etag: str = "",
|
|
253
|
-
) -> tuple[
|
|
254
|
-
str,
|
|
255
|
-
str,
|
|
256
|
-
int,
|
|
257
|
-
Optional[str],
|
|
258
|
-
int,
|
|
259
|
-
bool,
|
|
260
|
-
datetime,
|
|
261
|
-
str,
|
|
262
|
-
str,
|
|
263
|
-
int,
|
|
264
|
-
]:
|
|
265
|
-
if location:
|
|
266
|
-
location = json.dumps([location]) # type: ignore [assignment]
|
|
267
|
-
|
|
268
|
-
last_modified = last_modified or datetime.now(timezone.utc)
|
|
269
|
-
|
|
270
|
-
return ( # type: ignore [return-value]
|
|
271
|
-
source,
|
|
272
|
-
path,
|
|
273
|
-
size,
|
|
274
|
-
location,
|
|
275
|
-
is_latest,
|
|
276
|
-
last_modified,
|
|
277
|
-
version,
|
|
278
|
-
etag,
|
|
279
|
-
)
|
|
280
|
-
|
|
281
|
-
@staticmethod
|
|
282
|
-
def extend(**columns):
|
|
283
|
-
cols = {**DatasetRow.schema}
|
|
284
|
-
cols.update(columns)
|
|
285
|
-
return cols
|
|
286
|
-
|
|
287
|
-
|
|
288
228
|
C = Column
|