datachain 0.5.1__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.5.1 → datachain-0.6.1}/.pre-commit-config.yaml +2 -2
- {datachain-0.5.1/src/datachain.egg-info → datachain-0.6.1}/PKG-INFO +2 -2
- {datachain-0.5.1 → datachain-0.6.1}/pyproject.toml +1 -1
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/__init__.py +2 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/catalog/catalog.py +1 -9
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/data_storage/sqlite.py +8 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/data_storage/warehouse.py +0 -4
- datachain-0.6.1/src/datachain/lib/convert/sql_to_python.py +14 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/convert/values_to_tuples.py +2 -2
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/data_model.py +1 -1
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/dc.py +82 -30
- datachain-0.6.1/src/datachain/lib/func/__init__.py +14 -0
- datachain-0.6.1/src/datachain/lib/func/aggregate.py +42 -0
- datachain-0.6.1/src/datachain/lib/func/func.py +64 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/signal_schema.py +15 -9
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/udf.py +177 -151
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/utils.py +5 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/query/__init__.py +1 -2
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/query/batch.py +0 -11
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/query/dataset.py +23 -44
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/query/dispatch.py +0 -12
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/query/schema.py +1 -61
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/query/session.py +33 -25
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/functions/__init__.py +1 -1
- datachain-0.6.1/src/datachain/sql/functions/aggregate.py +47 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/functions/array.py +0 -8
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/functions/string.py +12 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/sqlite/base.py +30 -7
- {datachain-0.5.1 → datachain-0.6.1/src/datachain.egg-info}/PKG-INFO +2 -2
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain.egg-info/SOURCES.txt +4 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_datachain.py +61 -8
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_dataset_query.py +0 -34
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_datasets.py +33 -0
- datachain-0.6.1/tests/scripts/feature_class_exception.py +34 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/test_atomicity.py +10 -4
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_datachain.py +361 -19
- datachain-0.6.1/tests/unit/lib/test_sql_to_python.py +25 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/sql/test_string.py +15 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/utils.py +20 -0
- datachain-0.5.1/src/datachain/lib/convert/sql_to_python.py +0 -18
- datachain-0.5.1/tests/scripts/feature_class_exception.py +0 -24
- datachain-0.5.1/tests/unit/lib/test_sql_to_python.py +0 -28
- {datachain-0.5.1 → datachain-0.6.1}/.cruft.json +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.gitattributes +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.github/codecov.yaml +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.github/dependabot.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.github/workflows/release.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.github/workflows/tests.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/.gitignore +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/CONTRIBUTING.rst +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/LICENSE +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/README.rst +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/assets/flowchart.png +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/index.md +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/references/datachain.md +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/references/datatype.md +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/references/file.md +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/references/index.md +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/references/sql.md +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/references/torch.md +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/docs/references/udf.md +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/mkdocs.yml +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/noxfile.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/overrides/main.html +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/setup.cfg +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/__main__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/asyn.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/cache.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/cli.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/cli_utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/client/local.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/config.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/dataset.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/error.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/job.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/file.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/listing.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/listing.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/node.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/progress.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/py.typed +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/query/params.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/storage.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain/utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/conftest.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/data.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/examples/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_client.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_listing.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_ls.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_pull.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/func/test_query.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/test_telemetry.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_client.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_query.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_session.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_storage.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.5.1 → datachain-0.6.1}/tests/unit/test_warehouse.py +0 -0
|
@@ -4,7 +4,7 @@ ci:
|
|
|
4
4
|
skip: [mypy]
|
|
5
5
|
repos:
|
|
6
6
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
7
|
-
rev:
|
|
7
|
+
rev: v5.0.0
|
|
8
8
|
hooks:
|
|
9
9
|
- id: check-added-large-files
|
|
10
10
|
exclude: '^tests/examples/data/'
|
|
@@ -24,7 +24,7 @@ repos:
|
|
|
24
24
|
- id: trailing-whitespace
|
|
25
25
|
exclude: '^LICENSES/'
|
|
26
26
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
27
|
-
rev: 'v0.6.
|
|
27
|
+
rev: 'v0.6.9'
|
|
28
28
|
hooks:
|
|
29
29
|
- id: ruff
|
|
30
30
|
args: [--fix, --exit-non-zero-on-fix]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.5.1
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -81,7 +81,7 @@ Requires-Dist: requests-mock; extra == "tests"
|
|
|
81
81
|
Requires-Dist: scipy; extra == "tests"
|
|
82
82
|
Provides-Extra: dev
|
|
83
83
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
84
|
-
Requires-Dist: mypy==1.
|
|
84
|
+
Requires-Dist: mypy==1.12.0; extra == "dev"
|
|
85
85
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
86
86
|
Requires-Dist: types-pytz; extra == "dev"
|
|
87
87
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from datachain.lib import func
|
|
1
2
|
from datachain.lib.data_model import DataModel, DataType, is_chain_type
|
|
2
3
|
from datachain.lib.dc import C, Column, DataChain, Sys
|
|
3
4
|
from datachain.lib.file import (
|
|
@@ -34,6 +35,7 @@ __all__ = [
|
|
|
34
35
|
"Sys",
|
|
35
36
|
"TarVFile",
|
|
36
37
|
"TextFile",
|
|
38
|
+
"func",
|
|
37
39
|
"is_chain_type",
|
|
38
40
|
"metrics",
|
|
39
41
|
"param",
|
|
@@ -989,13 +989,6 @@ class Catalog:
|
|
|
989
989
|
c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
|
|
990
990
|
}
|
|
991
991
|
|
|
992
|
-
job_id = job_id or os.getenv("DATACHAIN_JOB_ID")
|
|
993
|
-
if not job_id:
|
|
994
|
-
from datachain.query.session import Session
|
|
995
|
-
|
|
996
|
-
session = Session.get(catalog=self)
|
|
997
|
-
job_id = session.job_id
|
|
998
|
-
|
|
999
992
|
dataset = self.metastore.create_dataset_version(
|
|
1000
993
|
dataset,
|
|
1001
994
|
version,
|
|
@@ -1218,6 +1211,7 @@ class Catalog:
|
|
|
1218
1211
|
preview=dataset_version.preview,
|
|
1219
1212
|
job_id=dataset_version.job_id,
|
|
1220
1213
|
)
|
|
1214
|
+
|
|
1221
1215
|
# to avoid re-creating rows table, we are just renaming it for a new version
|
|
1222
1216
|
# of target dataset
|
|
1223
1217
|
self.warehouse.rename_dataset_table(
|
|
@@ -1325,8 +1319,6 @@ class Catalog:
|
|
|
1325
1319
|
if offset:
|
|
1326
1320
|
q = q.offset(offset)
|
|
1327
1321
|
|
|
1328
|
-
q = q.order_by("sys__id")
|
|
1329
|
-
|
|
1330
1322
|
return q.to_db_records()
|
|
1331
1323
|
|
|
1332
1324
|
def signed_url(self, source: str, path: str, client_config=None) -> str:
|
|
@@ -763,6 +763,14 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
763
763
|
query: Select,
|
|
764
764
|
progress_cb: Optional[Callable[[int], None]] = None,
|
|
765
765
|
) -> None:
|
|
766
|
+
if len(query._group_by_clause) > 0:
|
|
767
|
+
select_q = query.with_only_columns(
|
|
768
|
+
*[c for c in query.selected_columns if c.name != "sys__id"]
|
|
769
|
+
)
|
|
770
|
+
q = table.insert().from_select(list(select_q.selected_columns), select_q)
|
|
771
|
+
self.db.execute(q)
|
|
772
|
+
return
|
|
773
|
+
|
|
766
774
|
if "sys__id" in query.selected_columns:
|
|
767
775
|
col_id = query.selected_columns.sys__id
|
|
768
776
|
else:
|
|
@@ -215,10 +215,6 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
215
215
|
limit = query._limit
|
|
216
216
|
paginated_query = query.limit(page_size)
|
|
217
217
|
|
|
218
|
-
if not paginated_query._order_by_clauses:
|
|
219
|
-
# default order by is order by `sys__id`
|
|
220
|
-
paginated_query = paginated_query.order_by(query.selected_columns.sys__id)
|
|
221
|
-
|
|
222
218
|
results = None
|
|
223
219
|
offset = 0
|
|
224
220
|
num_yielded = 0
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from decimal import Decimal
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from sqlalchemy import ColumnElement
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def sql_to_python(sql_exp: ColumnElement) -> Any:
|
|
8
|
+
try:
|
|
9
|
+
type_ = sql_exp.type.python_type
|
|
10
|
+
if type_ == Decimal:
|
|
11
|
+
type_ = float
|
|
12
|
+
except NotImplementedError:
|
|
13
|
+
type_ = str
|
|
14
|
+
return type_
|
|
@@ -4,7 +4,7 @@ from typing import Any, Union
|
|
|
4
4
|
from datachain.lib.data_model import (
|
|
5
5
|
DataType,
|
|
6
6
|
DataTypeNames,
|
|
7
|
-
|
|
7
|
+
DataValue,
|
|
8
8
|
is_chain_type,
|
|
9
9
|
)
|
|
10
10
|
from datachain.lib.utils import DataChainParamsError
|
|
@@ -20,7 +20,7 @@ class ValuesToTupleError(DataChainParamsError):
|
|
|
20
20
|
def values_to_tuples( # noqa: C901, PLR0912
|
|
21
21
|
ds_name: str = "",
|
|
22
22
|
output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
|
|
23
|
-
**fr_map: Sequence[
|
|
23
|
+
**fr_map: Sequence[DataValue],
|
|
24
24
|
) -> tuple[Any, Any, Any]:
|
|
25
25
|
if output:
|
|
26
26
|
if not isinstance(output, (Sequence, str, dict)):
|
|
@@ -18,7 +18,7 @@ StandardType = Union[
|
|
|
18
18
|
]
|
|
19
19
|
DataType = Union[type[BaseModel], StandardType]
|
|
20
20
|
DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
|
|
21
|
-
|
|
21
|
+
DataValue = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class DataModel(BaseModel):
|
|
@@ -29,6 +29,7 @@ from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
|
|
|
29
29
|
from datachain.lib.dataset_info import DatasetInfo
|
|
30
30
|
from datachain.lib.file import ArrowRow, File, get_file_type
|
|
31
31
|
from datachain.lib.file import ExportPlacement as FileExportPlacement
|
|
32
|
+
from datachain.lib.func import Func
|
|
32
33
|
from datachain.lib.listing import (
|
|
33
34
|
is_listing_dataset,
|
|
34
35
|
is_listing_expired,
|
|
@@ -42,26 +43,18 @@ from datachain.lib.meta_formats import read_meta, read_schema
|
|
|
42
43
|
from datachain.lib.model_store import ModelStore
|
|
43
44
|
from datachain.lib.settings import Settings
|
|
44
45
|
from datachain.lib.signal_schema import SignalSchema
|
|
45
|
-
from datachain.lib.udf import (
|
|
46
|
-
Aggregator,
|
|
47
|
-
BatchMapper,
|
|
48
|
-
Generator,
|
|
49
|
-
Mapper,
|
|
50
|
-
UDFBase,
|
|
51
|
-
)
|
|
46
|
+
from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
|
|
52
47
|
from datachain.lib.udf_signature import UdfSignature
|
|
53
|
-
from datachain.lib.utils import DataChainParamsError
|
|
48
|
+
from datachain.lib.utils import DataChainColumnError, DataChainParamsError
|
|
54
49
|
from datachain.query import Session
|
|
55
|
-
from datachain.query.dataset import (
|
|
56
|
-
|
|
57
|
-
PartitionByType,
|
|
58
|
-
)
|
|
59
|
-
from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
|
|
50
|
+
from datachain.query.dataset import DatasetQuery, PartitionByType
|
|
51
|
+
from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
|
|
60
52
|
from datachain.sql.functions import path as pathfunc
|
|
61
53
|
from datachain.telemetry import telemetry
|
|
62
54
|
from datachain.utils import batched_it, inside_notebook
|
|
63
55
|
|
|
64
56
|
if TYPE_CHECKING:
|
|
57
|
+
from pyarrow import DataType as ArrowDataType
|
|
65
58
|
from typing_extensions import Concatenate, ParamSpec, Self
|
|
66
59
|
|
|
67
60
|
from datachain.lib.hf import HFDatasetType
|
|
@@ -148,11 +141,6 @@ class DatasetMergeError(DataChainParamsError): # noqa: D101
|
|
|
148
141
|
super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
|
|
149
142
|
|
|
150
143
|
|
|
151
|
-
class DataChainColumnError(DataChainParamsError): # noqa: D101
|
|
152
|
-
def __init__(self, col_name, msg): # noqa: D107
|
|
153
|
-
super().__init__(f"Error for column {col_name}: {msg}")
|
|
154
|
-
|
|
155
|
-
|
|
156
144
|
OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]
|
|
157
145
|
|
|
158
146
|
|
|
@@ -981,10 +969,9 @@ class DataChain:
|
|
|
981
969
|
row is left in the result set.
|
|
982
970
|
|
|
983
971
|
Example:
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
```
|
|
972
|
+
```py
|
|
973
|
+
dc.distinct("file.parent", "file.name")
|
|
974
|
+
```
|
|
988
975
|
"""
|
|
989
976
|
return self._evolve(
|
|
990
977
|
query=self._query.distinct(
|
|
@@ -1010,6 +997,60 @@ class DataChain:
|
|
|
1010
997
|
query=self._query.select(*columns), signal_schema=new_schema
|
|
1011
998
|
)
|
|
1012
999
|
|
|
1000
|
+
def group_by(
|
|
1001
|
+
self,
|
|
1002
|
+
*,
|
|
1003
|
+
partition_by: Union[str, Sequence[str]],
|
|
1004
|
+
**kwargs: Func,
|
|
1005
|
+
) -> "Self":
|
|
1006
|
+
"""Group rows by specified set of signals and return new signals
|
|
1007
|
+
with aggregated values.
|
|
1008
|
+
|
|
1009
|
+
Example:
|
|
1010
|
+
```py
|
|
1011
|
+
chain = chain.group_by(
|
|
1012
|
+
cnt=func.count(),
|
|
1013
|
+
partition_by=("file_source", "file_ext"),
|
|
1014
|
+
)
|
|
1015
|
+
```
|
|
1016
|
+
"""
|
|
1017
|
+
if isinstance(partition_by, str):
|
|
1018
|
+
partition_by = [partition_by]
|
|
1019
|
+
if not partition_by:
|
|
1020
|
+
raise ValueError("At least one column should be provided for partition_by")
|
|
1021
|
+
|
|
1022
|
+
if not kwargs:
|
|
1023
|
+
raise ValueError("At least one column should be provided for group_by")
|
|
1024
|
+
for col_name, func in kwargs.items():
|
|
1025
|
+
if not isinstance(func, Func):
|
|
1026
|
+
raise DataChainColumnError(
|
|
1027
|
+
col_name,
|
|
1028
|
+
f"Column {col_name} has type {type(func)} but expected Func object",
|
|
1029
|
+
)
|
|
1030
|
+
|
|
1031
|
+
partition_by_columns: list[Column] = []
|
|
1032
|
+
signal_columns: list[Column] = []
|
|
1033
|
+
schema_fields: dict[str, DataType] = {}
|
|
1034
|
+
|
|
1035
|
+
# validate partition_by columns and add them to the schema
|
|
1036
|
+
for col_name in partition_by:
|
|
1037
|
+
col_db_name = ColumnMeta.to_db_name(col_name)
|
|
1038
|
+
col_type = self.signals_schema.get_column_type(col_db_name)
|
|
1039
|
+
col = Column(col_db_name, python_to_sql(col_type))
|
|
1040
|
+
partition_by_columns.append(col)
|
|
1041
|
+
schema_fields[col_db_name] = col_type
|
|
1042
|
+
|
|
1043
|
+
# validate signal columns and add them to the schema
|
|
1044
|
+
for col_name, func in kwargs.items():
|
|
1045
|
+
col = func.get_column(self.signals_schema, label=col_name)
|
|
1046
|
+
signal_columns.append(col)
|
|
1047
|
+
schema_fields[col_name] = func.get_result_type(self.signals_schema)
|
|
1048
|
+
|
|
1049
|
+
return self._evolve(
|
|
1050
|
+
query=self._query.group_by(signal_columns, partition_by_columns),
|
|
1051
|
+
signal_schema=SignalSchema(schema_fields),
|
|
1052
|
+
)
|
|
1053
|
+
|
|
1013
1054
|
def mutate(self, **kwargs) -> "Self":
|
|
1014
1055
|
"""Create new signals based on existing signals.
|
|
1015
1056
|
|
|
@@ -1024,7 +1065,7 @@ class DataChain:
|
|
|
1024
1065
|
The supported functions:
|
|
1025
1066
|
Numerical: +, -, *, /, rand(), avg(), count(), func(),
|
|
1026
1067
|
greatest(), least(), max(), min(), sum()
|
|
1027
|
-
String: length(), split()
|
|
1068
|
+
String: length(), split(), replace(), regexp_replace()
|
|
1028
1069
|
Filename: name(), parent(), file_stem(), file_ext()
|
|
1029
1070
|
Array: length(), sip_hash_64(), euclidean_distance(),
|
|
1030
1071
|
cosine_distance()
|
|
@@ -1476,12 +1517,6 @@ class DataChain:
|
|
|
1476
1517
|
fr_map = {col.lower(): df[col].tolist() for col in df.columns}
|
|
1477
1518
|
|
|
1478
1519
|
for column in fr_map:
|
|
1479
|
-
if column in DatasetRow.schema:
|
|
1480
|
-
raise DatasetPrepareError(
|
|
1481
|
-
name,
|
|
1482
|
-
f"import from pandas error - column '{column}' conflicts with"
|
|
1483
|
-
" default schema",
|
|
1484
|
-
)
|
|
1485
1520
|
if not column.isidentifier():
|
|
1486
1521
|
raise DatasetPrepareError(
|
|
1487
1522
|
name,
|
|
@@ -1709,6 +1744,7 @@ class DataChain:
|
|
|
1709
1744
|
nrows=None,
|
|
1710
1745
|
session: Optional[Session] = None,
|
|
1711
1746
|
settings: Optional[dict] = None,
|
|
1747
|
+
column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
|
|
1712
1748
|
**kwargs,
|
|
1713
1749
|
) -> "DataChain":
|
|
1714
1750
|
"""Generate chain from csv files.
|
|
@@ -1727,6 +1763,9 @@ class DataChain:
|
|
|
1727
1763
|
nrows : Optional row limit.
|
|
1728
1764
|
session : Session to use for the chain.
|
|
1729
1765
|
settings : Settings to use for the chain.
|
|
1766
|
+
column_types : Dictionary of column names and their corresponding types.
|
|
1767
|
+
It is passed to CSV reader and for each column specified type auto
|
|
1768
|
+
inference is disabled.
|
|
1730
1769
|
|
|
1731
1770
|
Example:
|
|
1732
1771
|
Reading a csv file:
|
|
@@ -1742,6 +1781,15 @@ class DataChain:
|
|
|
1742
1781
|
from pandas.io.parsers.readers import STR_NA_VALUES
|
|
1743
1782
|
from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
|
|
1744
1783
|
from pyarrow.dataset import CsvFileFormat
|
|
1784
|
+
from pyarrow.lib import type_for_alias
|
|
1785
|
+
|
|
1786
|
+
if column_types:
|
|
1787
|
+
column_types = {
|
|
1788
|
+
name: type_for_alias(typ) if isinstance(typ, str) else typ
|
|
1789
|
+
for name, typ in column_types.items()
|
|
1790
|
+
}
|
|
1791
|
+
else:
|
|
1792
|
+
column_types = {}
|
|
1745
1793
|
|
|
1746
1794
|
chain = DataChain.from_storage(
|
|
1747
1795
|
path, session=session, settings=settings, **kwargs
|
|
@@ -1767,7 +1815,9 @@ class DataChain:
|
|
|
1767
1815
|
parse_options = ParseOptions(delimiter=delimiter)
|
|
1768
1816
|
read_options = ReadOptions(column_names=column_names)
|
|
1769
1817
|
convert_options = ConvertOptions(
|
|
1770
|
-
strings_can_be_null=True,
|
|
1818
|
+
strings_can_be_null=True,
|
|
1819
|
+
null_values=STR_NA_VALUES,
|
|
1820
|
+
column_types=column_types,
|
|
1771
1821
|
)
|
|
1772
1822
|
format = CsvFileFormat(
|
|
1773
1823
|
parse_options=parse_options,
|
|
@@ -1978,6 +2028,8 @@ class DataChain:
|
|
|
1978
2028
|
),
|
|
1979
2029
|
)
|
|
1980
2030
|
|
|
2031
|
+
session.add_dataset_version(dsr, dsr.latest_version)
|
|
2032
|
+
|
|
1981
2033
|
if isinstance(to_insert, dict):
|
|
1982
2034
|
to_insert = [to_insert]
|
|
1983
2035
|
elif not to_insert:
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import func as sa_func
|
|
4
|
+
|
|
5
|
+
from datachain.sql import functions as dc_func
|
|
6
|
+
|
|
7
|
+
from .func import Func
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def count(col: Optional[str] = None) -> Func:
    """Build a count aggregate.

    Wraps SQLAlchemy's ``func.count`` and always reports ``int`` as the
    result type. ``col`` is optional; it is handed to ``Func`` unchanged.
    """
    counter = Func(sa_func.count, col, int)
    return counter
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def sum(col: str) -> Func:
    """Build a sum aggregate over ``col`` using SQLAlchemy's ``func.sum``.

    No explicit result type is supplied to ``Func``.
    """
    summed = Func(sa_func.sum, col)
    return summed
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def avg(col: str) -> Func:
    """Build an average aggregate over ``col``.

    Wraps ``datachain.sql.functions.aggregate.avg``; no explicit result type
    is supplied to ``Func``.
    """
    averaged = Func(dc_func.aggregate.avg, col)
    return averaged
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def min(col: str) -> Func:
    """Build a minimum aggregate over ``col`` using SQLAlchemy's ``func.min``.

    No explicit result type is supplied to ``Func``.
    """
    smallest = Func(sa_func.min, col)
    return smallest
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def max(col: str) -> Func:
    """Build a maximum aggregate over ``col`` using SQLAlchemy's ``func.max``.

    No explicit result type is supplied to ``Func``.
    """
    largest = Func(sa_func.max, col)
    return largest
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def any_value(col: str) -> Func:
    """Build an aggregate over ``col`` backed by ``aggregate.any_value``.

    Wraps ``datachain.sql.functions.aggregate.any_value``; no explicit result
    type is supplied to ``Func``.
    """
    picked = Func(dc_func.aggregate.any_value, col)
    return picked
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def collect(col: str) -> Func:
    """Build an aggregate over ``col`` backed by ``aggregate.collect``.

    Wraps ``datachain.sql.functions.aggregate.collect`` and marks the result
    as an array (``is_array=True``).
    """
    collected = Func(dc_func.aggregate.collect, col, is_array=True)
    return collected
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def concat(col: str, separator="") -> Func:
    """Build a string-concatenation aggregate over ``col``.

    The wrapped callable forwards its single argument together with
    ``separator`` to ``datachain.sql.functions.aggregate.group_concat``.
    The result type is always ``str``.
    """
    return Func(
        # Close over ``separator`` so Func can call this with the column only.
        lambda arg: dc_func.aggregate.group_concat(arg, separator),
        col,
        str,
    )
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Callable, Optional
|
|
2
|
+
|
|
3
|
+
from datachain.lib.convert.python_to_sql import python_to_sql
|
|
4
|
+
from datachain.lib.utils import DataChainColumnError
|
|
5
|
+
from datachain.query.schema import Column, ColumnMeta
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from datachain import DataType
|
|
9
|
+
from datachain.lib.signal_schema import SignalSchema
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Func:
    """Deferred function applied to an optional signal column.

    Holds a callable (``inner``) together with the signal name it should be
    applied to and hints about the type of the value it produces. The actual
    column expression is only built when ``get_column`` is called with a
    signals schema.
    """

    def __init__(
        self,
        inner: Callable,
        col: Optional[str] = None,
        result_type: Optional["DataType"] = None,
        is_array: bool = False,
    ) -> None:
        """Store the function and its typing hints.

        Args:
            inner: Callable producing the function expression. It is called
                with one column argument when ``col`` is set, otherwise with
                no arguments.
            col: Signal name the function is applied to, if any.
            result_type: Explicit result type; takes precedence over the
                type derived from the column.
            is_array: When true, the derived type is ``list[<column type>]``.
        """
        self.inner = inner
        self.col = col
        self.result_type = result_type
        self.is_array = is_array

    @property
    def db_col(self) -> Optional[str]:
        """Return ``col`` converted by ``ColumnMeta.to_db_name``, or None."""
        return ColumnMeta.to_db_name(self.col) if self.col else None

    def db_col_type(self, signals_schema: "SignalSchema") -> Optional["DataType"]:
        """Return the type derived from the column, or None without a column.

        The column type is looked up in ``signals_schema``; it is wrapped in
        ``list[...]`` when ``is_array`` is set.
        """
        db_col = self.db_col
        if not db_col:
            return None
        col_type: type = signals_schema.get_column_type(db_col)
        return list[col_type] if self.is_array else col_type  # type: ignore[valid-type]

    def get_result_type(self, signals_schema: "SignalSchema") -> "DataType":
        """Return the type of the value this function produces.

        The explicit ``result_type`` wins; otherwise the type derived from
        the column is used.

        Raises:
            DataChainColumnError: if neither an explicit result type nor a
                column is available.
        """
        # Looked up before checking ``result_type`` so that any error raised
        # by the schema lookup still surfaces when an explicit type is set.
        col_type = self.db_col_type(signals_schema)

        if self.result_type:
            return self.result_type

        if col_type:
            return col_type

        raise DataChainColumnError(
            str(self.inner),
            "Column name is required to infer result type",
        )

    def get_column(
        self, signals_schema: "SignalSchema", label: Optional[str] = None
    ) -> "Column":
        """Build the function expression, optionally labelled.

        Args:
            signals_schema: Schema used to resolve the column type.
            label: Name applied to the resulting expression via ``.label()``
                when given.
        """
        if self.col:
            col_type = self.get_result_type(signals_schema)
            col = Column(self.db_col, python_to_sql(col_type))
            func_col = self.inner(col)
        else:
            func_col = self.inner()

        if label:
            func_col = func_col.label(label)

        return func_col
|
|
@@ -25,7 +25,7 @@ from typing_extensions import Literal as LiteralEx
|
|
|
25
25
|
from datachain.lib.convert.python_to_sql import python_to_sql
|
|
26
26
|
from datachain.lib.convert.sql_to_python import sql_to_python
|
|
27
27
|
from datachain.lib.convert.unflatten import unflatten_to_json_pos
|
|
28
|
-
from datachain.lib.data_model import DataModel, DataType
|
|
28
|
+
from datachain.lib.data_model import DataModel, DataType, DataValue
|
|
29
29
|
from datachain.lib.file import File
|
|
30
30
|
from datachain.lib.model_store import ModelStore
|
|
31
31
|
from datachain.lib.utils import DataChainParamsError
|
|
@@ -110,7 +110,7 @@ class SignalSchema:
|
|
|
110
110
|
values: dict[str, DataType]
|
|
111
111
|
tree: dict[str, Any]
|
|
112
112
|
setup_func: dict[str, Callable]
|
|
113
|
-
setup_values: Optional[dict[str,
|
|
113
|
+
setup_values: Optional[dict[str, Any]]
|
|
114
114
|
|
|
115
115
|
def __init__(
|
|
116
116
|
self,
|
|
@@ -333,21 +333,21 @@ class SignalSchema:
|
|
|
333
333
|
res[db_name] = python_to_sql(type_)
|
|
334
334
|
return res
|
|
335
335
|
|
|
336
|
-
def row_to_objs(self, row: Sequence[Any]) -> list[
|
|
336
|
+
def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]:
|
|
337
337
|
self._init_setup_values()
|
|
338
338
|
|
|
339
|
-
objs = []
|
|
339
|
+
objs: list[DataValue] = []
|
|
340
340
|
pos = 0
|
|
341
341
|
for name, fr_type in self.values.items():
|
|
342
342
|
if self.setup_values and (val := self.setup_values.get(name, None)):
|
|
343
343
|
objs.append(val)
|
|
344
344
|
elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
|
|
345
345
|
j, pos = unflatten_to_json_pos(fr, row, pos)
|
|
346
|
-
objs.append(fr(**j))
|
|
346
|
+
objs.append(fr(**j))
|
|
347
347
|
else:
|
|
348
348
|
objs.append(row[pos])
|
|
349
349
|
pos += 1
|
|
350
|
-
return objs
|
|
350
|
+
return objs
|
|
351
351
|
|
|
352
352
|
def contains_file(self) -> bool:
|
|
353
353
|
for type_ in self.values.values():
|
|
@@ -400,6 +400,12 @@ class SignalSchema:
|
|
|
400
400
|
if ModelStore.is_pydantic(finfo.annotation):
|
|
401
401
|
SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
|
|
402
402
|
|
|
403
|
+
def get_column_type(self, col_name: str) -> DataType:
    """Return the type of the leaf signal whose joined path is ``col_name``.

    Walks the flattened signal tree, ignoring entries that have a subtree,
    and compares each path joined with ``DEFAULT_DELIMITER`` to ``col_name``.

    Raises:
        SignalResolvingError: if no leaf entry matches ``col_name``.
    """
    for path, leaf_type, has_subtree, _depth in self.get_flat_tree():
        if has_subtree:
            # Only leaf entries are candidates for a column match.
            continue
        if DEFAULT_DELIMITER.join(path) == col_name:
            return leaf_type
    raise SignalResolvingError([col_name], "is not found")
|
|
408
|
+
|
|
403
409
|
def db_signals(
|
|
404
410
|
self, name: Optional[str] = None, as_columns=False
|
|
405
411
|
) -> Union[list[str], list[Column]]:
|
|
@@ -490,7 +496,7 @@ class SignalSchema:
|
|
|
490
496
|
new_values[name] = args_map[name]
|
|
491
497
|
else:
|
|
492
498
|
# adding new signal
|
|
493
|
-
new_values
|
|
499
|
+
new_values[name] = sql_to_python(value)
|
|
494
500
|
|
|
495
501
|
return SignalSchema(new_values)
|
|
496
502
|
|
|
@@ -534,12 +540,12 @@ class SignalSchema:
|
|
|
534
540
|
for name, val in values.items()
|
|
535
541
|
}
|
|
536
542
|
|
|
537
|
-
def get_flat_tree(self) -> Iterator[tuple[list[str],
|
|
543
|
+
def get_flat_tree(self) -> Iterator[tuple[list[str], DataType, bool, int]]:
|
|
538
544
|
yield from self._get_flat_tree(self.tree, [], 0)
|
|
539
545
|
|
|
540
546
|
def _get_flat_tree(
|
|
541
547
|
self, tree: dict, prefix: list[str], depth: int
|
|
542
|
-
) -> Iterator[tuple[list[str],
|
|
548
|
+
) -> Iterator[tuple[list[str], DataType, bool, int]]:
|
|
543
549
|
for name, (type_, substree) in tree.items():
|
|
544
550
|
suffix = name.split(".")
|
|
545
551
|
new_prefix = prefix + suffix
|