datachain 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.4.0/src/datachain.egg-info → datachain-0.5.0}/PKG-INFO +1 -1
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/cli.py +3 -2
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/metastore.py +8 -8
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/warehouse.py +1 -3
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/dataset.py +0 -3
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/dc.py +197 -113
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/listing.py +5 -3
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/pytorch.py +5 -1
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/dataset.py +1 -1
- {datachain-0.4.0 → datachain-0.5.0/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.4.0 → datachain-0.5.0}/tests/conftest.py +0 -1
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_catalog.py +5 -2
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_datachain.py +4 -4
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_pull.py +0 -1
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_datachain.py +21 -25
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_datachain_merge.py +1 -1
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_warehouse.py +0 -2
- {datachain-0.4.0 → datachain-0.5.0}/.cruft.json +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.gitattributes +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.github/codecov.yaml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.github/dependabot.yml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.github/workflows/release.yml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.gitignore +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/.pre-commit-config.yaml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/CONTRIBUTING.rst +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/LICENSE +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/README.rst +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/assets/flowchart.png +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/index.md +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/references/datachain.md +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/references/datatype.md +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/references/file.md +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/references/index.md +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/references/sql.md +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/references/torch.md +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/docs/references/udf.md +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/mkdocs.yml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/noxfile.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/overrides/main.html +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/pyproject.toml +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/setup.cfg +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/__main__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/asyn.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/cache.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/cli_utils.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/local.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/config.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/error.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/job.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/file.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/listing.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/node.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/progress.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/py.typed +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/params.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/session.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/storage.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain/utils.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/data.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/examples/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_client.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_listing.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_ls.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_query.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/test_telemetry.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_client.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_query.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_session.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_storage.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.4.0 → datachain-0.5.0}/tests/utils.py +0 -0
|
@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING, Optional, Union
|
|
|
12
12
|
|
|
13
13
|
import shtab
|
|
14
14
|
|
|
15
|
-
from datachain import utils
|
|
15
|
+
from datachain import Session, utils
|
|
16
16
|
from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
|
|
17
17
|
from datachain.lib.dc import DataChain
|
|
18
18
|
from datachain.telemetry import telemetry
|
|
@@ -770,7 +770,8 @@ def show(
|
|
|
770
770
|
show_records(records, collapse_columns=not no_collapse)
|
|
771
771
|
if schema and dataset_version.feature_schema:
|
|
772
772
|
print("\nSchema:")
|
|
773
|
-
|
|
773
|
+
session = Session.get(catalog=catalog)
|
|
774
|
+
dc = DataChain.from_dataset(name=name, version=version, session=session)
|
|
774
775
|
dc.print_schema()
|
|
775
776
|
|
|
776
777
|
|
|
@@ -15,7 +15,6 @@ from uuid import uuid4
|
|
|
15
15
|
from sqlalchemy import (
|
|
16
16
|
JSON,
|
|
17
17
|
BigInteger,
|
|
18
|
-
Boolean,
|
|
19
18
|
Column,
|
|
20
19
|
DateTime,
|
|
21
20
|
ForeignKey,
|
|
@@ -228,7 +227,7 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
228
227
|
self,
|
|
229
228
|
dataset: DatasetRecord,
|
|
230
229
|
version: int,
|
|
231
|
-
status: int
|
|
230
|
+
status: int,
|
|
232
231
|
sources: str = "",
|
|
233
232
|
feature_schema: Optional[dict] = None,
|
|
234
233
|
query_script: str = "",
|
|
@@ -448,7 +447,6 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
448
447
|
Column("name", Text, nullable=False),
|
|
449
448
|
Column("description", Text),
|
|
450
449
|
Column("labels", JSON, nullable=True),
|
|
451
|
-
Column("shadow", Boolean, nullable=False),
|
|
452
450
|
Column("status", Integer, nullable=False),
|
|
453
451
|
Column("feature_schema", JSON, nullable=True),
|
|
454
452
|
Column("created_at", DateTime(timezone=True)),
|
|
@@ -481,8 +479,11 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
481
479
|
nullable=False,
|
|
482
480
|
),
|
|
483
481
|
Column("version", Integer, nullable=False),
|
|
484
|
-
|
|
485
|
-
|
|
482
|
+
Column(
|
|
483
|
+
"status",
|
|
484
|
+
Integer,
|
|
485
|
+
nullable=False,
|
|
486
|
+
),
|
|
486
487
|
Column("feature_schema", JSON, nullable=True),
|
|
487
488
|
Column("created_at", DateTime(timezone=True)),
|
|
488
489
|
Column("finished_at", DateTime(timezone=True)),
|
|
@@ -969,7 +970,6 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
969
970
|
# TODO abstract this method and add registered = True based on kwargs
|
|
970
971
|
query = self._datasets_insert().values(
|
|
971
972
|
name=name,
|
|
972
|
-
shadow=False,
|
|
973
973
|
status=status,
|
|
974
974
|
feature_schema=json.dumps(feature_schema or {}),
|
|
975
975
|
created_at=datetime.now(timezone.utc),
|
|
@@ -992,7 +992,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
992
992
|
self,
|
|
993
993
|
dataset: DatasetRecord,
|
|
994
994
|
version: int,
|
|
995
|
-
status: int
|
|
995
|
+
status: int,
|
|
996
996
|
sources: str = "",
|
|
997
997
|
feature_schema: Optional[dict] = None,
|
|
998
998
|
query_script: str = "",
|
|
@@ -1018,7 +1018,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1018
1018
|
query = self._datasets_versions_insert().values(
|
|
1019
1019
|
dataset_id=dataset.id,
|
|
1020
1020
|
version=version,
|
|
1021
|
-
status=status,
|
|
1021
|
+
status=status,
|
|
1022
1022
|
feature_schema=json.dumps(feature_schema or {}),
|
|
1023
1023
|
created_at=created_at or datetime.now(timezone.utc),
|
|
1024
1024
|
finished_at=finished_at,
|
|
@@ -919,9 +919,7 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
919
919
|
def is_temp_table_name(self, name: str) -> bool:
|
|
920
920
|
"""Returns if the given table name refers to a temporary
|
|
921
921
|
or no longer needed table."""
|
|
922
|
-
return name.startswith(
|
|
923
|
-
(self.TMP_TABLE_NAME_PREFIX, self.UDF_TABLE_NAME_PREFIX, "ds_shadow_")
|
|
924
|
-
) or name.endswith("_shadow")
|
|
922
|
+
return name.startswith((self.TMP_TABLE_NAME_PREFIX, self.UDF_TABLE_NAME_PREFIX))
|
|
925
923
|
|
|
926
924
|
def get_temp_table_names(self) -> list[str]:
|
|
927
925
|
return [
|
|
@@ -267,7 +267,6 @@ class DatasetRecord:
|
|
|
267
267
|
name: str
|
|
268
268
|
description: Optional[str]
|
|
269
269
|
labels: list[str]
|
|
270
|
-
shadow: bool
|
|
271
270
|
schema: dict[str, Union[SQLType, type[SQLType]]]
|
|
272
271
|
feature_schema: dict
|
|
273
272
|
versions: list[DatasetVersion]
|
|
@@ -296,7 +295,6 @@ class DatasetRecord:
|
|
|
296
295
|
name: str,
|
|
297
296
|
description: Optional[str],
|
|
298
297
|
labels: str,
|
|
299
|
-
shadow: int,
|
|
300
298
|
status: int,
|
|
301
299
|
feature_schema: Optional[str],
|
|
302
300
|
created_at: datetime,
|
|
@@ -356,7 +354,6 @@ class DatasetRecord:
|
|
|
356
354
|
name,
|
|
357
355
|
description,
|
|
358
356
|
labels_lst,
|
|
359
|
-
bool(shadow),
|
|
360
357
|
cls.parse_schema(schema_dct), # type: ignore[arg-type]
|
|
361
358
|
json.loads(feature_schema) if feature_schema else {},
|
|
362
359
|
[dataset_version],
|
|
@@ -54,7 +54,6 @@ from datachain.query import Session
|
|
|
54
54
|
from datachain.query.dataset import (
|
|
55
55
|
DatasetQuery,
|
|
56
56
|
PartitionByType,
|
|
57
|
-
detach,
|
|
58
57
|
)
|
|
59
58
|
from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
|
|
60
59
|
from datachain.sql.functions import path as pathfunc
|
|
@@ -159,7 +158,7 @@ class Sys(DataModel):
|
|
|
159
158
|
rand: int
|
|
160
159
|
|
|
161
160
|
|
|
162
|
-
class DataChain
|
|
161
|
+
class DataChain:
|
|
163
162
|
"""DataChain - a data structure for batch data processing and evaluation.
|
|
164
163
|
|
|
165
164
|
It represents a sequence of data manipulation steps such as reading data from
|
|
@@ -238,33 +237,20 @@ class DataChain(DatasetQuery):
|
|
|
238
237
|
"size": 0,
|
|
239
238
|
}
|
|
240
239
|
|
|
241
|
-
def __init__(
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
else:
|
|
256
|
-
self._settings = Settings()
|
|
257
|
-
self._setup: dict = {}
|
|
258
|
-
|
|
259
|
-
self.signals_schema = SignalSchema({"sys": Sys})
|
|
260
|
-
if self.feature_schema:
|
|
261
|
-
self.signals_schema |= SignalSchema.deserialize(self.feature_schema)
|
|
262
|
-
else:
|
|
263
|
-
self.signals_schema |= SignalSchema.from_column_types(
|
|
264
|
-
self.column_types or {}
|
|
265
|
-
)
|
|
266
|
-
|
|
267
|
-
self._sys = False
|
|
240
|
+
def __init__(
|
|
241
|
+
self,
|
|
242
|
+
query: DatasetQuery,
|
|
243
|
+
settings: Settings,
|
|
244
|
+
signal_schema: SignalSchema,
|
|
245
|
+
setup: Optional[dict] = None,
|
|
246
|
+
_sys: bool = False,
|
|
247
|
+
) -> None:
|
|
248
|
+
"""Don't instantiate this directly, use one of the from_XXX constructors."""
|
|
249
|
+
self._query = query
|
|
250
|
+
self._settings = settings
|
|
251
|
+
self.signals_schema = signal_schema
|
|
252
|
+
self._setup: dict = setup or {}
|
|
253
|
+
self._sys = _sys
|
|
268
254
|
|
|
269
255
|
@property
|
|
270
256
|
def schema(self) -> dict[str, DataType]:
|
|
@@ -290,18 +276,55 @@ class DataChain(DatasetQuery):
|
|
|
290
276
|
def c(self, column: Union[str, Column]) -> Column:
|
|
291
277
|
"""Returns Column instance attached to the current chain."""
|
|
292
278
|
c = self.column(column) if isinstance(column, str) else self.column(column.name)
|
|
293
|
-
c.table = self.table
|
|
279
|
+
c.table = self._query.table
|
|
294
280
|
return c
|
|
295
281
|
|
|
282
|
+
@property
|
|
283
|
+
def session(self) -> Session:
|
|
284
|
+
"""Session of the chain."""
|
|
285
|
+
return self._query.session
|
|
286
|
+
|
|
287
|
+
@property
|
|
288
|
+
def name(self) -> Optional[str]:
|
|
289
|
+
"""Name of the underlying dataset, if there is one."""
|
|
290
|
+
return self._query.name
|
|
291
|
+
|
|
292
|
+
@property
|
|
293
|
+
def version(self) -> Optional[int]:
|
|
294
|
+
"""Version of the underlying dataset, if there is one."""
|
|
295
|
+
return self._query.version
|
|
296
|
+
|
|
297
|
+
def __or__(self, other: "Self") -> "Self":
|
|
298
|
+
"""Return `self.union(other)`."""
|
|
299
|
+
return self.union(other)
|
|
300
|
+
|
|
296
301
|
def print_schema(self) -> None:
|
|
297
302
|
"""Print schema of the chain."""
|
|
298
303
|
self._effective_signals_schema.print_tree()
|
|
299
304
|
|
|
300
|
-
def clone(self
|
|
305
|
+
def clone(self) -> "Self":
|
|
301
306
|
"""Make a copy of the chain in a new table."""
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
307
|
+
return self._evolve(query=self._query.clone(new_table=True))
|
|
308
|
+
|
|
309
|
+
def _evolve(
|
|
310
|
+
self,
|
|
311
|
+
*,
|
|
312
|
+
query: Optional[DatasetQuery] = None,
|
|
313
|
+
settings: Optional[Settings] = None,
|
|
314
|
+
signal_schema=None,
|
|
315
|
+
_sys=None,
|
|
316
|
+
) -> "Self":
|
|
317
|
+
if query is None:
|
|
318
|
+
query = self._query.clone(new_table=False)
|
|
319
|
+
if settings is None:
|
|
320
|
+
settings = self._settings
|
|
321
|
+
if signal_schema is None:
|
|
322
|
+
signal_schema = copy.deepcopy(self.signals_schema)
|
|
323
|
+
if _sys is None:
|
|
324
|
+
_sys = self._sys
|
|
325
|
+
return type(self)(
|
|
326
|
+
query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys
|
|
327
|
+
)
|
|
305
328
|
|
|
306
329
|
def settings(
|
|
307
330
|
self,
|
|
@@ -332,11 +355,11 @@ class DataChain(DatasetQuery):
|
|
|
332
355
|
)
|
|
333
356
|
```
|
|
334
357
|
"""
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
return
|
|
358
|
+
if sys is None:
|
|
359
|
+
sys = self._sys
|
|
360
|
+
settings = copy.copy(self._settings)
|
|
361
|
+
settings.add(Settings(cache, parallel, workers, min_task_size))
|
|
362
|
+
return self._evolve(settings=settings, _sys=sys)
|
|
340
363
|
|
|
341
364
|
def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
|
|
342
365
|
"""Reset all settings to default values."""
|
|
@@ -434,7 +457,7 @@ class DataChain(DatasetQuery):
|
|
|
434
457
|
version: Optional[int] = None,
|
|
435
458
|
session: Optional[Session] = None,
|
|
436
459
|
settings: Optional[dict] = None,
|
|
437
|
-
) -> "
|
|
460
|
+
) -> "Self":
|
|
438
461
|
"""Get data from a saved Dataset. It returns the chain itself.
|
|
439
462
|
|
|
440
463
|
Parameters:
|
|
@@ -446,7 +469,24 @@ class DataChain(DatasetQuery):
|
|
|
446
469
|
chain = DataChain.from_dataset("my_cats")
|
|
447
470
|
```
|
|
448
471
|
"""
|
|
449
|
-
|
|
472
|
+
query = DatasetQuery(
|
|
473
|
+
name=name,
|
|
474
|
+
version=version,
|
|
475
|
+
session=session,
|
|
476
|
+
indexing_column_types=File._datachain_column_types,
|
|
477
|
+
)
|
|
478
|
+
telemetry.send_event_once("class", "datachain_init", name=name, version=version)
|
|
479
|
+
if settings:
|
|
480
|
+
_settings = Settings(**settings)
|
|
481
|
+
else:
|
|
482
|
+
_settings = Settings()
|
|
483
|
+
|
|
484
|
+
signals_schema = SignalSchema({"sys": Sys})
|
|
485
|
+
if query.feature_schema:
|
|
486
|
+
signals_schema |= SignalSchema.deserialize(query.feature_schema)
|
|
487
|
+
else:
|
|
488
|
+
signals_schema |= SignalSchema.from_column_types(query.column_types or {})
|
|
489
|
+
return cls(query, _settings, signals_schema)
|
|
450
490
|
|
|
451
491
|
@classmethod
|
|
452
492
|
def from_json(
|
|
@@ -699,7 +739,11 @@ class DataChain(DatasetQuery):
|
|
|
699
739
|
version : version of a dataset. Default - the last version that exist.
|
|
700
740
|
"""
|
|
701
741
|
schema = self.signals_schema.clone_without_sys_signals().serialize()
|
|
702
|
-
return
|
|
742
|
+
return self._evolve(
|
|
743
|
+
query=self._query.save(
|
|
744
|
+
name=name, version=version, feature_schema=schema, **kwargs
|
|
745
|
+
)
|
|
746
|
+
)
|
|
703
747
|
|
|
704
748
|
def apply(self, func, *args, **kwargs):
|
|
705
749
|
"""Apply any function to the chain.
|
|
@@ -765,13 +809,14 @@ class DataChain(DatasetQuery):
|
|
|
765
809
|
"""
|
|
766
810
|
udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)
|
|
767
811
|
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
812
|
+
return self._evolve(
|
|
813
|
+
query=self._query.add_signals(
|
|
814
|
+
udf_obj.to_udf_wrapper(),
|
|
815
|
+
**self._settings.to_dict(),
|
|
816
|
+
),
|
|
817
|
+
signal_schema=self.signals_schema | udf_obj.output,
|
|
771
818
|
)
|
|
772
819
|
|
|
773
|
-
return chain.add_schema(udf_obj.output).reset_settings(self._settings)
|
|
774
|
-
|
|
775
820
|
def gen(
|
|
776
821
|
self,
|
|
777
822
|
func: Optional[Callable] = None,
|
|
@@ -800,14 +845,14 @@ class DataChain(DatasetQuery):
|
|
|
800
845
|
```
|
|
801
846
|
"""
|
|
802
847
|
udf_obj = self._udf_to_obj(Generator, func, params, output, signal_map)
|
|
803
|
-
|
|
804
|
-
self
|
|
805
|
-
|
|
806
|
-
|
|
848
|
+
return self._evolve(
|
|
849
|
+
query=self._query.generate(
|
|
850
|
+
udf_obj.to_udf_wrapper(),
|
|
851
|
+
**self._settings.to_dict(),
|
|
852
|
+
),
|
|
853
|
+
signal_schema=udf_obj.output,
|
|
807
854
|
)
|
|
808
855
|
|
|
809
|
-
return chain.reset_schema(udf_obj.output).reset_settings(self._settings)
|
|
810
|
-
|
|
811
856
|
def agg(
|
|
812
857
|
self,
|
|
813
858
|
func: Optional[Callable] = None,
|
|
@@ -840,15 +885,15 @@ class DataChain(DatasetQuery):
|
|
|
840
885
|
```
|
|
841
886
|
"""
|
|
842
887
|
udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
|
|
843
|
-
|
|
844
|
-
self
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
888
|
+
return self._evolve(
|
|
889
|
+
query=self._query.generate(
|
|
890
|
+
udf_obj.to_udf_wrapper(),
|
|
891
|
+
partition_by=partition_by,
|
|
892
|
+
**self._settings.to_dict(),
|
|
893
|
+
),
|
|
894
|
+
signal_schema=udf_obj.output,
|
|
848
895
|
)
|
|
849
896
|
|
|
850
|
-
return chain.reset_schema(udf_obj.output).reset_settings(self._settings)
|
|
851
|
-
|
|
852
897
|
def batch_map(
|
|
853
898
|
self,
|
|
854
899
|
func: Optional[Callable] = None,
|
|
@@ -876,14 +921,14 @@ class DataChain(DatasetQuery):
|
|
|
876
921
|
```
|
|
877
922
|
"""
|
|
878
923
|
udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
|
|
879
|
-
|
|
880
|
-
self
|
|
881
|
-
|
|
882
|
-
|
|
924
|
+
return self._evolve(
|
|
925
|
+
query=self._query.add_signals(
|
|
926
|
+
udf_obj.to_udf_wrapper(batch),
|
|
927
|
+
**self._settings.to_dict(),
|
|
928
|
+
),
|
|
929
|
+
signal_schema=self.signals_schema | udf_obj.output,
|
|
883
930
|
)
|
|
884
931
|
|
|
885
|
-
return chain.add_schema(udf_obj.output).reset_settings(self._settings)
|
|
886
|
-
|
|
887
932
|
def _udf_to_obj(
|
|
888
933
|
self,
|
|
889
934
|
target_class: type[UDFBase],
|
|
@@ -907,17 +952,12 @@ class DataChain(DatasetQuery):
|
|
|
907
952
|
return target_class._create(sign, params_schema)
|
|
908
953
|
|
|
909
954
|
def _extend_to_data_model(self, method_name, *args, **kwargs):
|
|
910
|
-
|
|
955
|
+
query_func = getattr(self._query, method_name)
|
|
911
956
|
|
|
912
957
|
new_schema = self.signals_schema.resolve(*args)
|
|
913
958
|
columns = [C(col) for col in new_schema.db_signals()]
|
|
914
|
-
|
|
915
|
-
if isinstance(res, DataChain):
|
|
916
|
-
res.signals_schema = new_schema
|
|
917
|
-
|
|
918
|
-
return res
|
|
959
|
+
return query_func(*columns, **kwargs)
|
|
919
960
|
|
|
920
|
-
@detach
|
|
921
961
|
@resolve_columns
|
|
922
962
|
def order_by(self, *args, descending: bool = False) -> "Self":
|
|
923
963
|
"""Orders by specified set of signals.
|
|
@@ -928,9 +968,8 @@ class DataChain(DatasetQuery):
|
|
|
928
968
|
if descending:
|
|
929
969
|
args = tuple(sqlalchemy.desc(a) for a in args)
|
|
930
970
|
|
|
931
|
-
return
|
|
971
|
+
return self._evolve(query=self._query.order_by(*args))
|
|
932
972
|
|
|
933
|
-
@detach
|
|
934
973
|
def distinct(self, arg: str, *args: str) -> "Self": # type: ignore[override]
|
|
935
974
|
"""Removes duplicate rows based on uniqueness of some input column(s)
|
|
936
975
|
i.e if rows are found with the same value of input column(s), only one
|
|
@@ -942,29 +981,30 @@ class DataChain(DatasetQuery):
|
|
|
942
981
|
)
|
|
943
982
|
```
|
|
944
983
|
"""
|
|
945
|
-
return
|
|
984
|
+
return self._evolve(
|
|
985
|
+
query=self._query.distinct(
|
|
986
|
+
*self.signals_schema.resolve(arg, *args).db_signals()
|
|
987
|
+
)
|
|
988
|
+
)
|
|
946
989
|
|
|
947
|
-
@detach
|
|
948
990
|
def select(self, *args: str, _sys: bool = True) -> "Self":
|
|
949
991
|
"""Select only a specified set of signals."""
|
|
950
992
|
new_schema = self.signals_schema.resolve(*args)
|
|
951
993
|
if _sys:
|
|
952
994
|
new_schema = SignalSchema({"sys": Sys}) | new_schema
|
|
953
995
|
columns = new_schema.db_signals()
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
996
|
+
return self._evolve(
|
|
997
|
+
query=self._query.select(*columns), signal_schema=new_schema
|
|
998
|
+
)
|
|
957
999
|
|
|
958
|
-
@detach
|
|
959
1000
|
def select_except(self, *args: str) -> "Self":
|
|
960
1001
|
"""Select all the signals except the specified signals."""
|
|
961
1002
|
new_schema = self.signals_schema.select_except_signals(*args)
|
|
962
1003
|
columns = new_schema.db_signals()
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
1004
|
+
return self._evolve(
|
|
1005
|
+
query=self._query.select(*columns), signal_schema=new_schema
|
|
1006
|
+
)
|
|
966
1007
|
|
|
967
|
-
@detach
|
|
968
1008
|
def mutate(self, **kwargs) -> "Self":
|
|
969
1009
|
"""Create new signals based on existing signals.
|
|
970
1010
|
|
|
@@ -1029,9 +1069,9 @@ class DataChain(DatasetQuery):
|
|
|
1029
1069
|
# adding new signal
|
|
1030
1070
|
mutated[name] = value
|
|
1031
1071
|
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1072
|
+
return self._evolve(
|
|
1073
|
+
query=self._query.mutate(**mutated), signal_schema=schema.mutate(kwargs)
|
|
1074
|
+
)
|
|
1035
1075
|
|
|
1036
1076
|
@property
|
|
1037
1077
|
def _effective_signals_schema(self) -> "SignalSchema":
|
|
@@ -1058,7 +1098,7 @@ class DataChain(DatasetQuery):
|
|
|
1058
1098
|
a tuple of row values.
|
|
1059
1099
|
"""
|
|
1060
1100
|
db_signals = self._effective_signals_schema.db_signals()
|
|
1061
|
-
with
|
|
1101
|
+
with self._query.select(*db_signals).as_iterable() as rows:
|
|
1062
1102
|
if row_factory:
|
|
1063
1103
|
rows = (row_factory(db_signals, r) for r in rows)
|
|
1064
1104
|
yield from rows
|
|
@@ -1126,7 +1166,7 @@ class DataChain(DatasetQuery):
|
|
|
1126
1166
|
chain = self.select(*cols) if cols else self
|
|
1127
1167
|
signals_schema = chain._effective_signals_schema
|
|
1128
1168
|
db_signals = signals_schema.db_signals()
|
|
1129
|
-
with
|
|
1169
|
+
with self._query.select(*db_signals).as_iterable() as rows:
|
|
1130
1170
|
for row in rows:
|
|
1131
1171
|
ret = signals_schema.row_to_features(
|
|
1132
1172
|
row, catalog=chain.session.catalog, cache=chain._settings.cache
|
|
@@ -1156,7 +1196,7 @@ class DataChain(DatasetQuery):
|
|
|
1156
1196
|
"""
|
|
1157
1197
|
from datachain.torch import PytorchDataset
|
|
1158
1198
|
|
|
1159
|
-
if self.attached:
|
|
1199
|
+
if self._query.attached:
|
|
1160
1200
|
chain = self
|
|
1161
1201
|
else:
|
|
1162
1202
|
chain = self.save()
|
|
@@ -1164,7 +1204,7 @@ class DataChain(DatasetQuery):
|
|
|
1164
1204
|
return PytorchDataset(
|
|
1165
1205
|
chain.name,
|
|
1166
1206
|
chain.version,
|
|
1167
|
-
catalog=self.catalog,
|
|
1207
|
+
catalog=self.session.catalog,
|
|
1168
1208
|
transform=transform,
|
|
1169
1209
|
tokenizer=tokenizer,
|
|
1170
1210
|
tokenizer_kwargs=tokenizer_kwargs,
|
|
@@ -1175,7 +1215,6 @@ class DataChain(DatasetQuery):
|
|
|
1175
1215
|
schema = self.signals_schema.clone_without_file_signals()
|
|
1176
1216
|
return self.select(*schema.values.keys())
|
|
1177
1217
|
|
|
1178
|
-
@detach
|
|
1179
1218
|
def merge(
|
|
1180
1219
|
self,
|
|
1181
1220
|
right_ds: "DataChain",
|
|
@@ -1240,7 +1279,7 @@ class DataChain(DatasetQuery):
|
|
|
1240
1279
|
)
|
|
1241
1280
|
|
|
1242
1281
|
if self == right_ds:
|
|
1243
|
-
right_ds = right_ds.clone(
|
|
1282
|
+
right_ds = right_ds.clone()
|
|
1244
1283
|
|
|
1245
1284
|
errors = []
|
|
1246
1285
|
|
|
@@ -1266,9 +1305,11 @@ class DataChain(DatasetQuery):
|
|
|
1266
1305
|
on, right_on, f"Could not resolve {', '.join(errors)}"
|
|
1267
1306
|
)
|
|
1268
1307
|
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1308
|
+
query = self._query.join(
|
|
1309
|
+
right_ds._query, sqlalchemy.and_(*ops), inner, rname + "{name}"
|
|
1310
|
+
)
|
|
1311
|
+
query.feature_schema = None
|
|
1312
|
+
ds = self._evolve(query=query)
|
|
1272
1313
|
|
|
1273
1314
|
signals_schema = self.signals_schema.clone_without_sys_signals()
|
|
1274
1315
|
right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
|
|
@@ -1278,6 +1319,14 @@ class DataChain(DatasetQuery):
|
|
|
1278
1319
|
|
|
1279
1320
|
return ds
|
|
1280
1321
|
|
|
1322
|
+
def union(self, other: "Self") -> "Self":
|
|
1323
|
+
"""Return the set union of the two datasets.
|
|
1324
|
+
|
|
1325
|
+
Parameters:
|
|
1326
|
+
other: chain whose rows will be added to `self`.
|
|
1327
|
+
"""
|
|
1328
|
+
return self._evolve(query=self._query.union(other._query))
|
|
1329
|
+
|
|
1281
1330
|
def subtract( # type: ignore[override]
|
|
1282
1331
|
self,
|
|
1283
1332
|
other: "DataChain",
|
|
@@ -1341,7 +1390,7 @@ class DataChain(DatasetQuery):
|
|
|
1341
1390
|
other.signals_schema.resolve(*right_on).db_signals(),
|
|
1342
1391
|
) # type: ignore[arg-type]
|
|
1343
1392
|
)
|
|
1344
|
-
return
|
|
1393
|
+
return self._evolve(query=self._query.subtract(other._query, signals)) # type: ignore[arg-type]
|
|
1345
1394
|
|
|
1346
1395
|
@classmethod
|
|
1347
1396
|
def from_values(
|
|
@@ -1449,7 +1498,7 @@ class DataChain(DatasetQuery):
|
|
|
1449
1498
|
transpose : Whether to transpose rows and columns.
|
|
1450
1499
|
truncate : Whether or not to truncate the contents of columns.
|
|
1451
1500
|
"""
|
|
1452
|
-
dc = self.limit(limit) if limit > 0 else self
|
|
1501
|
+
dc = self.limit(limit) if limit > 0 else self # type: ignore[misc]
|
|
1453
1502
|
df = dc.to_pandas(flatten)
|
|
1454
1503
|
|
|
1455
1504
|
if df.empty:
|
|
@@ -1782,7 +1831,7 @@ class DataChain(DatasetQuery):
|
|
|
1782
1831
|
settings: Optional[dict] = None,
|
|
1783
1832
|
in_memory: bool = False,
|
|
1784
1833
|
schema: Optional[dict[str, DataType]] = None,
|
|
1785
|
-
) -> "
|
|
1834
|
+
) -> "Self":
|
|
1786
1835
|
"""Create a DataChain from the provided records. This method can be used for
|
|
1787
1836
|
programmatically generating a chain in contrast of reading data from storages
|
|
1788
1837
|
or other sources.
|
|
@@ -1837,7 +1886,7 @@ class DataChain(DatasetQuery):
|
|
|
1837
1886
|
insert_q = dr.get_table().insert()
|
|
1838
1887
|
for record in to_insert:
|
|
1839
1888
|
db.execute(insert_q.values(**record))
|
|
1840
|
-
return
|
|
1889
|
+
return cls.from_dataset(name=dsr.name, session=session, settings=settings)
|
|
1841
1890
|
|
|
1842
1891
|
def sum(self, fr: DataType): # type: ignore[override]
|
|
1843
1892
|
"""Compute the sum of a column."""
|
|
@@ -1898,8 +1947,8 @@ class DataChain(DatasetQuery):
|
|
|
1898
1947
|
) -> None:
|
|
1899
1948
|
"""Method that exports all files from chain to some folder."""
|
|
1900
1949
|
if placement == "filename" and (
|
|
1901
|
-
|
|
1902
|
-
!= self.count()
|
|
1950
|
+
self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
|
|
1951
|
+
!= self._query.count()
|
|
1903
1952
|
):
|
|
1904
1953
|
raise ValueError("Files with the same name found")
|
|
1905
1954
|
|
|
@@ -1919,10 +1968,9 @@ class DataChain(DatasetQuery):
|
|
|
1919
1968
|
NOTE: Samples are not deterministic, and streamed/paginated queries or
|
|
1920
1969
|
multiple workers will draw samples with replacement.
|
|
1921
1970
|
"""
|
|
1922
|
-
return
|
|
1971
|
+
return self._evolve(query=self._query.sample(n))
|
|
1923
1972
|
|
|
1924
|
-
|
|
1925
|
-
def filter(self, *args) -> "Self":
|
|
1973
|
+
def filter(self, *args: Any) -> "Self":
|
|
1926
1974
|
"""Filter the chain according to conditions.
|
|
1927
1975
|
|
|
1928
1976
|
Example:
|
|
@@ -1955,14 +2003,50 @@ class DataChain(DatasetQuery):
|
|
|
1955
2003
|
)
|
|
1956
2004
|
```
|
|
1957
2005
|
"""
|
|
1958
|
-
return
|
|
2006
|
+
return self._evolve(query=self._query.filter(*args))
|
|
1959
2007
|
|
|
1960
|
-
@detach
|
|
1961
2008
|
def limit(self, n: int) -> "Self":
|
|
1962
|
-
"""Return the first n rows of the chain.
|
|
1963
|
-
|
|
2009
|
+
"""Return the first `n` rows of the chain.
|
|
2010
|
+
|
|
2011
|
+
If the chain is unordered, which rows are returned is undefined.
|
|
2012
|
+
If the chain has less than `n` rows, the whole chain is returned.
|
|
2013
|
+
|
|
2014
|
+
Parameters:
|
|
2015
|
+
n (int): Number of rows to return.
|
|
2016
|
+
"""
|
|
2017
|
+
return self._evolve(query=self._query.limit(n))
|
|
1964
2018
|
|
|
1965
|
-
@detach
|
|
1966
2019
|
def offset(self, offset: int) -> "Self":
|
|
1967
|
-
"""Return the results starting with the offset row.
|
|
1968
|
-
|
|
2020
|
+
"""Return the results starting with the offset row.
|
|
2021
|
+
|
|
2022
|
+
If the chain is unordered, which rows are skipped is undefined.
|
|
2023
|
+
If the chain has less than `offset` rows, the result is an empty chain.
|
|
2024
|
+
|
|
2025
|
+
Parameters:
|
|
2026
|
+
offset (int): Number of rows to skip.
|
|
2027
|
+
"""
|
|
2028
|
+
return self._evolve(query=self._query.offset(offset))
|
|
2029
|
+
|
|
2030
|
+
def count(self) -> int:
|
|
2031
|
+
"""Return the number of rows in the chain."""
|
|
2032
|
+
return self._query.count()
|
|
2033
|
+
|
|
2034
|
+
def exec(self) -> "Self":
|
|
2035
|
+
"""Execute the chain."""
|
|
2036
|
+
return self._evolve(query=self._query.exec())
|
|
2037
|
+
|
|
2038
|
+
def chunk(self, index: int, total: int) -> "Self":
|
|
2039
|
+
"""Split a chain into smaller chunks for e.g. parallelization.
|
|
2040
|
+
|
|
2041
|
+
Example:
|
|
2042
|
+
```py
|
|
2043
|
+
chain = DataChain.from_storage(...)
|
|
2044
|
+
chunk_1 = chain.chunk(0, 2)
|
|
2045
|
+
chunk_2 = chain.chunk(1, 2)
|
|
2046
|
+
```
|
|
2047
|
+
|
|
2048
|
+
Note:
|
|
2049
|
+
Bear in mind that `index` is 0-indexed but `total` isn't.
|
|
2050
|
+
Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
|
|
2051
|
+
"""
|
|
2052
|
+
return self._evolve(query=self._query.chunk(index, total))
|