datachain 0.6.8__tar.gz → 0.6.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- {datachain-0.6.8 → datachain-0.6.10}/.pre-commit-config.yaml +1 -1
- {datachain-0.6.8/src/datachain.egg-info → datachain-0.6.10}/PKG-INFO +42 -22
- {datachain-0.6.8 → datachain-0.6.10}/README.rst +40 -20
- {datachain-0.6.8 → datachain-0.6.10}/mkdocs.yml +1 -1
- {datachain-0.6.8 → datachain-0.6.10}/pyproject.toml +1 -1
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/catalog/catalog.py +20 -3
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/client/fsspec.py +1 -1
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/data_storage/metastore.py +4 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/data_storage/sqlite.py +6 -2
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/dataset.py +5 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/dataset_info.py +3 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/dc.py +79 -6
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/meta_formats.py +1 -0
- datachain-0.6.10/src/datachain/lib/models/__init__.py +6 -0
- datachain-0.6.10/src/datachain/lib/models/bbox.py +116 -0
- datachain-0.6.10/src/datachain/lib/models/pose.py +108 -0
- datachain-0.6.10/src/datachain/lib/models/segment.py +53 -0
- datachain-0.6.10/src/datachain/lib/models/ultralytics/__init__.py +14 -0
- datachain-0.6.10/src/datachain/lib/models/ultralytics/bbox.py +189 -0
- datachain-0.6.10/src/datachain/lib/models/ultralytics/pose.py +126 -0
- datachain-0.6.10/src/datachain/lib/models/ultralytics/segment.py +121 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/signal_schema.py +1 -1
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/listing.py +24 -7
- datachain-0.6.10/src/datachain/toolkit/__init__.py +3 -0
- datachain-0.6.10/src/datachain/toolkit/split.py +67 -0
- {datachain-0.6.8 → datachain-0.6.10/src/datachain.egg-info}/PKG-INFO +42 -22
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain.egg-info/SOURCES.txt +9 -3
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.6.8 → datachain-0.6.10}/tests/conftest.py +41 -1
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_dataset_query.py +66 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_datasets.py +4 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_pull.py +37 -6
- datachain-0.6.10/tests/func/test_toolkit.py +42 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_datachain.py +42 -0
- datachain-0.6.10/tests/unit/lib/test_models.py +142 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_listing.py +2 -1
- datachain-0.6.8/docs/assets/flowchart.png +0 -0
- datachain-0.6.8/src/datachain/lib/models/__init__.py +0 -5
- datachain-0.6.8/src/datachain/lib/models/bbox.py +0 -45
- datachain-0.6.8/src/datachain/lib/models/pose.py +0 -37
- datachain-0.6.8/src/datachain/lib/models/yolo.py +0 -39
- datachain-0.6.8/tests/unit/lib/test_models.py +0 -50
- {datachain-0.6.8 → datachain-0.6.10}/.cruft.json +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.gitattributes +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.github/codecov.yaml +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.github/dependabot.yml +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.github/workflows/release.yml +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.github/workflows/tests.yml +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/.gitignore +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/CONTRIBUTING.rst +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/LICENSE +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/docs/assets/datachain.svg +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/docs/index.md +0 -0
- {datachain-0.6.8 → datachain-0.6.10/docs}/overrides/main.html +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/docs/references/datachain.md +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/docs/references/datatype.md +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/docs/references/file.md +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/docs/references/index.md +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/docs/references/sql.md +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/docs/references/torch.md +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/docs/references/udf.md +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/multimodal/wds.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/noxfile.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/setup.cfg +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/__main__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/asyn.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/cache.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/cli.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/cli_utils.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/client/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/client/azure.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/client/gcs.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/client/hf.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/client/local.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/client/s3.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/config.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/error.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/job.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/clip.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/file.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/func/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/func/aggregate.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/func/func.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/hf.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/image.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/listing.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/settings.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/tar.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/text.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/udf.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/utils.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/node.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/progress.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/py.typed +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/query/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/query/batch.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/query/dataset.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/query/metrics.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/query/params.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/query/queue.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/query/schema.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/query/session.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/remote/studio.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/types.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/sql/utils.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/studio.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/telemetry.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain/utils.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/data.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/examples/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/examples/test_examples.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/examples/wds_data.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_catalog.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_client.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_datachain.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_listing.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_ls.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_metrics.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_pytorch.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/func/test_query.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/scripts/feature_class.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/test_atomicity.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/test_cli_e2e.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/test_cli_studio.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/test_query_e2e.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/test_telemetry.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_asyn.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_cache.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_catalog.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_client.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_config.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_dataset.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_metastore.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_query.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_query_params.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_serializer.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_session.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_utils.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.6.8 → datachain-0.6.10}/tests/utils.py +0 -0
{datachain-0.6.8/src/datachain.egg-info → datachain-0.6.10}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.6.8
+Version: 0.6.10
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -71,7 +71,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
-Requires-Dist: pytest-servers[all]>=0.5.
+Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
@@ -120,33 +120,41 @@ Requires-Dist: onnx==1.16.1; extra == "examples"
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests

-DataChain is a
-
-
+DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
+data like images, audio, videos, text and PDFs. It integrates with external storage
+(e.g., S3) to process data efficiently without data duplication and manages metadata
+in an internal database for easy and efficient querying.
+
+
+Use Cases
+=========
+
+1. **Multimodal Dataset Preparation and Curation**: ideal for organizing and
+   refining data in pre-training, finetuning or LLM evaluating stages.
+2. **GenAI Data Analytics**: Enables advanced analytics for multimodal data and
+   ad-hoc analytics using LLMs.

 Key Features
 ============

-📂 **
--
-  file systems.
-- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+📂 **Multimodal Dataset Versioning.**
+- Version unstructured data without redundant data copies, by supporitng
+  references to S3, GCP, Azure, and local file systems.
+- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
 - Unite files and metadata together into persistent, versioned, columnar datasets.

-🐍 **Python-friendly
-- Operate on Python objects and object fields
-
+🐍 **Python-friendly.**
+- Operate on Python objects and object fields: float scores, strings, matrixes,
+  LLM response objects.
+- Run Python code in a high-scale, terabytes size datasets, with built-in
+  parallelization and memory-efficient computing — no SQL or Spark required.

 🧠 **Data Enrichment and Processing.**
 - Generate metadata using local AI models and LLM APIs.
-- Filter, join, and group by metadata. Search by vector embeddings.
+- Filter, join, and group datasets by metadata. Search by vector embeddings.
+- High-performance vectorized operations on Python objects: sum, count, avg, etc.
 - Pass datasets to Pytorch and Tensorflow, or export them back into storage.

-🚀 **Efficiency.**
-- Parallelization, out-of-memory workloads and data caching.
-- Vectorized operations on Python object fields: sum, count, avg, etc.
-- Optimized vector search.
-

 Quick Start
 -----------
@@ -196,7 +204,7 @@ Batch inference with a simple sentiment model using the `transformers` library:

    pip install transformers

-The code below downloads files the cloud, and applies a user-defined function
+The code below downloads files from the cloud, and applies a user-defined function
 to each one of them. All files with a positive sentiment
 detected are then copied to the local directory.

@@ -429,6 +437,19 @@ name suffix, the following code will do it:
     loader = DataLoader(chain, batch_size=1)


+DataChain Studio Platform
+-------------------------
+
+`DataChain Studio`_ is a proprietary solution for teams that offers:
+
+- **Centralized dataset registry** to manage data, code and dependency
+  dependencies in one place.
+- **Data Lineage** for data sources as well as direvative dataset.
+- **UI for Multimodal Data** like images, videos, and PDFs.
+- **Scalable Compute** to handle large datasets (100M+ files) and in-house
+  AI model inference.
+- **Access control** including SSO and team based collaboration.
+
 Tutorials
 ---------

@@ -462,6 +483,5 @@ Community and Support
 .. _Pydantic: https://github.com/pydantic/pydantic
 .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
 .. _SQLite: https://www.sqlite.org/
-.. _Getting Started: https://datachain.
-..
-   :alt: DataChain FlowChart
+.. _Getting Started: https://docs.datachain.ai/
+.. _DataChain Studio: https://studio.datachain.ai/
{datachain-0.6.8 → datachain-0.6.10}/README.rst

@@ -19,33 +19,41 @@
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests

-DataChain is a
-
-
+DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
+data like images, audio, videos, text and PDFs. It integrates with external storage
+(e.g., S3) to process data efficiently without data duplication and manages metadata
+in an internal database for easy and efficient querying.
+
+
+Use Cases
+=========
+
+1. **Multimodal Dataset Preparation and Curation**: ideal for organizing and
+   refining data in pre-training, finetuning or LLM evaluating stages.
+2. **GenAI Data Analytics**: Enables advanced analytics for multimodal data and
+   ad-hoc analytics using LLMs.

 Key Features
 ============

-📂 **
--
-  file systems.
-- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+📂 **Multimodal Dataset Versioning.**
+- Version unstructured data without redundant data copies, by supporitng
+  references to S3, GCP, Azure, and local file systems.
+- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
 - Unite files and metadata together into persistent, versioned, columnar datasets.

-🐍 **Python-friendly
-- Operate on Python objects and object fields
-
+🐍 **Python-friendly.**
+- Operate on Python objects and object fields: float scores, strings, matrixes,
+  LLM response objects.
+- Run Python code in a high-scale, terabytes size datasets, with built-in
+  parallelization and memory-efficient computing — no SQL or Spark required.

 🧠 **Data Enrichment and Processing.**
 - Generate metadata using local AI models and LLM APIs.
-- Filter, join, and group by metadata. Search by vector embeddings.
+- Filter, join, and group datasets by metadata. Search by vector embeddings.
+- High-performance vectorized operations on Python objects: sum, count, avg, etc.
 - Pass datasets to Pytorch and Tensorflow, or export them back into storage.

-🚀 **Efficiency.**
-- Parallelization, out-of-memory workloads and data caching.
-- Vectorized operations on Python object fields: sum, count, avg, etc.
-- Optimized vector search.
-

 Quick Start
 -----------
@@ -95,7 +103,7 @@ Batch inference with a simple sentiment model using the `transformers` library:

    pip install transformers

-The code below downloads files the cloud, and applies a user-defined function
+The code below downloads files from the cloud, and applies a user-defined function
 to each one of them. All files with a positive sentiment
 detected are then copied to the local directory.

@@ -328,6 +336,19 @@ name suffix, the following code will do it:
     loader = DataLoader(chain, batch_size=1)


+DataChain Studio Platform
+-------------------------
+
+`DataChain Studio`_ is a proprietary solution for teams that offers:
+
+- **Centralized dataset registry** to manage data, code and dependency
+  dependencies in one place.
+- **Data Lineage** for data sources as well as direvative dataset.
+- **UI for Multimodal Data** like images, videos, and PDFs.
+- **Scalable Compute** to handle large datasets (100M+ files) and in-house
+  AI model inference.
+- **Access control** including SSO and team based collaboration.
+
 Tutorials
 ---------

@@ -361,6 +382,5 @@ Community and Support
 .. _Pydantic: https://github.com/pydantic/pydantic
 .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
 .. _SQLite: https://www.sqlite.org/
-.. _Getting Started: https://datachain.
-..
-   :alt: DataChain FlowChart
+.. _Getting Started: https://docs.datachain.ai/
+.. _DataChain Studio: https://studio.datachain.ai/
{datachain-0.6.8 → datachain-0.6.10}/src/datachain/catalog/catalog.py

@@ -603,9 +603,10 @@ class Catalog:
         )

         lst = Listing(
+            self.metastore.clone(),
             self.warehouse.clone(),
             Client.get_client(list_uri, self.cache, **self.client_config),
-
+            dataset_name=list_ds_name,
             object_name=object_name,
         )

@@ -698,9 +699,13 @@ class Catalog:

         client = self.get_client(source, **client_config)
         uri = client.uri
-        st = self.warehouse.clone()
         dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
-        listing = Listing(
+        listing = Listing(
+            self.metastore.clone(),
+            self.warehouse.clone(),
+            client,
+            dataset_name=dataset_name,
+        )
         rows = DatasetQuery(
             name=dataset.name, version=ds_version, catalog=self
         ).to_db_records()
@@ -769,6 +774,7 @@ class Catalog:
         create_rows: Optional[bool] = True,
         validate_version: Optional[bool] = True,
         listing: Optional[bool] = False,
+        uuid: Optional[str] = None,
     ) -> "DatasetRecord":
         """
         Creates new dataset of a specific version.
@@ -816,6 +822,7 @@ class Catalog:
             query_script=query_script,
             create_rows_table=create_rows,
             columns=columns,
+            uuid=uuid,
         )

     def create_new_dataset_version(
@@ -832,6 +839,7 @@ class Catalog:
         script_output="",
         create_rows_table=True,
         job_id: Optional[str] = None,
+        uuid: Optional[str] = None,
     ) -> DatasetRecord:
         """
         Creates dataset version if it doesn't exist.
@@ -855,6 +863,7 @@ class Catalog:
             schema=schema,
             job_id=job_id,
             ignore_if_exists=True,
+            uuid=uuid,
         )

         if create_rows_table:
@@ -1350,6 +1359,13 @@ class Catalog:
             # we will create new one if it doesn't exist
             pass

+        if dataset and version and dataset.has_version(version):
+            """No need to communicate with Studio at all"""
+            dataset_uri = create_dataset_uri(remote_dataset_name, version)
+            print(f"Local copy of dataset {dataset_uri} already present")
+            _instantiate_dataset()
+            return
+
         remote_dataset = self.get_remote_dataset(remote_dataset_name)
         # if version is not specified in uri, take the latest one
         if not version:
@@ -1400,6 +1416,7 @@ class Catalog:
             columns=columns,
             feature_schema=remote_dataset_version.feature_schema,
             validate_version=False,
+            uuid=remote_dataset_version.uuid,
         )

         # asking remote to export dataset rows table to s3 and to return signed
{datachain-0.6.8 → datachain-0.6.10}/src/datachain/client/fsspec.py

@@ -358,7 +358,7 @@ class Client(ABC):
     ) -> BinaryIO:
         """Open a file, including files in tar archives."""
         if use_cache and (cache_path := self.cache.get_path(file)):
-            return open(cache_path, mode="rb")
+            return open(cache_path, mode="rb")
         assert not file.location
         return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]

{datachain-0.6.8 → datachain-0.6.10}/src/datachain/data_storage/metastore.py

@@ -138,6 +138,7 @@ class AbstractMetastore(ABC, Serializable):
         size: Optional[int] = None,
         preview: Optional[list[dict]] = None,
         job_id: Optional[str] = None,
+        uuid: Optional[str] = None,
     ) -> DatasetRecord:
         """Creates new dataset version."""

@@ -352,6 +353,7 @@ class AbstractDBMetastore(AbstractMetastore):
         """Datasets versions table columns."""
         return [
             Column("id", Integer, primary_key=True),
+            Column("uuid", Text, nullable=False, default=uuid4()),
             Column(
                 "dataset_id",
                 Integer,
@@ -545,6 +547,7 @@ class AbstractDBMetastore(AbstractMetastore):
         size: Optional[int] = None,
         preview: Optional[list[dict]] = None,
         job_id: Optional[str] = None,
+        uuid: Optional[str] = None,
         conn=None,
     ) -> DatasetRecord:
         """Creates new dataset version."""
@@ -555,6 +558,7 @@ class AbstractDBMetastore(AbstractMetastore):

         query = self._datasets_versions_insert().values(
             dataset_id=dataset.id,
+            uuid=uuid or str(uuid4()),
             version=version,
             status=status,
             feature_schema=json.dumps(feature_schema or {}),
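One subtlety worth flagging in the new `uuid` column above: in SQLAlchemy, `default=uuid4()` evaluates `uuid4` once, when the column list is built, so every row that falls back on the column default would share a single UUID; the per-row variant is to pass a callable (`default=uuid4` or a lambda). The insert path in the same hunk sidesteps this by always supplying `uuid=uuid or str(uuid4())` explicitly, so the column default is effectively never exercised. A minimal standalone sketch of the difference (toy table, not datachain's schema):

```python
from uuid import uuid4

from sqlalchemy import (
    Column, Integer, MetaData, Table, Text, create_engine, insert, select,
)

engine = create_engine("sqlite://")
metadata = MetaData()
demo = Table(
    "demo",
    metadata,
    Column("id", Integer, primary_key=True),
    # str(uuid4()) runs once, here, so the default is one fixed string.
    Column("uuid_shared", Text, default=str(uuid4())),
    # A callable runs per INSERT, so each row gets a fresh value.
    Column("uuid_fresh", Text, default=lambda: str(uuid4())),
)
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(insert(demo).values(id=1))
    conn.execute(insert(demo).values(id=2))
    rows = conn.execute(select(demo.c.uuid_shared, demo.c.uuid_fresh)).fetchall()

assert rows[0].uuid_shared == rows[1].uuid_shared  # one default, reused
assert rows[0].uuid_fresh != rows[1].uuid_fresh    # regenerated per row
```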
{datachain-0.6.8 → datachain-0.6.10}/src/datachain/data_storage/sqlite.py

@@ -747,8 +747,12 @@ class SQLiteWarehouse(AbstractWarehouse):

         ids = self.db.execute(select_ids).fetchall()

-        select_q =
-
+        select_q = (
+            query.with_only_columns(
+                *[c for c in query.selected_columns if c.name != "sys__id"]
+            )
+            .offset(None)
+            .limit(None)
         )

         for batch in batched_it(ids, 10_000):
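For context on the rebuilt `select_q`: `with_only_columns(...)` replaces the SELECT list (here keeping every selected column except the internal `sys__id`), while `.offset(None)` and `.limit(None)` clear any LIMIT/OFFSET already attached to the query, since the loop that follows paginates by explicit id batches instead. A self-contained sketch of the same SQLAlchemy Core pattern, using a toy table rather than the warehouse schema:

```python
from sqlalchemy import Column, Integer, MetaData, Table, Text, select

metadata = MetaData()
items = Table(
    "items",
    metadata,
    Column("sys__id", Integer, primary_key=True),
    Column("name", Text),
    Column("size", Integer),
)

# A query that may already carry LIMIT/OFFSET from earlier steps.
query = select(items).limit(10).offset(5)

# Keep every selected column except the internal id, clear pagination:
select_q = (
    query.with_only_columns(
        *[c for c in query.selected_columns if c.name != "sys__id"]
    )
    .offset(None)
    .limit(None)
)

print(select_q)  # SELECT items.name, items.size FROM items
```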
{datachain-0.6.8 → datachain-0.6.10}/src/datachain/dataset.py

@@ -163,6 +163,7 @@ class DatasetStatus:
 @dataclass
 class DatasetVersion:
     id: int
+    uuid: str
     dataset_id: int
     version: int
     status: int
@@ -184,6 +185,7 @@ class DatasetVersion:
     def parse(  # noqa: PLR0913
         cls: type[V],
         id: int,
+        uuid: str,
         dataset_id: int,
         version: int,
         status: int,
@@ -203,6 +205,7 @@ class DatasetVersion:
     ):
         return cls(
             id,
+            uuid,
             dataset_id,
             version,
             status,
@@ -306,6 +309,7 @@ class DatasetRecord:
         query_script: str,
         schema: str,
         version_id: int,
+        version_uuid: str,
         version_dataset_id: int,
         version: int,
         version_status: int,
@@ -331,6 +335,7 @@ class DatasetRecord:

         dataset_version = DatasetVersion.parse(
             version_id,
+            version_uuid,
             version_dataset_id,
             version,
             version_status,
{datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/dataset_info.py

@@ -1,6 +1,7 @@
 import json
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Optional, Union
+from uuid import uuid4

 from pydantic import Field, field_validator

@@ -15,6 +16,7 @@ if TYPE_CHECKING:

 class DatasetInfo(DataModel):
     name: str
+    uuid: str = Field(default=str(uuid4()))
     version: int = Field(default=1)
     status: int = Field(default=DatasetStatus.CREATED)
     created_at: datetime = Field(default=TIME_ZERO)
@@ -60,6 +62,7 @@ class DatasetInfo(DataModel):
         job: Optional[Job],
     ) -> "Self":
         return cls(
+            uuid=version.uuid,
             name=dataset.name,
             version=version.version,
             status=version.status,
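The `uuid` field added to `DatasetInfo` has the same evaluate-once caveat in Pydantic terms: `Field(default=str(uuid4()))` computes a single UUID at import time and reuses it for every instance that doesn't set the field (benign here, since `from_models` always passes `version.uuid`). The per-instance form is `default_factory`. A minimal sketch:

```python
from uuid import uuid4

from pydantic import BaseModel, Field


class Shared(BaseModel):
    # uuid4() runs once, when the class body is evaluated.
    uuid: str = Field(default=str(uuid4()))


class Fresh(BaseModel):
    # The factory runs on every instantiation.
    uuid: str = Field(default_factory=lambda: str(uuid4()))


assert Shared().uuid == Shared().uuid  # same default reused
assert Fresh().uuid != Fresh().uuid    # new value each time
```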
{datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/dc.py

@@ -30,7 +30,7 @@ from datachain.client.local import FileClient
 from datachain.dataset import DatasetRecord
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
-from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
+from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ArrowRow, File, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
@@ -642,6 +642,59 @@ class DataChain:
         }
         return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]

+    def explode(
+        self,
+        col: str,
+        model_name: Optional[str] = None,
+        object_name: Optional[str] = None,
+    ) -> "DataChain":
+        """Explodes a column containing JSON objects (dict or str DataChain type) into
+        individual columns based on the schema of the JSON. Schema is inferred from
+        the first row of the column.
+
+        Args:
+            col: the name of the column containing JSON to be exploded.
+            model_name: optional generated model name. By default generates the name
+                automatically.
+            object_name: optional generated object column name. By default generates the
+                name automatically.
+
+        Returns:
+            DataChain: A new DataChain instance with the new set of columns.
+        """
+        import json
+
+        import pyarrow as pa
+
+        from datachain.lib.arrow import schema_to_output
+
+        json_value = next(self.limit(1).collect(col))
+        json_dict = (
+            json.loads(json_value) if isinstance(json_value, str) else json_value
+        )
+
+        if not isinstance(json_dict, dict):
+            raise TypeError(f"Column {col} should be a string or dict type with JSON")
+
+        schema = pa.Table.from_pylist([json_dict]).schema
+        output = schema_to_output(schema, None)
+
+        if not model_name:
+            model_name = f"{col.title()}ExplodedModel"
+
+        model = dict_to_data_model(model_name, output)
+
+        def json_to_model(json_value: Union[str, dict]):
+            json_dict = (
+                json.loads(json_value) if isinstance(json_value, str) else json_value
+            )
+            return model.model_validate(json_dict)
+
+        if not object_name:
+            object_name = f"{col}_expl"
+
+        return self.map(json_to_model, params=col, output={object_name: model})
+
     @classmethod
     def datasets(
         cls,
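A short usage sketch of the new `explode()` method; the `meta` column name and JSON payload below are illustrative, not taken from the diff. Per the code above, the exploded object lands under `<col>_expl` by default:

```python
from datachain import DataChain

# Toy chain where each row's "meta" column holds a JSON string.
chain = DataChain.from_values(
    meta=[
        '{"city": "Berlin", "population": 3645000}',
        '{"city": "Paris", "population": 2161000}',
    ]
)

# Infers a model from the first row's JSON and maps every row onto it,
# adding the result as a new "meta_expl" object column.
exploded = chain.explode("meta")

for obj in exploded.collect("meta_expl"):
    print(obj.city, obj.population)
```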
@@ -895,7 +948,7 @@ class DataChain:
         2. Group-based UDF function input: Instead of individual rows, the function
            receives a list all rows within each group defined by `partition_by`.

-
+        Examples:
         ```py
         chain = chain.agg(
             total=lambda category, amount: [sum(amount)],
@@ -904,6 +957,26 @@ class DataChain:
         )
         chain.save("new_dataset")
         ```
+
+        An alternative syntax, when you need to specify a more complex function:
+
+        ```py
+        # It automatically resolves which columns to pass to the function
+        # by looking at the function signature.
+        def agg_sum(
+            file: list[File], amount: list[float]
+        ) -> Iterator[tuple[File, float]]:
+            yield file[0], sum(amount)
+
+        chain = chain.agg(
+            agg_sum,
+            output={"file": File, "total": float},
+            # Alternative syntax is to use `C` (short for Column) to specify
+            # a column name or a nested column, e.g. C("file.path").
+            partition_by=C("category"),
+        )
+        chain.save("new_dataset")
+        ```
         """
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         return self._evolve(
@@ -1242,15 +1315,15 @@ class DataChain:
         return self.results(row_factory=to_dict)

     @overload
-    def collect(self) -> Iterator[tuple[
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...

     @overload
-    def collect(self, col: str) -> Iterator[
+    def collect(self, col: str) -> Iterator[DataValue]: ...

     @overload
-    def collect(self, *cols: str) -> Iterator[tuple[
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...

-    def collect(self, *cols: str) -> Iterator[Union[
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
         """Yields rows of values, optionally limited to the specified columns.

         Args:
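With the overloads above retyped in terms of `DataValue`, type checkers now distinguish the one-column case (each item is a single value) from the zero- or multi-column case (each item is a tuple of values). A quick sketch with illustrative data:

```python
from datachain import DataChain

chain = DataChain.from_values(name=["a.txt", "b.txt"], size=[10, 20])

# Single column requested: collect() yields bare values.
for size in chain.collect("size"):
    print(size)        # 10, then 20

# Multiple columns: collect() yields tuples of values.
for name, size in chain.collect("name", "size"):
    print(name, size)  # a.txt 10, then b.txt 20
```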
{datachain-0.6.8 → datachain-0.6.10}/src/datachain/lib/meta_formats.py

@@ -114,6 +114,7 @@ def read_meta(  # noqa: C901
         )
     )
     (model_output,) = chain.collect("meta_schema")
+    assert isinstance(model_output, str)
     if print_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic