datachain 0.3.10__tar.gz → 0.3.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.3.10 → datachain-0.3.12}/.pre-commit-config.yaml +1 -1
- {datachain-0.3.10/src/datachain.egg-info → datachain-0.3.12}/PKG-INFO +7 -5
- {datachain-0.3.10 → datachain-0.3.12}/README.rst +6 -3
- datachain-0.3.12/docs/assets/datachain-white.svg +1 -0
- datachain-0.3.12/docs/assets/datachain.svg +24 -0
- {datachain-0.3.10 → datachain-0.3.12}/docs/index.md +1 -1
- {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/udfs/stateful.py +4 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/multimodal/clip_inference.py +10 -9
- {datachain-0.3.10 → datachain-0.3.12}/examples/multimodal/wds.py +11 -12
- {datachain-0.3.10 → datachain-0.3.12}/mkdocs.yml +4 -4
- {datachain-0.3.10 → datachain-0.3.12}/pyproject.toml +4 -2
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/catalog/catalog.py +50 -230
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/error.py +0 -4
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/job.py +4 -3
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/clip.py +1 -1
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/dc.py +92 -38
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/file.py +9 -8
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/image.py +1 -1
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/meta_formats.py +38 -59
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/model_store.py +6 -1
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/text.py +1 -1
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/webdataset.py +13 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/webdataset_laion.py +13 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/dataset.py +9 -32
- {datachain-0.3.10 → datachain-0.3.12/src/datachain.egg-info}/PKG-INFO +7 -5
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain.egg-info/SOURCES.txt +3 -2
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain.egg-info/requires.txt +0 -1
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_catalog.py +23 -96
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_datasets.py +0 -2
- datachain-0.3.12/tests/func/test_meta_formats.py +87 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_pytorch.py +10 -3
- datachain-0.3.12/tests/func/test_query.py +173 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_datachain_merge.py +57 -4
- datachain-0.3.12/tests/unit/test_catalog.py +28 -0
- datachain-0.3.10/docs/assets/datachain.png +0 -0
- datachain-0.3.10/src/datachain/catalog/subclass.py +0 -60
- datachain-0.3.10/tests/func/test_query.py +0 -385
- datachain-0.3.10/tests/unit/test_catalog.py +0 -170
- {datachain-0.3.10 → datachain-0.3.12}/.cruft.json +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.gitattributes +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.github/codecov.yaml +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.github/dependabot.yml +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.github/workflows/release.yml +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.github/workflows/tests.yml +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/.gitignore +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/LICENSE +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/docs/references/datachain.md +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/docs/references/datatype.md +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/docs/references/file.md +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/docs/references/index.md +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/docs/references/sql.md +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/docs/references/torch.md +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/docs/references/udf.md +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/noxfile.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/setup.cfg +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/__main__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/asyn.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/cache.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/cli.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/hf.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/local.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/config.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/dataset.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/hf.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/listing.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/listing.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/node.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/progress.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/py.typed +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/builtins.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/params.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/schema.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/session.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/storage.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain/utils.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/conftest.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/data.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/examples/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_client.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_datachain.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_listing.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_ls.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_metrics.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_pull.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_client.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_session.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_udf.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.10 → datachain-0.3.12}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.3.10
|
|
3
|
+
Version: 0.3.12
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -71,7 +71,6 @@ Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
|
71
71
|
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
72
72
|
Requires-Dist: pytest-servers[all]>=0.5.5; extra == "tests"
|
|
73
73
|
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
74
|
-
Requires-Dist: pytest-asyncio>=0.23.2; extra == "tests"
|
|
75
74
|
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
76
75
|
Requires-Dist: virtualenv; extra == "tests"
|
|
77
76
|
Requires-Dist: dulwich; extra == "tests"
|
|
@@ -96,8 +95,14 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
|
96
95
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
97
96
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
98
97
|
|
|
98
|
+
================
|
|
99
|
+
|logo| DataChain
|
|
100
|
+
================
|
|
101
|
+
|
|
99
102
|
|PyPI| |Python Version| |Codecov| |Tests|
|
|
100
103
|
|
|
104
|
+
.. |logo| image:: docs/assets/datachain.svg
|
|
105
|
+
:height: 24
|
|
101
106
|
.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
|
|
102
107
|
:target: https://pypi.org/project/datachain/
|
|
103
108
|
:alt: PyPI
|
|
@@ -111,9 +116,6 @@ Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
|
111
116
|
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
112
117
|
:alt: Tests
|
|
113
118
|
|
|
114
|
-
AI 🔗 DataChain
|
|
115
|
-
----------------
|
|
116
|
-
|
|
117
119
|
DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
|
|
118
120
|
It is made to organize your unstructured data into datasets and wrangle it at scale on
|
|
119
121
|
your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
|
|
@@ -1,5 +1,11 @@
|
|
|
1
|
+
================
|
|
2
|
+
|logo| DataChain
|
|
3
|
+
================
|
|
4
|
+
|
|
1
5
|
|PyPI| |Python Version| |Codecov| |Tests|
|
|
2
6
|
|
|
7
|
+
.. |logo| image:: docs/assets/datachain.svg
|
|
8
|
+
:height: 24
|
|
3
9
|
.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
|
|
4
10
|
:target: https://pypi.org/project/datachain/
|
|
5
11
|
:alt: PyPI
|
|
@@ -13,9 +19,6 @@
|
|
|
13
19
|
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
14
20
|
:alt: Tests
|
|
15
21
|
|
|
16
|
-
AI 🔗 DataChain
|
|
17
|
-
----------------
|
|
18
|
-
|
|
19
22
|
DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
|
|
20
23
|
It is made to organize your unstructured data into datasets and wrangle it at scale on
|
|
21
24
|
your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
<svg width="180" height="33" fill="none" xmlns="http://www.w3.org/2000/svg"><style>.prefix__logo-fill{fill:#fff}</style><path fill-rule="evenodd" clip-rule="evenodd" d="M23.997 24.53l2.34-2.342 3.14 3.135-2.357 2.342a5.533 5.533 0 01-7.822 0l-4.704-4.7a5.536 5.536 0 010-7.823l4.76-4.763 3.124 3.14-4.745 4.747a1.106 1.106 0 000 1.57l4.699 4.694a1.107 1.107 0 001.565 0z" fill="url(#prefix__paint0_linear_449_28)"/><path fill-rule="evenodd" clip-rule="evenodd" d="M37.733 10.65a1.184 1.184 0 01-.234.357L26.253 22.255l3.13 3.135 11.234-11.242a5.536 5.536 0 000-7.824l-4.699-4.705a5.534 5.534 0 00-7.822 0l-3.278 3.263 3.134 3.135 3.268-3.268a1.107 1.107 0 011.564 0l4.694 4.694a1.108 1.108 0 01.244 1.208h.011z" fill="url(#prefix__paint1_linear_449_28)"/><path d="M24.54 14.722L22.2 17.063v.016l-2.405 2.388 3.14 3.134 4.741-4.75a5.534 5.534 0 000-7.822l-4.704-4.704a5.535 5.535 0 00-7.824 0l-5.955 5.954 3.14 3.13 5.944-5.945a1.107 1.107 0 011.565 0l4.7 4.694a1.107 1.107 0 010 1.564z" fill="url(#prefix__paint2_linear_449_28)"/><path d="M4.514 22.335c.054-.133.139-.256.24-.357L7.1 19.632l-.005-.011 3.14-3.129 2.147-2.135-3.135-3.14-7.629 7.638a5.534 5.534 0 000 7.822l4.705 4.704a5.536 5.536 0 007.824 0l3.175-3.18-3.134-3.13-3.165 3.165a1.106 1.106 0 01-1.57 0l-4.7-4.693a1.107 1.107 0 01-.24-1.208z" fill="url(#prefix__paint3_linear_449_28)"/><path d="M55.645 26.613c-.994 0-1.908-.182-2.745-.547a6.407 6.407 0 01-2.169-1.538 7.037 7.037 0 01-1.41-2.294 8.126 8.126 0 01-.497-2.867v-.547c0-1.008.157-1.955.47-2.841a7.478 7.478 0 011.36-2.32 6.201 6.201 0 012.116-1.538c.836-.382 1.76-.573 2.77-.573 1.115 0 2.09.243 2.927.73.854.469 1.533 1.181 2.038 2.137.506.956.784 2.155.837 3.597L60.27 16.76V7.117h3.633v19.027h-2.875v-6.02h.627c-.052 1.441-.348 2.649-.888 3.622-.54.956-1.255 1.677-2.143 2.163-.871.47-1.864.704-2.98.704zm.81-3.05c.714 0 1.367-.156 1.96-.469.592-.33 1.063-.799 1.41-1.407.367-.626.55-1.355.55-2.19v-1.042c0-.834-.183-1.53-.55-2.085a3.572 3.572 0 00-1.436-1.303 4.078 
4.078 0 00-1.934-.47c-.784 0-1.481.192-2.091.574-.592.365-1.063.886-1.411 1.564-.331.678-.497 1.468-.497 2.372 0 .903.174 1.694.523 2.372.348.66.819 1.172 1.411 1.537.61.365 1.298.548 2.065.548zM76.635 26.144v-4.196h-.6v-4.666c0-.817-.201-1.425-.602-1.824-.4-.4-1.019-.6-1.855-.6a68.629 68.629 0 00-3.423.104c-.61.018-1.16.044-1.647.079v-3.076c.4-.035.854-.07 1.359-.104.505-.035 1.02-.052 1.542-.052.54-.018 1.045-.026 1.515-.026 1.464 0 2.675.19 3.633.573.976.382 1.707.982 2.195 1.799.505.816.758 1.885.758 3.205v8.784h-2.875zm-4.573.365c-1.028 0-1.934-.183-2.718-.547a4.274 4.274 0 01-1.803-1.564c-.418-.678-.627-1.495-.627-2.45 0-1.043.252-1.894.758-2.555.522-.66 1.245-1.155 2.169-1.485.94-.33 2.038-.495 3.293-.495h3.292v2.163h-3.345c-.836 0-1.48.208-1.934.625-.435.4-.653.921-.653 1.564s.218 1.164.653 1.564c.453.4 1.098.6 1.934.6.505 0 .967-.087 1.385-.261a2.413 2.413 0 001.072-.938c.296-.452.462-1.06.496-1.825l.889 1.017c-.087.99-.331 1.824-.732 2.502a3.899 3.899 0 01-1.62 1.564c-.68.347-1.516.52-2.509.52zM89.569 26.326c-1.307 0-2.387-.165-3.24-.495a3.635 3.635 0 01-1.882-1.72c-.419-.817-.628-1.911-.628-3.284l.026-12.824h3.398l-.026 13.058c0 .695.183 1.234.548 1.616.384.365.924.548 1.62.548h2.222v3.101h-2.038zM81.572 14.65V11.99h10.035v2.659H81.572zM103.203 26.144v-4.196h-.601v-4.666c0-.817-.201-1.425-.601-1.824-.401-.4-1.02-.6-1.856-.6a68.629 68.629 0 00-3.423.104c-.61.018-1.159.044-1.647.079v-3.076c.4-.035.854-.07 1.36-.104.504-.035 1.018-.052 1.541-.052.54-.018 1.045-.026 1.516-.026 1.463 0 2.674.19 3.632.573.976.382 1.708.982 2.196 1.799.505.816.757 1.885.757 3.205v8.784h-2.874zm-4.574.365c-1.028 0-1.934-.183-2.718-.547a4.274 4.274 0 01-1.803-1.564c-.418-.678-.627-1.495-.627-2.45 0-1.043.253-1.894.758-2.555.523-.66 1.246-1.155 2.169-1.485.94-.33 2.038-.495 3.293-.495h3.293v2.163h-3.345c-.837 0-1.481.208-1.934.625-.436.4-.654.921-.654 1.564s.218 1.164.654 1.564c.453.4 1.097.6 1.934.6.505 0 .966-.087 1.385-.261a2.417 2.417 0 
001.071-.938c.296-.452.462-1.06.497-1.825l.888 1.017c-.087.99-.331 1.824-.732 2.502a3.897 3.897 0 01-1.62 1.564c-.679.347-1.516.52-2.509.52zM116.267 26.64c-1.237 0-2.309-.21-3.215-.626a6.773 6.773 0 01-2.247-1.668A7.117 7.117 0 01109.472 22a8.19 8.19 0 01-.444-2.659v-.495c0-.956.148-1.868.444-2.737a6.905 6.905 0 011.385-2.346 6.488 6.488 0 012.247-1.642c.906-.417 1.952-.625 3.136-.625 1.237 0 2.344.243 3.319.73.976.469 1.751 1.13 2.326 1.98.593.852.924 1.843.993 2.972h-3.528a2.824 2.824 0 00-.941-1.825c-.522-.486-1.245-.73-2.169-.73-.801 0-1.472.192-2.012.574-.523.382-.915.912-1.176 1.59-.261.66-.392 1.425-.392 2.294 0 .834.122 1.59.366 2.267.261.678.653 1.208 1.176 1.59.54.382 1.228.574 2.065.574.627 0 1.167-.114 1.62-.34.453-.225.81-.538 1.071-.938.279-.4.453-.851.523-1.355h3.528c-.07 1.147-.409 2.155-1.019 3.023-.593.852-1.385 1.52-2.378 2.007-.976.487-2.091.73-3.345.73zM125.919 26.144V7.117h3.633v11.104h-.628c0-1.425.183-2.633.549-3.623.366-.99.906-1.747 1.62-2.268.732-.521 1.656-.782 2.771-.782h.156c1.621 0 2.849.556 3.685 1.668.836 1.112 1.255 2.728 1.255 4.848v8.08h-3.633v-8.419c0-.903-.261-1.616-.784-2.137-.505-.521-1.176-.782-2.012-.782-.889 0-1.612.296-2.169.886-.54.574-.81 1.33-.81 2.268v8.184h-3.633zM151.463 26.144v-4.196h-.601v-4.666c0-.817-.201-1.425-.601-1.824-.401-.4-1.02-.6-1.856-.6a68.524 68.524 0 00-3.423.104c-.61.018-1.159.044-1.647.079v-3.076c.401-.035.854-.07 1.359-.104a22.491 22.491 0 011.542-.052c.54-.018 1.045-.026 1.516-.026 1.463 0 2.674.19 3.632.573.976.382 1.708.982 2.196 1.799.505.816.757 1.885.757 3.205v8.784h-2.874zm-4.574.365c-1.027 0-1.933-.183-2.717-.547a4.277 4.277 0 01-1.804-1.564c-.418-.678-.627-1.495-.627-2.45 0-1.043.253-1.894.758-2.555.523-.66 1.246-1.155 2.169-1.485.941-.33 2.038-.495 3.293-.495h3.293v2.163h-3.345c-.837 0-1.481.208-1.934.625-.436.4-.654.921-.654 1.564s.218 1.164.654 1.564c.453.4 1.097.6 1.934.6.505 0 .967-.087 1.385-.261a2.417 2.417 0 001.071-.938c.296-.452.462-1.06.497-1.825l.888 1.017c-.087.99-.331 
1.824-.731 2.502a3.905 3.905 0 01-1.621 1.564c-.679.347-1.516.52-2.509.52zM158.908 26.144V11.99h3.632v14.153h-3.632zm-1.986-11.442v-2.71h5.618v2.71h-5.618zm3.319-4.405c-.715 0-1.246-.183-1.594-.547-.331-.383-.497-.86-.497-1.434 0-.573.166-1.042.497-1.407.348-.365.879-.548 1.594-.548.714 0 1.237.183 1.568.548.331.365.496.834.496 1.407 0 .574-.165 1.051-.496 1.434-.331.364-.854.547-1.568.547zM166.727 26.144V11.99h2.875v6.073h-.262c0-1.442.192-2.641.575-3.597.384-.973.95-1.703 1.699-2.19.766-.486 1.716-.729 2.848-.729h.157c1.69 0 2.971.547 3.842 1.642.871 1.077 1.307 2.693 1.307 4.848v8.106h-3.633v-8.419c0-.869-.253-1.572-.758-2.11-.488-.54-1.167-.809-2.038-.809-.889 0-1.612.278-2.169.834-.54.539-.811 1.269-.811 2.19v8.314h-3.632z" class="prefix__logo-fill"/><defs><linearGradient id="prefix__paint0_linear_449_28" x1="36.032" y1="5.404" x2="18.067" y2="23.054" gradientUnits="userSpaceOnUse"><stop stop-color="#F46837"/><stop offset="1" stop-color="#945DD6"/></linearGradient><linearGradient id="prefix__paint1_linear_449_28" x1="36.045" y1="5.607" x2="18.067" y2="23.363" gradientUnits="userSpaceOnUse"><stop stop-color="#F46837"/><stop offset="1" stop-color="#945DD6"/></linearGradient><linearGradient id="prefix__paint2_linear_449_28" x1="5.924" y1="27.432" x2="23.883" y2="10.239" gradientUnits="userSpaceOnUse"><stop stop-color="#13ADC7"/><stop offset="1" stop-color="#945DD6"/></linearGradient><linearGradient id="prefix__paint3_linear_449_28" x1="5.77" y1="27.586" x2="23.574" y2="9.776" gradientUnits="userSpaceOnUse"><stop stop-color="#13ADC7"/><stop offset="1" stop-color="#945DD6"/></linearGradient></defs></svg>
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
<svg width="33" height="33" viewBox="0 0 33 33" fill="none" xmlns="http://www.w3.org/2000/svg">
|
|
2
|
+
<path fill-rule="evenodd" clip-rule="evenodd" d="M18.7492 22.785L20.5786 20.9554L23.0316 23.4046L21.1898 25.2343C20.379 26.0444 19.2798 26.4994 18.1338 26.4994C16.9878 26.4994 15.8887 26.0444 15.0779 25.2343L11.4025 21.5625C10.5926 20.7516 10.1377 19.6523 10.1377 18.5061C10.1377 17.3599 10.5926 16.2606 11.4025 15.4497L15.1222 11.7285L17.5628 14.182L13.8556 17.8906C13.7748 17.971 13.7106 18.0666 13.6668 18.1719C13.6231 18.2771 13.6005 18.39 13.6005 18.504C13.6005 18.618 13.6231 18.7309 13.6668 18.8362C13.7106 18.9414 13.7748 19.037 13.8556 19.1174L17.5268 22.785C17.689 22.9471 17.9088 23.0381 18.138 23.0381C18.3672 23.0381 18.587 22.9471 18.7492 22.785Z" fill="url(#paint0_linear_426_297)"/>
|
|
3
|
+
<path fill-rule="evenodd" clip-rule="evenodd" d="M29.4817 11.941C29.436 12.0491 29.3736 12.1406 29.2988 12.2196L20.5124 21.0074L22.9571 23.4567L31.7352 14.673C32.5451 13.8621 33 12.7628 33 11.6166C33 10.4704 32.5451 9.3711 31.7352 8.5602L28.064 4.88419C27.2532 4.07415 26.1541 3.61914 25.0081 3.61914C23.8621 3.61914 22.7629 4.07415 21.9521 4.88419L19.3906 7.43354L21.8395 9.88282L24.3927 7.32932C24.5549 7.16731 24.7747 7.07631 25.0039 7.07631C25.2331 7.07631 25.4529 7.16731 25.6151 7.32932L29.2822 10.997C29.404 11.1177 29.4873 11.2718 29.5213 11.4399C29.5554 11.608 29.5387 11.7824 29.4734 11.941H29.4776H29.4817Z" fill="url(#paint1_linear_426_297)"/>
|
|
4
|
+
<path d="M19.1743 15.1218L17.3446 16.9511L17.3446 16.9636L15.4656 18.8289L17.919 21.2778L21.6235 17.5665C22.4336 16.7557 22.8886 15.6566 22.8886 14.5106C22.8886 13.3646 22.4336 12.2654 21.6235 11.4547L17.9475 7.77926C17.1366 6.96935 16.0373 6.51442 14.8911 6.51442C13.7449 6.51442 12.6456 6.96935 11.8347 7.77926L7.18188 12.4318L9.63532 14.8765L14.2799 10.2323C14.442 10.0703 14.6619 9.97933 14.8911 9.97933C15.1204 9.97933 15.3402 10.0703 15.5024 10.2323L19.1743 13.8994C19.3363 14.0615 19.4273 14.2814 19.4273 14.5106C19.4273 14.7398 19.3363 14.9596 19.1743 15.1218Z" fill="url(#paint2_linear_426_297)"/>
|
|
5
|
+
<path d="M3.52721 21.0699C3.56879 20.966 3.63532 20.8703 3.71433 20.7913L5.54818 18.9578L5.54402 18.9495L7.99746 16.5048L9.6749 14.8364L7.22562 12.3834L1.26505 18.3508C0.455006 19.1615 -3.99616e-07 20.2607 -3.49523e-07 21.4067C-2.9943e-07 22.5527 0.455006 23.6518 1.26505 24.4626L4.94105 28.138C5.75196 28.9479 6.85127 29.4028 7.99746 29.4028C9.14364 29.4028 10.243 28.9479 11.0539 28.138L13.5353 25.6527L11.086 23.208L8.6129 25.6808C8.53251 25.7616 8.43695 25.8258 8.33168 25.8695C8.22642 25.9133 8.11354 25.9358 7.99954 25.9358C7.88553 25.9358 7.77265 25.9133 7.66739 25.8695C7.56213 25.8258 7.46656 25.7616 7.38618 25.6808L3.71433 22.0137C3.59316 21.8926 3.51069 21.7383 3.47737 21.5702C3.44406 21.4022 3.4614 21.2281 3.52721 21.0699Z" fill="url(#paint3_linear_426_297)"/>
|
|
6
|
+
<defs>
|
|
7
|
+
<linearGradient id="paint0_linear_426_297" x1="28.1527" y1="7.84149" x2="14.1164" y2="21.6319" gradientUnits="userSpaceOnUse">
|
|
8
|
+
<stop stop-color="#F46837"/>
|
|
9
|
+
<stop offset="1" stop-color="#945DD6"/>
|
|
10
|
+
</linearGradient>
|
|
11
|
+
<linearGradient id="paint1_linear_426_297" x1="28.1626" y1="8.00042" x2="14.1164" y2="21.8731" gradientUnits="userSpaceOnUse">
|
|
12
|
+
<stop stop-color="#F46837"/>
|
|
13
|
+
<stop offset="1" stop-color="#945DD6"/>
|
|
14
|
+
</linearGradient>
|
|
15
|
+
<linearGradient id="paint2_linear_426_297" x1="4.62869" y1="25.0522" x2="18.6605" y2="11.619" gradientUnits="userSpaceOnUse">
|
|
16
|
+
<stop stop-color="#13ADC7"/>
|
|
17
|
+
<stop offset="1" stop-color="#945DD6"/>
|
|
18
|
+
</linearGradient>
|
|
19
|
+
<linearGradient id="paint3_linear_426_297" x1="4.50795" y1="25.1728" x2="18.4191" y2="11.2572" gradientUnits="userSpaceOnUse">
|
|
20
|
+
<stop stop-color="#13ADC7"/>
|
|
21
|
+
<stop offset="1" stop-color="#945DD6"/>
|
|
22
|
+
</linearGradient>
|
|
23
|
+
</defs>
|
|
24
|
+
</svg>
|
|
@@ -4,22 +4,23 @@ from torch.nn.functional import cosine_similarity
|
|
|
4
4
|
from torch.utils.data import DataLoader
|
|
5
5
|
|
|
6
6
|
from datachain import C, DataChain
|
|
7
|
+
from datachain.sql.functions import path
|
|
7
8
|
|
|
8
9
|
source = "gs://datachain-demo/50k-laion-files/000000/00000000*"
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
def create_dataset():
|
|
12
|
-
imgs = (
|
|
13
|
-
|
|
14
|
-
.filter(C("file.path").glob("*.jpg"))
|
|
15
|
-
.map(stem=lambda file: file.get_file_stem(), params=["file"], output=str)
|
|
13
|
+
imgs = DataChain.from_storage(source, type="image").filter(
|
|
14
|
+
C("file.path").glob("*.jpg")
|
|
16
15
|
)
|
|
17
|
-
captions = (
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
16
|
+
captions = DataChain.from_storage(source, type="text").filter(
|
|
17
|
+
C("file.path").glob("*.txt")
|
|
18
|
+
)
|
|
19
|
+
return imgs.merge(
|
|
20
|
+
captions,
|
|
21
|
+
on=path.file_stem(imgs.c("file.path")),
|
|
22
|
+
right_on=path.file_stem(captions.c("file.path")),
|
|
21
23
|
)
|
|
22
|
-
return imgs.merge(captions, on="stem")
|
|
23
24
|
|
|
24
25
|
|
|
25
26
|
if __name__ == "__main__":
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
|
-
from datachain import
|
|
3
|
+
from datachain import DataChain
|
|
4
4
|
from datachain.lib.webdataset import process_webdataset
|
|
5
5
|
from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
|
|
6
6
|
from datachain.sql.functions import path
|
|
@@ -16,7 +16,7 @@ NPZ_METADATA = os.getenv(
|
|
|
16
16
|
)
|
|
17
17
|
|
|
18
18
|
wds_images = (
|
|
19
|
-
DataChain.from_storage(IMAGE_TARS)
|
|
19
|
+
DataChain.from_storage(IMAGE_TARS, type="image")
|
|
20
20
|
.settings(cache=True)
|
|
21
21
|
.gen(laion=process_webdataset(spec=WDSLaion), params="file")
|
|
22
22
|
)
|
|
@@ -25,21 +25,20 @@ wds_with_pq = (
|
|
|
25
25
|
DataChain.from_parquet(PARQUET_METADATA)
|
|
26
26
|
.settings(cache=True)
|
|
27
27
|
.merge(wds_images, on="uid", right_on="laion.json.uid", inner=True)
|
|
28
|
-
.mutate(stem=path.file_stem(C("source.file.path")))
|
|
29
28
|
)
|
|
30
29
|
|
|
31
|
-
|
|
30
|
+
wds_npz = (
|
|
32
31
|
DataChain.from_storage(NPZ_METADATA)
|
|
33
32
|
.settings(cache=True)
|
|
34
33
|
.gen(emd=process_laion_meta)
|
|
35
|
-
.mutate(stem=path.file_stem(C("emd.file.path")))
|
|
36
|
-
.merge(
|
|
37
|
-
wds_with_pq,
|
|
38
|
-
on=["stem", "emd.index"],
|
|
39
|
-
right_on=["stem", "source.index"],
|
|
40
|
-
inner=True,
|
|
41
|
-
)
|
|
42
|
-
.save("wds")
|
|
43
34
|
)
|
|
44
35
|
|
|
36
|
+
|
|
37
|
+
res = wds_npz.merge(
|
|
38
|
+
wds_with_pq,
|
|
39
|
+
on=[path.file_stem(wds_npz.c("emd.file.path")), "emd.index"],
|
|
40
|
+
right_on=[path.file_stem(wds_with_pq.c("source.file.path")), "source.index"],
|
|
41
|
+
inner=True,
|
|
42
|
+
).save("wds")
|
|
43
|
+
|
|
45
44
|
res.show(5)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
site_name:
|
|
2
|
-
site_url: https://datachain.
|
|
1
|
+
site_name: ''
|
|
2
|
+
site_url: https://docs.datachain.ai
|
|
3
3
|
site_description: Wrangle unstructured AI data at scale
|
|
4
4
|
|
|
5
5
|
repo_url: "https://github.com/iterative/datachain"
|
|
@@ -15,8 +15,8 @@ validation:
|
|
|
15
15
|
|
|
16
16
|
theme:
|
|
17
17
|
name: material
|
|
18
|
-
logo: assets/datachain.
|
|
19
|
-
favicon: assets/datachain.
|
|
18
|
+
logo: assets/datachain-white.svg
|
|
19
|
+
favicon: assets/datachain.svg
|
|
20
20
|
icon:
|
|
21
21
|
repo: fontawesome/brands/github
|
|
22
22
|
features:
|
|
@@ -82,7 +82,6 @@ tests = [
|
|
|
82
82
|
"pytest-mock>=3.12.0",
|
|
83
83
|
"pytest-servers[all]>=0.5.5",
|
|
84
84
|
"pytest-benchmark[histogram]",
|
|
85
|
-
"pytest-asyncio>=0.23.2",
|
|
86
85
|
"pytest-xdist>=3.3.1",
|
|
87
86
|
"virtualenv",
|
|
88
87
|
"dulwich",
|
|
@@ -136,13 +135,16 @@ markers = [
|
|
|
136
135
|
"llm_and_nlp: LLM and NLP examples",
|
|
137
136
|
"multimodal: Multimodal examples"
|
|
138
137
|
]
|
|
139
|
-
asyncio_mode = "auto"
|
|
140
138
|
filterwarnings = [
|
|
141
139
|
"error::pandas.errors.PerformanceWarning",
|
|
142
140
|
"error::pydantic.warnings.PydanticDeprecatedSince20",
|
|
143
141
|
"error::pytest_mock.PytestMockWarning",
|
|
144
142
|
"error::pytest.PytestCollectionWarning",
|
|
145
143
|
"error::sqlalchemy.exc.SADeprecationWarning",
|
|
144
|
+
"ignore::DeprecationWarning:timm.*",
|
|
145
|
+
"ignore::DeprecationWarning:botocore.auth",
|
|
146
|
+
"ignore::DeprecationWarning:datasets.utils._dill",
|
|
147
|
+
"ignore::DeprecationWarning:librosa.core.intervals",
|
|
146
148
|
"ignore:Field name .* shadows an attribute in parent:UserWarning" # datachain.lib.feature
|
|
147
149
|
]
|
|
148
150
|
|
|
@@ -9,11 +9,9 @@ import os.path
|
|
|
9
9
|
import posixpath
|
|
10
10
|
import subprocess
|
|
11
11
|
import sys
|
|
12
|
-
import tempfile
|
|
13
12
|
import time
|
|
14
13
|
import traceback
|
|
15
14
|
from collections.abc import Iterable, Iterator, Mapping, Sequence
|
|
16
|
-
from contextlib import contextmanager, nullcontext
|
|
17
15
|
from copy import copy
|
|
18
16
|
from dataclasses import dataclass
|
|
19
17
|
from functools import cached_property, reduce
|
|
@@ -24,7 +22,6 @@ from typing import (
|
|
|
24
22
|
TYPE_CHECKING,
|
|
25
23
|
Any,
|
|
26
24
|
Callable,
|
|
27
|
-
NamedTuple,
|
|
28
25
|
NoReturn,
|
|
29
26
|
Optional,
|
|
30
27
|
Union,
|
|
@@ -59,7 +56,6 @@ from datachain.error import (
|
|
|
59
56
|
PendingIndexingError,
|
|
60
57
|
QueryScriptCancelError,
|
|
61
58
|
QueryScriptCompileError,
|
|
62
|
-
QueryScriptDatasetNotFound,
|
|
63
59
|
QueryScriptRunError,
|
|
64
60
|
)
|
|
65
61
|
from datachain.listing import Listing
|
|
@@ -77,7 +73,6 @@ from datachain.utils import (
|
|
|
77
73
|
)
|
|
78
74
|
|
|
79
75
|
from .datasource import DataSource
|
|
80
|
-
from .subclass import SubclassFinder
|
|
81
76
|
|
|
82
77
|
if TYPE_CHECKING:
|
|
83
78
|
from datachain.data_storage import (
|
|
@@ -92,7 +87,6 @@ logger = logging.getLogger("datachain")
|
|
|
92
87
|
|
|
93
88
|
DEFAULT_DATASET_DIR = "dataset"
|
|
94
89
|
DATASET_FILE_SUFFIX = ".edatachain"
|
|
95
|
-
FEATURE_CLASSES = ["DataModel"]
|
|
96
90
|
|
|
97
91
|
TTL_INT = 4 * 60 * 60
|
|
98
92
|
|
|
@@ -118,44 +112,19 @@ def noop(_: str):
|
|
|
118
112
|
pass
|
|
119
113
|
|
|
120
114
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
stream:
|
|
124
|
-
|
|
125
|
-
lines: list[str] = []
|
|
126
|
-
append = lines.append
|
|
115
|
+
def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
|
|
116
|
+
buffer = b""
|
|
117
|
+
while byt := stream.read(1): # Read one byte at a time
|
|
118
|
+
buffer += byt
|
|
127
119
|
|
|
128
|
-
|
|
129
|
-
buffer = b""
|
|
130
|
-
while byt := stream.read(1): # Read one byte at a time
|
|
131
|
-
buffer += byt.encode("utf-8") if isinstance(byt, str) else byt
|
|
132
|
-
|
|
133
|
-
if byt in (b"\n", b"\r"): # Check for newline or carriage return
|
|
134
|
-
line = buffer.decode("utf-8")
|
|
135
|
-
print(line, end="")
|
|
136
|
-
callback(line)
|
|
137
|
-
append(line)
|
|
138
|
-
buffer = b"" # Clear buffer for next line
|
|
139
|
-
|
|
140
|
-
if buffer: # Handle any remaining data in the buffer
|
|
120
|
+
if byt in (b"\n", b"\r"): # Check for newline or carriage return
|
|
141
121
|
line = buffer.decode("utf-8")
|
|
142
|
-
print(line, end="")
|
|
143
122
|
callback(line)
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
thread = Thread(target=loop, daemon=True)
|
|
147
|
-
thread.start()
|
|
148
|
-
|
|
149
|
-
try:
|
|
150
|
-
yield lines
|
|
151
|
-
finally:
|
|
152
|
-
thread.join()
|
|
123
|
+
buffer = b"" # Clear buffer for next line
|
|
153
124
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
version: Optional[int]
|
|
158
|
-
output: str
|
|
125
|
+
if buffer: # Handle any remaining data in the buffer
|
|
126
|
+
line = buffer.decode("utf-8")
|
|
127
|
+
callback(line)
|
|
159
128
|
|
|
160
129
|
|
|
161
130
|
class DatasetRowsFetcher(NodesThreadPool):
|
|
@@ -569,12 +538,6 @@ def find_column_to_str( # noqa: PLR0911
|
|
|
569
538
|
return ""
|
|
570
539
|
|
|
571
540
|
|
|
572
|
-
def form_module_source(source_ast):
|
|
573
|
-
module = ast.Module(body=source_ast, type_ignores=[])
|
|
574
|
-
module = ast.fix_missing_locations(module)
|
|
575
|
-
return ast.unparse(module)
|
|
576
|
-
|
|
577
|
-
|
|
578
541
|
class Catalog:
|
|
579
542
|
def __init__(
|
|
580
543
|
self,
|
|
@@ -658,34 +621,8 @@ class Catalog:
|
|
|
658
621
|
),
|
|
659
622
|
]
|
|
660
623
|
code_ast.body[-1:] = new_expressions
|
|
661
|
-
else:
|
|
662
|
-
raise Exception("Last line in a script was not an expression")
|
|
663
624
|
return code_ast
|
|
664
625
|
|
|
665
|
-
def compile_query_script(
|
|
666
|
-
self, script: str, feature_module_name: str
|
|
667
|
-
) -> tuple[Union[str, None], str]:
|
|
668
|
-
code_ast = ast.parse(script)
|
|
669
|
-
code_ast = self.attach_query_wrapper(code_ast)
|
|
670
|
-
finder = SubclassFinder(FEATURE_CLASSES)
|
|
671
|
-
finder.visit(code_ast)
|
|
672
|
-
|
|
673
|
-
if not finder.feature_class:
|
|
674
|
-
main_module = form_module_source([*finder.imports, *finder.main_body])
|
|
675
|
-
return None, main_module
|
|
676
|
-
|
|
677
|
-
feature_import = ast.ImportFrom(
|
|
678
|
-
module=feature_module_name,
|
|
679
|
-
names=[ast.alias(name="*", asname=None)],
|
|
680
|
-
level=0,
|
|
681
|
-
)
|
|
682
|
-
feature_module = form_module_source([*finder.imports, *finder.feature_class])
|
|
683
|
-
main_module = form_module_source(
|
|
684
|
-
[*finder.imports, feature_import, *finder.main_body]
|
|
685
|
-
)
|
|
686
|
-
|
|
687
|
-
return feature_module, main_module
|
|
688
|
-
|
|
689
626
|
def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
|
|
690
627
|
config = config or self.client_config
|
|
691
628
|
return Client.parse_url(uri, self.cache, **config)
|
|
@@ -1416,7 +1353,8 @@ class Catalog:
|
|
|
1416
1353
|
|
|
1417
1354
|
for d in datasets:
|
|
1418
1355
|
yield from (
|
|
1419
|
-
(d, v, jobs.get(v.job_id) if v.job_id else None)
|
|
1356
|
+
(d, v, jobs.get(str(v.job_id)) if v.job_id else None)
|
|
1357
|
+
for v in d.versions
|
|
1420
1358
|
)
|
|
1421
1359
|
|
|
1422
1360
|
def ls_dataset_rows(
|
|
@@ -1834,14 +1772,15 @@ class Catalog:
|
|
|
1834
1772
|
def query(
|
|
1835
1773
|
self,
|
|
1836
1774
|
query_script: str,
|
|
1837
|
-
|
|
1838
|
-
python_executable:
|
|
1775
|
+
env: Optional[Mapping[str, str]] = None,
|
|
1776
|
+
python_executable: str = sys.executable,
|
|
1839
1777
|
save: bool = False,
|
|
1840
1778
|
capture_output: bool = True,
|
|
1841
1779
|
output_hook: Callable[[str], None] = noop,
|
|
1842
1780
|
params: Optional[dict[str, str]] = None,
|
|
1843
1781
|
job_id: Optional[str] = None,
|
|
1844
|
-
|
|
1782
|
+
_execute_last_expression: bool = False,
|
|
1783
|
+
) -> None:
|
|
1845
1784
|
"""
|
|
1846
1785
|
Method to run custom user Python script to run a query and, as result,
|
|
1847
1786
|
creates new dataset from the results of a query.
|
|
@@ -1864,170 +1803,51 @@ class Catalog:
|
|
|
1864
1803
|
C.size > 1000
|
|
1865
1804
|
)
|
|
1866
1805
|
"""
|
|
1867
|
-
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
1873
|
-
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
query_script,
|
|
1877
|
-
envs,
|
|
1878
|
-
feature_file,
|
|
1879
|
-
capture_output,
|
|
1880
|
-
feature_module,
|
|
1881
|
-
output_hook,
|
|
1882
|
-
params,
|
|
1883
|
-
save,
|
|
1884
|
-
job_id,
|
|
1885
|
-
)
|
|
1886
|
-
finally:
|
|
1887
|
-
feature_file.close()
|
|
1888
|
-
os.unlink(feature_file.name)
|
|
1889
|
-
|
|
1890
|
-
output = "".join(lines)
|
|
1891
|
-
|
|
1892
|
-
if proc.returncode:
|
|
1893
|
-
if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
|
|
1894
|
-
raise QueryScriptCancelError(
|
|
1895
|
-
"Query script was canceled by user",
|
|
1896
|
-
return_code=proc.returncode,
|
|
1897
|
-
output=output,
|
|
1898
|
-
)
|
|
1899
|
-
if proc.returncode == QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE:
|
|
1900
|
-
raise QueryScriptRunError(
|
|
1901
|
-
"Last line in a script was not an instance of DataChain",
|
|
1902
|
-
return_code=proc.returncode,
|
|
1903
|
-
output=output,
|
|
1904
|
-
)
|
|
1905
|
-
raise QueryScriptRunError(
|
|
1906
|
-
f"Query script exited with error code {proc.returncode}",
|
|
1907
|
-
return_code=proc.returncode,
|
|
1908
|
-
output=output,
|
|
1909
|
-
)
|
|
1910
|
-
|
|
1911
|
-
try:
|
|
1912
|
-
result = json.loads(response_text)
|
|
1913
|
-
except ValueError:
|
|
1914
|
-
result = None
|
|
1915
|
-
|
|
1916
|
-
dataset: Optional[DatasetRecord] = None
|
|
1917
|
-
version: Optional[int] = None
|
|
1918
|
-
if save:
|
|
1919
|
-
dataset, version = self.save_result(
|
|
1920
|
-
query_script, result, output, version, job_id
|
|
1921
|
-
)
|
|
1922
|
-
|
|
1923
|
-
return QueryResult(dataset=dataset, version=version, output=output)
|
|
1924
|
-
|
|
1925
|
-
def run_query(
|
|
1926
|
-
self,
|
|
1927
|
-
python_executable: str,
|
|
1928
|
-
query_script: str,
|
|
1929
|
-
envs: Optional[Mapping[str, str]],
|
|
1930
|
-
feature_file: IO[bytes],
|
|
1931
|
-
capture_output: bool,
|
|
1932
|
-
feature_module: str,
|
|
1933
|
-
output_hook: Callable[[str], None],
|
|
1934
|
-
params: Optional[dict[str, str]],
|
|
1935
|
-
save: bool,
|
|
1936
|
-
job_id: Optional[str],
|
|
1937
|
-
) -> tuple[list[str], subprocess.Popen, str]:
|
|
1938
|
-
try:
|
|
1939
|
-
feature_code, query_script_compiled = self.compile_query_script(
|
|
1940
|
-
query_script, feature_module[:-3]
|
|
1941
|
-
)
|
|
1942
|
-
if feature_code:
|
|
1943
|
-
feature_file.write(feature_code.encode())
|
|
1944
|
-
feature_file.flush()
|
|
1945
|
-
|
|
1946
|
-
except Exception as exc:
|
|
1947
|
-
raise QueryScriptCompileError(
|
|
1948
|
-
f"Query script failed to compile, reason: {exc}"
|
|
1949
|
-
) from exc
|
|
1950
|
-
r, w = os.pipe()
|
|
1951
|
-
if os.name == "nt":
|
|
1952
|
-
import msvcrt
|
|
1953
|
-
|
|
1954
|
-
os.set_inheritable(w, True)
|
|
1955
|
-
|
|
1956
|
-
startupinfo = subprocess.STARTUPINFO() # type: ignore[attr-defined]
|
|
1957
|
-
handle = msvcrt.get_osfhandle(w) # type: ignore[attr-defined]
|
|
1958
|
-
startupinfo.lpAttributeList["handle_list"].append(handle)
|
|
1959
|
-
kwargs: dict[str, Any] = {"startupinfo": startupinfo}
|
|
1806
|
+
if _execute_last_expression:
|
|
1807
|
+
try:
|
|
1808
|
+
code_ast = ast.parse(query_script)
|
|
1809
|
+
code_ast = self.attach_query_wrapper(code_ast)
|
|
1810
|
+
query_script_compiled = ast.unparse(code_ast)
|
|
1811
|
+
except Exception as exc:
|
|
1812
|
+
raise QueryScriptCompileError(
|
|
1813
|
+
f"Query script failed to compile, reason: {exc}"
|
|
1814
|
+
) from exc
|
|
1960
1815
|
else:
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
{feature_module: feature_code}
|
|
1967
|
-
)
|
|
1968
|
-
envs.update(
|
|
1816
|
+
query_script_compiled = query_script
|
|
1817
|
+
assert not save
|
|
1818
|
+
|
|
1819
|
+
env = dict(env or os.environ)
|
|
1820
|
+
env.update(
|
|
1969
1821
|
{
|
|
1970
1822
|
"DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
|
|
1971
1823
|
"PYTHONPATH": os.getcwd(), # For local imports
|
|
1972
1824
|
"DATACHAIN_QUERY_SAVE": "1" if save else "",
|
|
1973
1825
|
"PYTHONUNBUFFERED": "1",
|
|
1974
|
-
"DATACHAIN_OUTPUT_FD": str(handle),
|
|
1975
1826
|
"DATACHAIN_JOB_ID": job_id or "",
|
|
1976
1827
|
},
|
|
1977
1828
|
)
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
|
|
1984
|
-
|
|
1985
|
-
|
|
1986
|
-
|
|
1987
|
-
|
|
1988
|
-
|
|
1989
|
-
|
|
1990
|
-
|
|
1991
|
-
|
|
1992
|
-
|
|
1993
|
-
|
|
1994
|
-
|
|
1995
|
-
|
|
1996
|
-
|
|
1997
|
-
|
|
1998
|
-
|
|
1999
|
-
return lines, proc, response_text
|
|
2000
|
-
|
|
2001
|
-
def save_result(self, query_script, exec_result, output, version, job_id):
|
|
2002
|
-
if not exec_result:
|
|
2003
|
-
raise QueryScriptDatasetNotFound(
|
|
2004
|
-
"No dataset found after running Query script",
|
|
2005
|
-
output=output,
|
|
1829
|
+
popen_kwargs = {}
|
|
1830
|
+
if capture_output:
|
|
1831
|
+
popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
|
|
1832
|
+
|
|
1833
|
+
cmd = [python_executable, "-c", query_script_compiled]
|
|
1834
|
+
with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc: # type: ignore[call-overload] # noqa: S603
|
|
1835
|
+
if capture_output:
|
|
1836
|
+
args = (proc.stdout, output_hook)
|
|
1837
|
+
thread = Thread(target=_process_stream, args=args, daemon=True)
|
|
1838
|
+
thread.start()
|
|
1839
|
+
thread.join() # wait for the reader thread
|
|
1840
|
+
|
|
1841
|
+
if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
|
|
1842
|
+
raise QueryScriptCancelError(
|
|
1843
|
+
"Query script was canceled by user",
|
|
1844
|
+
return_code=proc.returncode,
|
|
1845
|
+
)
|
|
1846
|
+
if proc.returncode:
|
|
1847
|
+
raise QueryScriptRunError(
|
|
1848
|
+
f"Query script exited with error code {proc.returncode}",
|
|
1849
|
+
return_code=proc.returncode,
|
|
2006
1850
|
)
|
|
2007
|
-
name, version = exec_result
|
|
2008
|
-
# finding returning dataset
|
|
2009
|
-
try:
|
|
2010
|
-
dataset = self.get_dataset(name)
|
|
2011
|
-
dataset.get_version(version)
|
|
2012
|
-
except (DatasetNotFoundError, ValueError) as e:
|
|
2013
|
-
raise QueryScriptDatasetNotFound(
|
|
2014
|
-
"No dataset found after running Query script",
|
|
2015
|
-
output=output,
|
|
2016
|
-
) from e
|
|
2017
|
-
dataset = self.update_dataset(
|
|
2018
|
-
dataset,
|
|
2019
|
-
script_output=output,
|
|
2020
|
-
query_script=query_script,
|
|
2021
|
-
)
|
|
2022
|
-
self.update_dataset_version_with_warehouse_info(
|
|
2023
|
-
dataset,
|
|
2024
|
-
version,
|
|
2025
|
-
script_output=output,
|
|
2026
|
-
query_script=query_script,
|
|
2027
|
-
job_id=job_id,
|
|
2028
|
-
is_job_result=True,
|
|
2029
|
-
)
|
|
2030
|
-
return dataset, version
|
|
2031
1851
|
|
|
2032
1852
|
def cp(
|
|
2033
1853
|
self,
|