datachain 0.3.11__tar.gz → 0.3.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.3.11 → datachain-0.3.12}/.pre-commit-config.yaml +1 -1
- {datachain-0.3.11/src/datachain.egg-info → datachain-0.3.12}/PKG-INFO +6 -7
- {datachain-0.3.11 → datachain-0.3.12}/README.rst +5 -5
- datachain-0.3.12/docs/assets/datachain-white.svg +1 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/get_started/udfs/stateful.py +4 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/multimodal/wds.py +1 -1
- {datachain-0.3.11 → datachain-0.3.12}/mkdocs.yml +2 -2
- {datachain-0.3.11 → datachain-0.3.12}/pyproject.toml +4 -2
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/catalog/catalog.py +49 -143
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/error.py +0 -4
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/clip.py +1 -1
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/dc.py +17 -4
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/file.py +9 -8
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/image.py +1 -1
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/meta_formats.py +4 -8
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/model_store.py +6 -1
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/text.py +1 -1
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/webdataset.py +13 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/webdataset_laion.py +13 -0
- {datachain-0.3.11 → datachain-0.3.12/src/datachain.egg-info}/PKG-INFO +6 -7
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain.egg-info/SOURCES.txt +1 -1
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain.egg-info/requires.txt +0 -1
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_catalog.py +17 -37
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_datasets.py +0 -2
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_meta_formats.py +0 -1
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_pytorch.py +10 -3
- datachain-0.3.12/tests/func/test_query.py +173 -0
- datachain-0.3.11/docs/assets/datachain_logotype.svg +0 -33
- datachain-0.3.11/tests/func/test_query.py +0 -377
- {datachain-0.3.11 → datachain-0.3.12}/.cruft.json +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.gitattributes +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.github/codecov.yaml +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.github/dependabot.yml +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.github/workflows/release.yml +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.github/workflows/tests.yml +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/.gitignore +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/LICENSE +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/docs/assets/datachain.svg +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/docs/index.md +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/docs/references/datachain.md +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/docs/references/datatype.md +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/docs/references/file.md +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/docs/references/index.md +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/docs/references/sql.md +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/docs/references/torch.md +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/docs/references/udf.md +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/noxfile.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/setup.cfg +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/__main__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/asyn.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/cache.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/cli.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/client/hf.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/client/local.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/config.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/dataset.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/job.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/hf.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/listing.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/listing.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/node.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/progress.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/py.typed +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/query/builtins.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/query/dataset.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/query/params.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/query/schema.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/query/session.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/query/udf.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/types.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/storage.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain/utils.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/conftest.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/data.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/examples/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_client.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_datachain.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_listing.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_ls.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_metrics.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/func/test_pull.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_cache.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_client.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_session.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_udf.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.11 → datachain-0.3.12}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.3.11
|
|
3
|
+
Version: 0.3.12
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -71,7 +71,6 @@ Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
|
71
71
|
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
72
72
|
Requires-Dist: pytest-servers[all]>=0.5.5; extra == "tests"
|
|
73
73
|
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
74
|
-
Requires-Dist: pytest-asyncio>=0.23.2; extra == "tests"
|
|
75
74
|
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
76
75
|
Requires-Dist: virtualenv; extra == "tests"
|
|
77
76
|
Requires-Dist: dulwich; extra == "tests"
|
|
@@ -96,12 +95,14 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
|
96
95
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
97
96
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
98
97
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
98
|
+
================
|
|
99
|
+
|logo| DataChain
|
|
100
|
+
================
|
|
102
101
|
|
|
103
102
|
|PyPI| |Python Version| |Codecov| |Tests|
|
|
104
103
|
|
|
104
|
+
.. |logo| image:: docs/assets/datachain.svg
|
|
105
|
+
:height: 24
|
|
105
106
|
.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
|
|
106
107
|
:target: https://pypi.org/project/datachain/
|
|
107
108
|
:alt: PyPI
|
|
@@ -115,8 +116,6 @@ Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
|
115
116
|
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
116
117
|
:alt: Tests
|
|
117
118
|
|
|
118
|
-
----------------
|
|
119
|
-
|
|
120
119
|
DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
|
|
121
120
|
It is made to organize your unstructured data into datasets and wrangle it at scale on
|
|
122
121
|
your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
================
|
|
2
|
+
|logo| DataChain
|
|
3
|
+
================
|
|
4
4
|
|
|
5
5
|
|PyPI| |Python Version| |Codecov| |Tests|
|
|
6
6
|
|
|
7
|
+
.. |logo| image:: docs/assets/datachain.svg
|
|
8
|
+
:height: 24
|
|
7
9
|
.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
|
|
8
10
|
:target: https://pypi.org/project/datachain/
|
|
9
11
|
:alt: PyPI
|
|
@@ -17,8 +19,6 @@
|
|
|
17
19
|
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
18
20
|
:alt: Tests
|
|
19
21
|
|
|
20
|
-
----------------
|
|
21
|
-
|
|
22
22
|
DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
|
|
23
23
|
It is made to organize your unstructured data into datasets and wrangle it at scale on
|
|
24
24
|
your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
<svg width="180" height="33" fill="none" xmlns="http://www.w3.org/2000/svg"><style>.prefix__logo-fill{fill:#fff}</style><path fill-rule="evenodd" clip-rule="evenodd" d="M23.997 24.53l2.34-2.342 3.14 3.135-2.357 2.342a5.533 5.533 0 01-7.822 0l-4.704-4.7a5.536 5.536 0 010-7.823l4.76-4.763 3.124 3.14-4.745 4.747a1.106 1.106 0 000 1.57l4.699 4.694a1.107 1.107 0 001.565 0z" fill="url(#prefix__paint0_linear_449_28)"/><path fill-rule="evenodd" clip-rule="evenodd" d="M37.733 10.65a1.184 1.184 0 01-.234.357L26.253 22.255l3.13 3.135 11.234-11.242a5.536 5.536 0 000-7.824l-4.699-4.705a5.534 5.534 0 00-7.822 0l-3.278 3.263 3.134 3.135 3.268-3.268a1.107 1.107 0 011.564 0l4.694 4.694a1.108 1.108 0 01.244 1.208h.011z" fill="url(#prefix__paint1_linear_449_28)"/><path d="M24.54 14.722L22.2 17.063v.016l-2.405 2.388 3.14 3.134 4.741-4.75a5.534 5.534 0 000-7.822l-4.704-4.704a5.535 5.535 0 00-7.824 0l-5.955 5.954 3.14 3.13 5.944-5.945a1.107 1.107 0 011.565 0l4.7 4.694a1.107 1.107 0 010 1.564z" fill="url(#prefix__paint2_linear_449_28)"/><path d="M4.514 22.335c.054-.133.139-.256.24-.357L7.1 19.632l-.005-.011 3.14-3.129 2.147-2.135-3.135-3.14-7.629 7.638a5.534 5.534 0 000 7.822l4.705 4.704a5.536 5.536 0 007.824 0l3.175-3.18-3.134-3.13-3.165 3.165a1.106 1.106 0 01-1.57 0l-4.7-4.693a1.107 1.107 0 01-.24-1.208z" fill="url(#prefix__paint3_linear_449_28)"/><path d="M55.645 26.613c-.994 0-1.908-.182-2.745-.547a6.407 6.407 0 01-2.169-1.538 7.037 7.037 0 01-1.41-2.294 8.126 8.126 0 01-.497-2.867v-.547c0-1.008.157-1.955.47-2.841a7.478 7.478 0 011.36-2.32 6.201 6.201 0 012.116-1.538c.836-.382 1.76-.573 2.77-.573 1.115 0 2.09.243 2.927.73.854.469 1.533 1.181 2.038 2.137.506.956.784 2.155.837 3.597L60.27 16.76V7.117h3.633v19.027h-2.875v-6.02h.627c-.052 1.441-.348 2.649-.888 3.622-.54.956-1.255 1.677-2.143 2.163-.871.47-1.864.704-2.98.704zm.81-3.05c.714 0 1.367-.156 1.96-.469.592-.33 1.063-.799 1.41-1.407.367-.626.55-1.355.55-2.19v-1.042c0-.834-.183-1.53-.55-2.085a3.572 3.572 0 00-1.436-1.303 4.078 
4.078 0 00-1.934-.47c-.784 0-1.481.192-2.091.574-.592.365-1.063.886-1.411 1.564-.331.678-.497 1.468-.497 2.372 0 .903.174 1.694.523 2.372.348.66.819 1.172 1.411 1.537.61.365 1.298.548 2.065.548zM76.635 26.144v-4.196h-.6v-4.666c0-.817-.201-1.425-.602-1.824-.4-.4-1.019-.6-1.855-.6a68.629 68.629 0 00-3.423.104c-.61.018-1.16.044-1.647.079v-3.076c.4-.035.854-.07 1.359-.104.505-.035 1.02-.052 1.542-.052.54-.018 1.045-.026 1.515-.026 1.464 0 2.675.19 3.633.573.976.382 1.707.982 2.195 1.799.505.816.758 1.885.758 3.205v8.784h-2.875zm-4.573.365c-1.028 0-1.934-.183-2.718-.547a4.274 4.274 0 01-1.803-1.564c-.418-.678-.627-1.495-.627-2.45 0-1.043.252-1.894.758-2.555.522-.66 1.245-1.155 2.169-1.485.94-.33 2.038-.495 3.293-.495h3.292v2.163h-3.345c-.836 0-1.48.208-1.934.625-.435.4-.653.921-.653 1.564s.218 1.164.653 1.564c.453.4 1.098.6 1.934.6.505 0 .967-.087 1.385-.261a2.413 2.413 0 001.072-.938c.296-.452.462-1.06.496-1.825l.889 1.017c-.087.99-.331 1.824-.732 2.502a3.899 3.899 0 01-1.62 1.564c-.68.347-1.516.52-2.509.52zM89.569 26.326c-1.307 0-2.387-.165-3.24-.495a3.635 3.635 0 01-1.882-1.72c-.419-.817-.628-1.911-.628-3.284l.026-12.824h3.398l-.026 13.058c0 .695.183 1.234.548 1.616.384.365.924.548 1.62.548h2.222v3.101h-2.038zM81.572 14.65V11.99h10.035v2.659H81.572zM103.203 26.144v-4.196h-.601v-4.666c0-.817-.201-1.425-.601-1.824-.401-.4-1.02-.6-1.856-.6a68.629 68.629 0 00-3.423.104c-.61.018-1.159.044-1.647.079v-3.076c.4-.035.854-.07 1.36-.104.504-.035 1.018-.052 1.541-.052.54-.018 1.045-.026 1.516-.026 1.463 0 2.674.19 3.632.573.976.382 1.708.982 2.196 1.799.505.816.757 1.885.757 3.205v8.784h-2.874zm-4.574.365c-1.028 0-1.934-.183-2.718-.547a4.274 4.274 0 01-1.803-1.564c-.418-.678-.627-1.495-.627-2.45 0-1.043.253-1.894.758-2.555.523-.66 1.246-1.155 2.169-1.485.94-.33 2.038-.495 3.293-.495h3.293v2.163h-3.345c-.837 0-1.481.208-1.934.625-.436.4-.654.921-.654 1.564s.218 1.164.654 1.564c.453.4 1.097.6 1.934.6.505 0 .966-.087 1.385-.261a2.417 2.417 0 
001.071-.938c.296-.452.462-1.06.497-1.825l.888 1.017c-.087.99-.331 1.824-.732 2.502a3.897 3.897 0 01-1.62 1.564c-.679.347-1.516.52-2.509.52zM116.267 26.64c-1.237 0-2.309-.21-3.215-.626a6.773 6.773 0 01-2.247-1.668A7.117 7.117 0 01109.472 22a8.19 8.19 0 01-.444-2.659v-.495c0-.956.148-1.868.444-2.737a6.905 6.905 0 011.385-2.346 6.488 6.488 0 012.247-1.642c.906-.417 1.952-.625 3.136-.625 1.237 0 2.344.243 3.319.73.976.469 1.751 1.13 2.326 1.98.593.852.924 1.843.993 2.972h-3.528a2.824 2.824 0 00-.941-1.825c-.522-.486-1.245-.73-2.169-.73-.801 0-1.472.192-2.012.574-.523.382-.915.912-1.176 1.59-.261.66-.392 1.425-.392 2.294 0 .834.122 1.59.366 2.267.261.678.653 1.208 1.176 1.59.54.382 1.228.574 2.065.574.627 0 1.167-.114 1.62-.34.453-.225.81-.538 1.071-.938.279-.4.453-.851.523-1.355h3.528c-.07 1.147-.409 2.155-1.019 3.023-.593.852-1.385 1.52-2.378 2.007-.976.487-2.091.73-3.345.73zM125.919 26.144V7.117h3.633v11.104h-.628c0-1.425.183-2.633.549-3.623.366-.99.906-1.747 1.62-2.268.732-.521 1.656-.782 2.771-.782h.156c1.621 0 2.849.556 3.685 1.668.836 1.112 1.255 2.728 1.255 4.848v8.08h-3.633v-8.419c0-.903-.261-1.616-.784-2.137-.505-.521-1.176-.782-2.012-.782-.889 0-1.612.296-2.169.886-.54.574-.81 1.33-.81 2.268v8.184h-3.633zM151.463 26.144v-4.196h-.601v-4.666c0-.817-.201-1.425-.601-1.824-.401-.4-1.02-.6-1.856-.6a68.524 68.524 0 00-3.423.104c-.61.018-1.159.044-1.647.079v-3.076c.401-.035.854-.07 1.359-.104a22.491 22.491 0 011.542-.052c.54-.018 1.045-.026 1.516-.026 1.463 0 2.674.19 3.632.573.976.382 1.708.982 2.196 1.799.505.816.757 1.885.757 3.205v8.784h-2.874zm-4.574.365c-1.027 0-1.933-.183-2.717-.547a4.277 4.277 0 01-1.804-1.564c-.418-.678-.627-1.495-.627-2.45 0-1.043.253-1.894.758-2.555.523-.66 1.246-1.155 2.169-1.485.941-.33 2.038-.495 3.293-.495h3.293v2.163h-3.345c-.837 0-1.481.208-1.934.625-.436.4-.654.921-.654 1.564s.218 1.164.654 1.564c.453.4 1.097.6 1.934.6.505 0 .967-.087 1.385-.261a2.417 2.417 0 001.071-.938c.296-.452.462-1.06.497-1.825l.888 1.017c-.087.99-.331 
1.824-.731 2.502a3.905 3.905 0 01-1.621 1.564c-.679.347-1.516.52-2.509.52zM158.908 26.144V11.99h3.632v14.153h-3.632zm-1.986-11.442v-2.71h5.618v2.71h-5.618zm3.319-4.405c-.715 0-1.246-.183-1.594-.547-.331-.383-.497-.86-.497-1.434 0-.573.166-1.042.497-1.407.348-.365.879-.548 1.594-.548.714 0 1.237.183 1.568.548.331.365.496.834.496 1.407 0 .574-.165 1.051-.496 1.434-.331.364-.854.547-1.568.547zM166.727 26.144V11.99h2.875v6.073h-.262c0-1.442.192-2.641.575-3.597.384-.973.95-1.703 1.699-2.19.766-.486 1.716-.729 2.848-.729h.157c1.69 0 2.971.547 3.842 1.642.871 1.077 1.307 2.693 1.307 4.848v8.106h-3.633v-8.419c0-.869-.253-1.572-.758-2.11-.488-.54-1.167-.809-2.038-.809-.889 0-1.612.278-2.169.834-.54.539-.811 1.269-.811 2.19v8.314h-3.632z" class="prefix__logo-fill"/><defs><linearGradient id="prefix__paint0_linear_449_28" x1="36.032" y1="5.404" x2="18.067" y2="23.054" gradientUnits="userSpaceOnUse"><stop stop-color="#F46837"/><stop offset="1" stop-color="#945DD6"/></linearGradient><linearGradient id="prefix__paint1_linear_449_28" x1="36.045" y1="5.607" x2="18.067" y2="23.363" gradientUnits="userSpaceOnUse"><stop stop-color="#F46837"/><stop offset="1" stop-color="#945DD6"/></linearGradient><linearGradient id="prefix__paint2_linear_449_28" x1="5.924" y1="27.432" x2="23.883" y2="10.239" gradientUnits="userSpaceOnUse"><stop stop-color="#13ADC7"/><stop offset="1" stop-color="#945DD6"/></linearGradient><linearGradient id="prefix__paint3_linear_449_28" x1="5.77" y1="27.586" x2="23.574" y2="9.776" gradientUnits="userSpaceOnUse"><stop stop-color="#13ADC7"/><stop offset="1" stop-color="#945DD6"/></linearGradient></defs></svg>
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
site_name: ''
|
|
2
|
-
site_url: https://datachain.
|
|
2
|
+
site_url: https://docs.datachain.ai
|
|
3
3
|
site_description: Wrangle unstructured AI data at scale
|
|
4
4
|
|
|
5
5
|
repo_url: "https://github.com/iterative/datachain"
|
|
@@ -15,7 +15,7 @@ validation:
|
|
|
15
15
|
|
|
16
16
|
theme:
|
|
17
17
|
name: material
|
|
18
|
-
logo: assets/datachain_logotype.svg
|
|
18
|
+
logo: assets/datachain-white.svg
|
|
19
19
|
favicon: assets/datachain.svg
|
|
20
20
|
icon:
|
|
21
21
|
repo: fontawesome/brands/github
|
|
@@ -82,7 +82,6 @@ tests = [
|
|
|
82
82
|
"pytest-mock>=3.12.0",
|
|
83
83
|
"pytest-servers[all]>=0.5.5",
|
|
84
84
|
"pytest-benchmark[histogram]",
|
|
85
|
-
"pytest-asyncio>=0.23.2",
|
|
86
85
|
"pytest-xdist>=3.3.1",
|
|
87
86
|
"virtualenv",
|
|
88
87
|
"dulwich",
|
|
@@ -136,13 +135,16 @@ markers = [
|
|
|
136
135
|
"llm_and_nlp: LLM and NLP examples",
|
|
137
136
|
"multimodal: Multimodal examples"
|
|
138
137
|
]
|
|
139
|
-
asyncio_mode = "auto"
|
|
140
138
|
filterwarnings = [
|
|
141
139
|
"error::pandas.errors.PerformanceWarning",
|
|
142
140
|
"error::pydantic.warnings.PydanticDeprecatedSince20",
|
|
143
141
|
"error::pytest_mock.PytestMockWarning",
|
|
144
142
|
"error::pytest.PytestCollectionWarning",
|
|
145
143
|
"error::sqlalchemy.exc.SADeprecationWarning",
|
|
144
|
+
"ignore::DeprecationWarning:timm.*",
|
|
145
|
+
"ignore::DeprecationWarning:botocore.auth",
|
|
146
|
+
"ignore::DeprecationWarning:datasets.utils._dill",
|
|
147
|
+
"ignore::DeprecationWarning:librosa.core.intervals",
|
|
146
148
|
"ignore:Field name .* shadows an attribute in parent:UserWarning" # datachain.lib.feature
|
|
147
149
|
]
|
|
148
150
|
|
|
@@ -12,7 +12,6 @@ import sys
|
|
|
12
12
|
import time
|
|
13
13
|
import traceback
|
|
14
14
|
from collections.abc import Iterable, Iterator, Mapping, Sequence
|
|
15
|
-
from contextlib import contextmanager, nullcontext
|
|
16
15
|
from copy import copy
|
|
17
16
|
from dataclasses import dataclass
|
|
18
17
|
from functools import cached_property, reduce
|
|
@@ -23,7 +22,6 @@ from typing import (
|
|
|
23
22
|
TYPE_CHECKING,
|
|
24
23
|
Any,
|
|
25
24
|
Callable,
|
|
26
|
-
NamedTuple,
|
|
27
25
|
NoReturn,
|
|
28
26
|
Optional,
|
|
29
27
|
Union,
|
|
@@ -58,7 +56,6 @@ from datachain.error import (
|
|
|
58
56
|
PendingIndexingError,
|
|
59
57
|
QueryScriptCancelError,
|
|
60
58
|
QueryScriptCompileError,
|
|
61
|
-
QueryScriptDatasetNotFound,
|
|
62
59
|
QueryScriptRunError,
|
|
63
60
|
)
|
|
64
61
|
from datachain.listing import Listing
|
|
@@ -115,44 +112,19 @@ def noop(_: str):
|
|
|
115
112
|
pass
|
|
116
113
|
|
|
117
114
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
stream:
|
|
121
|
-
|
|
122
|
-
lines: list[str] = []
|
|
123
|
-
append = lines.append
|
|
115
|
+
def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
|
|
116
|
+
buffer = b""
|
|
117
|
+
while byt := stream.read(1): # Read one byte at a time
|
|
118
|
+
buffer += byt
|
|
124
119
|
|
|
125
|
-
|
|
126
|
-
buffer = b""
|
|
127
|
-
while byt := stream.read(1): # Read one byte at a time
|
|
128
|
-
buffer += byt.encode("utf-8") if isinstance(byt, str) else byt
|
|
129
|
-
|
|
130
|
-
if byt in (b"\n", b"\r"): # Check for newline or carriage return
|
|
131
|
-
line = buffer.decode("utf-8")
|
|
132
|
-
print(line, end="")
|
|
133
|
-
callback(line)
|
|
134
|
-
append(line)
|
|
135
|
-
buffer = b"" # Clear buffer for next line
|
|
136
|
-
|
|
137
|
-
if buffer: # Handle any remaining data in the buffer
|
|
120
|
+
if byt in (b"\n", b"\r"): # Check for newline or carriage return
|
|
138
121
|
line = buffer.decode("utf-8")
|
|
139
|
-
print(line, end="")
|
|
140
122
|
callback(line)
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
thread = Thread(target=loop, daemon=True)
|
|
144
|
-
thread.start()
|
|
145
|
-
|
|
146
|
-
try:
|
|
147
|
-
yield lines
|
|
148
|
-
finally:
|
|
149
|
-
thread.join()
|
|
150
|
-
|
|
123
|
+
buffer = b"" # Clear buffer for next line
|
|
151
124
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
output: str
|
|
125
|
+
if buffer: # Handle any remaining data in the buffer
|
|
126
|
+
line = buffer.decode("utf-8")
|
|
127
|
+
callback(line)
|
|
156
128
|
|
|
157
129
|
|
|
158
130
|
class DatasetRowsFetcher(NodesThreadPool):
|
|
@@ -651,11 +623,6 @@ class Catalog:
|
|
|
651
623
|
code_ast.body[-1:] = new_expressions
|
|
652
624
|
return code_ast
|
|
653
625
|
|
|
654
|
-
def compile_query_script(self, script: str) -> str:
|
|
655
|
-
code_ast = ast.parse(script)
|
|
656
|
-
code_ast = self.attach_query_wrapper(code_ast)
|
|
657
|
-
return ast.unparse(code_ast)
|
|
658
|
-
|
|
659
626
|
def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
|
|
660
627
|
config = config or self.client_config
|
|
661
628
|
return Client.parse_url(uri, self.cache, **config)
|
|
@@ -1805,14 +1772,15 @@ class Catalog:
|
|
|
1805
1772
|
def query(
|
|
1806
1773
|
self,
|
|
1807
1774
|
query_script: str,
|
|
1808
|
-
|
|
1809
|
-
python_executable:
|
|
1775
|
+
env: Optional[Mapping[str, str]] = None,
|
|
1776
|
+
python_executable: str = sys.executable,
|
|
1810
1777
|
save: bool = False,
|
|
1811
1778
|
capture_output: bool = True,
|
|
1812
1779
|
output_hook: Callable[[str], None] = noop,
|
|
1813
1780
|
params: Optional[dict[str, str]] = None,
|
|
1814
1781
|
job_id: Optional[str] = None,
|
|
1815
|
-
|
|
1782
|
+
_execute_last_expression: bool = False,
|
|
1783
|
+
) -> None:
|
|
1816
1784
|
"""
|
|
1817
1785
|
Method to run custom user Python script to run a query and, as result,
|
|
1818
1786
|
creates new dataset from the results of a query.
|
|
@@ -1835,92 +1803,21 @@ class Catalog:
|
|
|
1835
1803
|
C.size > 1000
|
|
1836
1804
|
)
|
|
1837
1805
|
"""
|
|
1838
|
-
if
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
|
|
1848
|
-
|
|
1849
|
-
|
|
1850
|
-
envs,
|
|
1851
|
-
capture_output,
|
|
1852
|
-
output_hook,
|
|
1853
|
-
params,
|
|
1854
|
-
save,
|
|
1855
|
-
job_id,
|
|
1856
|
-
)
|
|
1857
|
-
output = "".join(lines)
|
|
1858
|
-
|
|
1859
|
-
if proc.returncode:
|
|
1860
|
-
if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
|
|
1861
|
-
raise QueryScriptCancelError(
|
|
1862
|
-
"Query script was canceled by user",
|
|
1863
|
-
return_code=proc.returncode,
|
|
1864
|
-
output=output,
|
|
1865
|
-
)
|
|
1866
|
-
raise QueryScriptRunError(
|
|
1867
|
-
f"Query script exited with error code {proc.returncode}",
|
|
1868
|
-
return_code=proc.returncode,
|
|
1869
|
-
output=output,
|
|
1870
|
-
)
|
|
1871
|
-
|
|
1872
|
-
def _get_dataset_versions_by_job_id():
|
|
1873
|
-
for dr, dv, job in self.list_datasets_versions():
|
|
1874
|
-
if job and str(job.id) == job_id:
|
|
1875
|
-
yield dr, dv
|
|
1876
|
-
|
|
1877
|
-
try:
|
|
1878
|
-
dr, dv = max(
|
|
1879
|
-
_get_dataset_versions_by_job_id(), key=lambda x: x[1].created_at
|
|
1880
|
-
)
|
|
1881
|
-
except ValueError as e:
|
|
1882
|
-
if not save:
|
|
1883
|
-
return QueryResult(dataset=None, version=None, output=output)
|
|
1884
|
-
|
|
1885
|
-
raise QueryScriptDatasetNotFound(
|
|
1886
|
-
"No dataset found after running Query script",
|
|
1887
|
-
output=output,
|
|
1888
|
-
) from e
|
|
1889
|
-
|
|
1890
|
-
dr = self.update_dataset(
|
|
1891
|
-
dr,
|
|
1892
|
-
script_output=output,
|
|
1893
|
-
query_script=query_script,
|
|
1894
|
-
)
|
|
1895
|
-
self.update_dataset_version_with_warehouse_info(
|
|
1896
|
-
dr,
|
|
1897
|
-
dv.version,
|
|
1898
|
-
script_output=output,
|
|
1899
|
-
query_script=query_script,
|
|
1900
|
-
job_id=job_id,
|
|
1901
|
-
is_job_result=True,
|
|
1902
|
-
)
|
|
1903
|
-
return QueryResult(dataset=dr, version=dv.version, output=output)
|
|
1806
|
+
if _execute_last_expression:
|
|
1807
|
+
try:
|
|
1808
|
+
code_ast = ast.parse(query_script)
|
|
1809
|
+
code_ast = self.attach_query_wrapper(code_ast)
|
|
1810
|
+
query_script_compiled = ast.unparse(code_ast)
|
|
1811
|
+
except Exception as exc:
|
|
1812
|
+
raise QueryScriptCompileError(
|
|
1813
|
+
f"Query script failed to compile, reason: {exc}"
|
|
1814
|
+
) from exc
|
|
1815
|
+
else:
|
|
1816
|
+
query_script_compiled = query_script
|
|
1817
|
+
assert not save
|
|
1904
1818
|
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
python_executable: str,
|
|
1908
|
-
query_script: str,
|
|
1909
|
-
envs: Optional[Mapping[str, str]],
|
|
1910
|
-
capture_output: bool,
|
|
1911
|
-
output_hook: Callable[[str], None],
|
|
1912
|
-
params: Optional[dict[str, str]],
|
|
1913
|
-
save: bool,
|
|
1914
|
-
job_id: Optional[str],
|
|
1915
|
-
) -> tuple[list[str], subprocess.Popen]:
|
|
1916
|
-
try:
|
|
1917
|
-
query_script_compiled = self.compile_query_script(query_script)
|
|
1918
|
-
except Exception as exc:
|
|
1919
|
-
raise QueryScriptCompileError(
|
|
1920
|
-
f"Query script failed to compile, reason: {exc}"
|
|
1921
|
-
) from exc
|
|
1922
|
-
envs = dict(envs or os.environ)
|
|
1923
|
-
envs.update(
|
|
1819
|
+
env = dict(env or os.environ)
|
|
1820
|
+
env.update(
|
|
1924
1821
|
{
|
|
1925
1822
|
"DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
|
|
1926
1823
|
"PYTHONPATH": os.getcwd(), # For local imports
|
|
@@ -1929,19 +1826,28 @@ class Catalog:
|
|
|
1929
1826
|
"DATACHAIN_JOB_ID": job_id or "",
|
|
1930
1827
|
},
|
|
1931
1828
|
)
|
|
1932
|
-
|
|
1933
|
-
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1829
|
+
popen_kwargs = {}
|
|
1830
|
+
if capture_output:
|
|
1831
|
+
popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
|
|
1832
|
+
|
|
1833
|
+
cmd = [python_executable, "-c", query_script_compiled]
|
|
1834
|
+
with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc: # type: ignore[call-overload] # noqa: S603
|
|
1835
|
+
if capture_output:
|
|
1836
|
+
args = (proc.stdout, output_hook)
|
|
1837
|
+
thread = Thread(target=_process_stream, args=args, daemon=True)
|
|
1838
|
+
thread.start()
|
|
1839
|
+
thread.join() # wait for the reader thread
|
|
1840
|
+
|
|
1841
|
+
if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
|
|
1842
|
+
raise QueryScriptCancelError(
|
|
1843
|
+
"Query script was canceled by user",
|
|
1844
|
+
return_code=proc.returncode,
|
|
1845
|
+
)
|
|
1846
|
+
if proc.returncode:
|
|
1847
|
+
raise QueryScriptRunError(
|
|
1848
|
+
f"Query script exited with error code {proc.returncode}",
|
|
1849
|
+
return_code=proc.returncode,
|
|
1850
|
+
)
|
|
1945
1851
|
|
|
1946
1852
|
def cp(
|
|
1947
1853
|
self,
|
|
@@ -18,7 +18,7 @@ def _get_encoder(model: Any, type: Literal["image", "text"]) -> Callable:
|
|
|
18
18
|
hasattr(model, method_name) and inspect.ismethod(getattr(model, method_name))
|
|
19
19
|
):
|
|
20
20
|
method = getattr(model, method_name)
|
|
21
|
-
return lambda x: method(torch.
|
|
21
|
+
return lambda x: method(torch.as_tensor(x).clone().detach())
|
|
22
22
|
|
|
23
23
|
# Check for model from clip or open_clip library
|
|
24
24
|
method_name = f"encode_{type}"
|
|
@@ -415,7 +415,7 @@ class DataChain(DatasetQuery):
|
|
|
415
415
|
.save(list_dataset_name, listing=True)
|
|
416
416
|
)
|
|
417
417
|
|
|
418
|
-
dc = cls.from_dataset(list_dataset_name, session=session)
|
|
418
|
+
dc = cls.from_dataset(list_dataset_name, session=session, settings=settings)
|
|
419
419
|
dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
|
|
420
420
|
|
|
421
421
|
return ls(dc, list_path, recursive=recursive, object_name=object_name)
|
|
@@ -426,6 +426,7 @@ class DataChain(DatasetQuery):
|
|
|
426
426
|
name: str,
|
|
427
427
|
version: Optional[int] = None,
|
|
428
428
|
session: Optional[Session] = None,
|
|
429
|
+
settings: Optional[dict] = None,
|
|
429
430
|
) -> "DataChain":
|
|
430
431
|
"""Get data from a saved Dataset. It returns the chain itself.
|
|
431
432
|
|
|
@@ -438,7 +439,7 @@ class DataChain(DatasetQuery):
|
|
|
438
439
|
chain = DataChain.from_dataset("my_cats")
|
|
439
440
|
```
|
|
440
441
|
"""
|
|
441
|
-
return DataChain(name=name, version=version, session=session)
|
|
442
|
+
return DataChain(name=name, version=version, session=session, settings=settings)
|
|
442
443
|
|
|
443
444
|
@classmethod
|
|
444
445
|
def from_json(
|
|
@@ -1622,6 +1623,8 @@ class DataChain(DatasetQuery):
|
|
|
1622
1623
|
model_name: str = "",
|
|
1623
1624
|
source: bool = True,
|
|
1624
1625
|
nrows=None,
|
|
1626
|
+
session: Optional[Session] = None,
|
|
1627
|
+
settings: Optional[dict] = None,
|
|
1625
1628
|
**kwargs,
|
|
1626
1629
|
) -> "DataChain":
|
|
1627
1630
|
"""Generate chain from csv files.
|
|
@@ -1638,6 +1641,8 @@ class DataChain(DatasetQuery):
|
|
|
1638
1641
|
model_name : Generated model name.
|
|
1639
1642
|
source : Whether to include info about the source file.
|
|
1640
1643
|
nrows : Optional row limit.
|
|
1644
|
+
session : Session to use for the chain.
|
|
1645
|
+
settings : Settings to use for the chain.
|
|
1641
1646
|
|
|
1642
1647
|
Example:
|
|
1643
1648
|
Reading a csv file:
|
|
@@ -1654,7 +1659,9 @@ class DataChain(DatasetQuery):
|
|
|
1654
1659
|
from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
|
|
1655
1660
|
from pyarrow.dataset import CsvFileFormat
|
|
1656
1661
|
|
|
1657
|
-
chain = DataChain.from_storage(
|
|
1662
|
+
chain = DataChain.from_storage(
|
|
1663
|
+
path, session=session, settings=settings, **kwargs
|
|
1664
|
+
)
|
|
1658
1665
|
|
|
1659
1666
|
column_names = None
|
|
1660
1667
|
if not header:
|
|
@@ -1701,6 +1708,8 @@ class DataChain(DatasetQuery):
|
|
|
1701
1708
|
object_name: str = "",
|
|
1702
1709
|
model_name: str = "",
|
|
1703
1710
|
source: bool = True,
|
|
1711
|
+
session: Optional[Session] = None,
|
|
1712
|
+
settings: Optional[dict] = None,
|
|
1704
1713
|
**kwargs,
|
|
1705
1714
|
) -> "DataChain":
|
|
1706
1715
|
"""Generate chain from parquet files.
|
|
@@ -1713,6 +1722,8 @@ class DataChain(DatasetQuery):
|
|
|
1713
1722
|
object_name : Created object column name.
|
|
1714
1723
|
model_name : Generated model name.
|
|
1715
1724
|
source : Whether to include info about the source file.
|
|
1725
|
+
session : Session to use for the chain.
|
|
1726
|
+
settings : Settings to use for the chain.
|
|
1716
1727
|
|
|
1717
1728
|
Example:
|
|
1718
1729
|
Reading a single file:
|
|
@@ -1725,7 +1736,9 @@ class DataChain(DatasetQuery):
|
|
|
1725
1736
|
dc = DataChain.from_parquet("s3://mybucket/dir")
|
|
1726
1737
|
```
|
|
1727
1738
|
"""
|
|
1728
|
-
chain = DataChain.from_storage(
|
|
1739
|
+
chain = DataChain.from_storage(
|
|
1740
|
+
path, session=session, settings=settings, **kwargs
|
|
1741
|
+
)
|
|
1729
1742
|
return chain.parse_tabular(
|
|
1730
1743
|
output=output,
|
|
1731
1744
|
object_name=object_name,
|
|
@@ -195,14 +195,15 @@ class File(DataModel):
|
|
|
195
195
|
with VFileRegistry.resolve(self, self.location) as f: # type: ignore[arg-type]
|
|
196
196
|
yield f
|
|
197
197
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
198
|
+
else:
|
|
199
|
+
uid = self.get_uid()
|
|
200
|
+
client = self._catalog.get_client(self.source)
|
|
201
|
+
if self._caching_enabled:
|
|
202
|
+
client.download(uid, callback=self._download_cb)
|
|
203
|
+
with client.open_object(
|
|
204
|
+
uid, use_cache=self._caching_enabled, cb=self._download_cb
|
|
205
|
+
) as f:
|
|
206
|
+
yield io.TextIOWrapper(f) if mode == "r" else f
|
|
206
207
|
|
|
207
208
|
def read(self, length: int = -1):
|
|
208
209
|
"""Returns file contents."""
|
|
@@ -34,7 +34,7 @@ def convert_image(
|
|
|
34
34
|
from transformers.image_processing_utils import BaseImageProcessor
|
|
35
35
|
|
|
36
36
|
if isinstance(transform, BaseImageProcessor):
|
|
37
|
-
img = torch.
|
|
37
|
+
img = torch.as_tensor(img.pixel_values[0]).clone().detach() # type: ignore[assignment,attr-defined]
|
|
38
38
|
except ImportError:
|
|
39
39
|
pass
|
|
40
40
|
if device:
|
|
@@ -1,13 +1,10 @@
|
|
|
1
|
-
# pip install datamodel-code-generator
|
|
2
|
-
# pip install jmespath
|
|
3
|
-
#
|
|
4
1
|
import csv
|
|
5
2
|
import json
|
|
6
3
|
import tempfile
|
|
7
4
|
import uuid
|
|
8
5
|
from collections.abc import Iterator
|
|
9
6
|
from pathlib import Path
|
|
10
|
-
from typing import
|
|
7
|
+
from typing import Callable
|
|
11
8
|
|
|
12
9
|
import datamodel_code_generator
|
|
13
10
|
import jmespath as jsp
|
|
@@ -85,7 +82,6 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
|
|
|
85
82
|
use_standard_collections=True,
|
|
86
83
|
)
|
|
87
84
|
epilogue = f"""
|
|
88
|
-
{model_name}.model_rebuild()
|
|
89
85
|
DataModel.register({model_name})
|
|
90
86
|
spec = {model_name}
|
|
91
87
|
"""
|
|
@@ -122,9 +118,9 @@ def read_meta( # noqa: C901
|
|
|
122
118
|
print(f"{model_output}")
|
|
123
119
|
# Below 'spec' should be a dynamically converted DataModel from Pydantic
|
|
124
120
|
if not spec:
|
|
125
|
-
|
|
126
|
-
exec(model_output,
|
|
127
|
-
spec =
|
|
121
|
+
gl = globals()
|
|
122
|
+
exec(model_output, gl) # type: ignore[arg-type] # noqa: S102
|
|
123
|
+
spec = gl["spec"]
|
|
128
124
|
|
|
129
125
|
if not (spec) and not (schema_from):
|
|
130
126
|
raise ValueError(
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import inspect
|
|
1
2
|
import logging
|
|
2
3
|
from typing import ClassVar, Optional
|
|
3
4
|
|
|
@@ -69,7 +70,11 @@ class ModelStore:
|
|
|
69
70
|
|
|
70
71
|
@staticmethod
|
|
71
72
|
def is_pydantic(val):
|
|
72
|
-
return
|
|
73
|
+
return (
|
|
74
|
+
not hasattr(val, "__origin__")
|
|
75
|
+
and inspect.isclass(val)
|
|
76
|
+
and issubclass(val, BaseModel)
|
|
77
|
+
)
|
|
73
78
|
|
|
74
79
|
@staticmethod
|
|
75
80
|
def to_pydantic(val) -> Optional[type[BaseModel]]:
|
|
@@ -33,7 +33,7 @@ def convert_text(
|
|
|
33
33
|
res = tokenizer(text)
|
|
34
34
|
|
|
35
35
|
tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
|
|
36
|
-
tokens = torch.
|
|
36
|
+
tokens = torch.as_tensor(tokens).clone().detach()
|
|
37
37
|
if device:
|
|
38
38
|
tokens = tokens.to(device)
|
|
39
39
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
import json
|
|
3
3
|
import tarfile
|
|
4
|
+
import warnings
|
|
4
5
|
from collections.abc import Iterator, Sequence
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from typing import (
|
|
@@ -19,6 +20,18 @@ from datachain.lib.data_model import DataModel
|
|
|
19
20
|
from datachain.lib.file import File, TarVFile
|
|
20
21
|
from datachain.lib.utils import DataChainError
|
|
21
22
|
|
|
23
|
+
# The `json` method of the Pydantic `BaseModel` class has been deprecated
|
|
24
|
+
# and will be removed in Pydantic v3. For more details, see:
|
|
25
|
+
# https://github.com/pydantic/pydantic/issues/10033
|
|
26
|
+
# Until then, we can ignore the warning.
|
|
27
|
+
warnings.filterwarnings(
|
|
28
|
+
"ignore",
|
|
29
|
+
category=UserWarning,
|
|
30
|
+
message=(
|
|
31
|
+
'Field name "json" in "WDSAllFile" shadows an attribute in parent "WDSBasic"'
|
|
32
|
+
),
|
|
33
|
+
)
|
|
34
|
+
|
|
22
35
|
|
|
23
36
|
class WDSError(DataChainError):
|
|
24
37
|
def __init__(self, tar_stream, message: str):
|