datachain 0.2.13__tar.gz → 0.2.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.2.13/src/datachain.egg-info → datachain-0.2.14}/PKG-INFO +2 -3
- {datachain-0.2.13 → datachain-0.2.14}/docs/index.md +2 -2
- {datachain-0.2.13 → datachain-0.2.14}/pyproject.toml +7 -3
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/metastore.py +0 -4
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/schema.py +7 -3
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/sqlite.py +1 -4
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/warehouse.py +1 -24
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/flatten.py +4 -4
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/values_to_tuples.py +4 -1
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/dc.py +100 -5
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/file.py +6 -11
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/meta_formats.py +6 -5
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/dataset.py +19 -21
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/sqlite/base.py +3 -3
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/sqlite/types.py +5 -13
- {datachain-0.2.13 → datachain-0.2.14/src/datachain.egg-info}/PKG-INFO +2 -3
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain.egg-info/requires.txt +1 -2
- {datachain-0.2.13 → datachain-0.2.14}/tests/examples/test_wds_e2e.py +1 -1
- {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_datachain.py +2 -6
- {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_datasets.py +7 -6
- {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_feature_pickling.py +10 -3
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_datachain.py +56 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_datachain_merge.py +19 -19
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_feature.py +7 -7
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_module_exports.py +25 -18
- {datachain-0.2.13 → datachain-0.2.14}/.cruft.json +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.gitattributes +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.github/codecov.yaml +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.github/dependabot.yml +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.github/workflows/release.yml +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.github/workflows/tests.yml +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.gitignore +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/.pre-commit-config.yaml +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/CONTRIBUTING.rst +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/LICENSE +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/README.rst +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/docs/assets/datachain.png +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/docs/assets/flowchart.png +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/docs/references/datachain.md +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/docs/references/datatype.md +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/docs/references/file.md +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/docs/references/index.md +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/docs/references/sql.md +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/docs/references/torch.md +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/docs/references/udf.md +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/README.md +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/llm/llm_chatbot_evaluation.ipynb +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/llm_and_nlp/unstructured-text.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/clip.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/wds.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/mkdocs.yml +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/noxfile.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/setup.cfg +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/__main__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/asyn.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/cache.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/catalog/subclass.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/cli.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/cli_utils.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/azure.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/gcs.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/local.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/s3.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/config.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/dataset.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/error.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/job.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/clip.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/image.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/settings.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/text.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/udf.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/utils.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/listing.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/node.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/progress.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/py.typed +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/batch.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/builtins.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/metrics.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/params.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/schema.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/session.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/udf.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/remote/studio.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/types.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/utils.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/storage.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain/utils.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/conftest.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/data.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/examples/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/examples/wds_data.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/func/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_catalog.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_client.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_ls.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_pull.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_pytorch.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_query.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/scripts/feature_class.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/scripts/name_len_normal.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/test_cli_e2e.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/test_query_e2e.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_asyn.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_cache.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_catalog.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_client.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_dataset.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_listing.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_metastore.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_query_params.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_serializer.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_session.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_storage.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_udf.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_utils.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.2.13 → datachain-0.2.14}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.2.13
|
|
3
|
+
Version: 0.2.14
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -36,7 +36,7 @@ Requires-Dist: sqlalchemy>=2
|
|
|
36
36
|
Requires-Dist: multiprocess==0.70.16
|
|
37
37
|
Requires-Dist: dill==0.3.8
|
|
38
38
|
Requires-Dist: cloudpickle
|
|
39
|
-
Requires-Dist:
|
|
39
|
+
Requires-Dist: orjson>=3.10.5
|
|
40
40
|
Requires-Dist: pydantic<3,>=2
|
|
41
41
|
Requires-Dist: jmespath>=1.0
|
|
42
42
|
Requires-Dist: datamodel-code-generator>=0.25
|
|
@@ -81,7 +81,6 @@ Requires-Dist: types-python-dateutil; extra == "dev"
|
|
|
81
81
|
Requires-Dist: types-pytz; extra == "dev"
|
|
82
82
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
83
83
|
Requires-Dist: types-requests; extra == "dev"
|
|
84
|
-
Requires-Dist: types-ujson; extra == "dev"
|
|
85
84
|
|
|
86
85
|
|PyPI| |Python Version| |Codecov| |Tests|
|
|
87
86
|
|
|
@@ -58,8 +58,8 @@ def trim_text(text):
|
|
|
58
58
|
match = re.search(r'[A-Z][^.]*\.', text)
|
|
59
59
|
return match.group(0) if match else ''
|
|
60
60
|
|
|
61
|
-
images = chain.
|
|
62
|
-
captions = chain.
|
|
61
|
+
images = chain.collect("file")
|
|
62
|
+
captions = chain.collect("scene")
|
|
63
63
|
_ , axes = plt.subplots(1, len(captions), figsize=(15, 5))
|
|
64
64
|
|
|
65
65
|
for ax, img, caption in zip(axes, images, captions):
|
|
@@ -39,7 +39,7 @@ dependencies = [
|
|
|
39
39
|
"multiprocess==0.70.16",
|
|
40
40
|
"dill==0.3.8",
|
|
41
41
|
"cloudpickle",
|
|
42
|
-
"
|
|
42
|
+
"orjson>=3.10.5",
|
|
43
43
|
"pydantic>=2,<3",
|
|
44
44
|
"jmespath>=1.0",
|
|
45
45
|
"datamodel-code-generator>=0.25",
|
|
@@ -91,8 +91,7 @@ dev = [
|
|
|
91
91
|
"types-python-dateutil",
|
|
92
92
|
"types-pytz",
|
|
93
93
|
"types-PyYAML",
|
|
94
|
-
"types-requests"
|
|
95
|
-
"types-ujson"
|
|
94
|
+
"types-requests"
|
|
96
95
|
]
|
|
97
96
|
|
|
98
97
|
[project.urls]
|
|
@@ -118,6 +117,11 @@ markers = [
|
|
|
118
117
|
]
|
|
119
118
|
asyncio_mode = "auto"
|
|
120
119
|
filterwarnings = [
|
|
120
|
+
"error::pandas.errors.PerformanceWarning",
|
|
121
|
+
"error::pydantic.warnings.PydanticDeprecatedSince20",
|
|
122
|
+
"error::pytest_mock.PytestMockWarning",
|
|
123
|
+
"error::pytest.PytestCollectionWarning",
|
|
124
|
+
"error::sqlalchemy.exc.SADeprecationWarning",
|
|
121
125
|
"ignore:Field name .* shadows an attribute in parent:UserWarning" # datachain.lib.feature
|
|
122
126
|
]
|
|
123
127
|
|
|
@@ -421,10 +421,6 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
421
421
|
) -> None:
|
|
422
422
|
"""Set the status of the given job and dataset."""
|
|
423
423
|
|
|
424
|
-
@abstractmethod
|
|
425
|
-
def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
|
|
426
|
-
"""Returns the possibly stale jobs."""
|
|
427
|
-
|
|
428
424
|
|
|
429
425
|
class AbstractDBMetastore(AbstractMetastore):
|
|
430
426
|
"""
|
|
@@ -19,8 +19,12 @@ from datachain.sql.types import Int, SQLType, UInt64
|
|
|
19
19
|
if TYPE_CHECKING:
|
|
20
20
|
from sqlalchemy import Engine
|
|
21
21
|
from sqlalchemy.engine.interfaces import Dialect
|
|
22
|
-
from sqlalchemy.sql.base import
|
|
23
|
-
|
|
22
|
+
from sqlalchemy.sql.base import (
|
|
23
|
+
ColumnCollection,
|
|
24
|
+
Executable,
|
|
25
|
+
ReadOnlyColumnCollection,
|
|
26
|
+
)
|
|
27
|
+
from sqlalchemy.sql.elements import ColumnElement
|
|
24
28
|
|
|
25
29
|
|
|
26
30
|
def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
|
|
@@ -43,7 +47,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
|
|
|
43
47
|
|
|
44
48
|
|
|
45
49
|
def convert_rows_custom_column_types(
|
|
46
|
-
columns: "
|
|
50
|
+
columns: "ColumnCollection[str, ColumnElement[Any]]",
|
|
47
51
|
rows: Iterator[tuple[Any, ...]],
|
|
48
52
|
dialect: "Dialect",
|
|
49
53
|
):
|
|
@@ -496,9 +496,6 @@ class SQLiteMetastore(AbstractDBMetastore):
|
|
|
496
496
|
def _jobs_insert(self) -> "Insert":
|
|
497
497
|
return sqlite.insert(self._jobs)
|
|
498
498
|
|
|
499
|
-
def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
|
|
500
|
-
raise NotImplementedError("get_possibly_stale_jobs not implemented for SQLite")
|
|
501
|
-
|
|
502
499
|
|
|
503
500
|
class SQLiteWarehouse(AbstractWarehouse):
|
|
504
501
|
"""
|
|
@@ -594,7 +591,7 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
594
591
|
):
|
|
595
592
|
rows = self.db.execute(select_query, **kwargs)
|
|
596
593
|
yield from convert_rows_custom_column_types(
|
|
597
|
-
select_query.
|
|
594
|
+
select_query.selected_columns, rows, sqlite_dialect
|
|
598
595
|
)
|
|
599
596
|
|
|
600
597
|
def get_dataset_sources(
|
|
@@ -494,7 +494,7 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
494
494
|
This gets nodes based on the provided query, and should be used sparingly,
|
|
495
495
|
as it will be slow on any OLAP database systems.
|
|
496
496
|
"""
|
|
497
|
-
columns = [c.name for c in query.
|
|
497
|
+
columns = [c.name for c in query.selected_columns]
|
|
498
498
|
for row in self.db.execute(query):
|
|
499
499
|
d = dict(zip(columns, row))
|
|
500
500
|
yield Node(**d)
|
|
@@ -912,29 +912,6 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
912
912
|
for name in names:
|
|
913
913
|
self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
|
|
914
914
|
|
|
915
|
-
def subtract_query(
|
|
916
|
-
self,
|
|
917
|
-
source_query: sa.sql.selectable.Select,
|
|
918
|
-
target_query: sa.sql.selectable.Select,
|
|
919
|
-
) -> sa.sql.selectable.Select:
|
|
920
|
-
sq = source_query.alias("source_query")
|
|
921
|
-
tq = target_query.alias("target_query")
|
|
922
|
-
|
|
923
|
-
source_target_join = sa.join(
|
|
924
|
-
sq,
|
|
925
|
-
tq,
|
|
926
|
-
(sq.c.source == tq.c.source)
|
|
927
|
-
& (sq.c.parent == tq.c.parent)
|
|
928
|
-
& (sq.c.name == tq.c.name),
|
|
929
|
-
isouter=True,
|
|
930
|
-
)
|
|
931
|
-
|
|
932
|
-
return (
|
|
933
|
-
select(*sq.c)
|
|
934
|
-
.select_from(source_target_join)
|
|
935
|
-
.where((tq.c.name == None) | (tq.c.name == "")) # noqa: E711
|
|
936
|
-
)
|
|
937
|
-
|
|
938
915
|
def changed_query(
|
|
939
916
|
self,
|
|
940
917
|
source_query: sa.sql.selectable.Select,
|
|
@@ -48,10 +48,10 @@ def _flatten_fields_values(fields, obj: BaseModel):
|
|
|
48
48
|
value = getattr(obj, name)
|
|
49
49
|
|
|
50
50
|
if isinstance(value, list):
|
|
51
|
-
|
|
52
|
-
val.model_dump()
|
|
53
|
-
|
|
54
|
-
|
|
51
|
+
if value and ModelStore.is_pydantic(type(value[0])):
|
|
52
|
+
yield [val.model_dump() for val in value]
|
|
53
|
+
else:
|
|
54
|
+
yield value
|
|
55
55
|
elif isinstance(value, dict):
|
|
56
56
|
yield {
|
|
57
57
|
key: val.model_dump() if ModelStore.is_pydantic(type(val)) else val
|
|
@@ -71,7 +71,10 @@ def values_to_tuples( # noqa: C901, PLR0912
|
|
|
71
71
|
f"signal '{k}' has unsupported type '{typ.__name__}'."
|
|
72
72
|
f" Please use DataModel types: {DataTypeNames}",
|
|
73
73
|
)
|
|
74
|
-
|
|
74
|
+
if typ is list:
|
|
75
|
+
types_map[k] = list[type(v[0][0])] # type: ignore[misc]
|
|
76
|
+
else:
|
|
77
|
+
types_map[k] = typ
|
|
75
78
|
|
|
76
79
|
if length < 0:
|
|
77
80
|
length = len_
|
|
@@ -342,7 +342,7 @@ class DataChain(DatasetQuery):
|
|
|
342
342
|
spec: Optional[DataType] = None,
|
|
343
343
|
schema_from: Optional[str] = "auto",
|
|
344
344
|
jmespath: Optional[str] = None,
|
|
345
|
-
object_name: str = "",
|
|
345
|
+
object_name: Optional[str] = "",
|
|
346
346
|
model_name: Optional[str] = None,
|
|
347
347
|
show_schema: Optional[bool] = False,
|
|
348
348
|
meta_type: Optional[str] = "json",
|
|
@@ -364,12 +364,12 @@ class DataChain(DatasetQuery):
|
|
|
364
364
|
nrows : optional row limit for jsonl and JSON arrays
|
|
365
365
|
|
|
366
366
|
Example:
|
|
367
|
-
infer JSON schema from data, reduce using JMESPATH
|
|
367
|
+
infer JSON schema from data, reduce using JMESPATH
|
|
368
368
|
```py
|
|
369
369
|
chain = DataChain.from_json("gs://json", jmespath="key1.key2")
|
|
370
370
|
```
|
|
371
371
|
|
|
372
|
-
infer JSON schema from a particular path
|
|
372
|
+
infer JSON schema from a particular path
|
|
373
373
|
```py
|
|
374
374
|
chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
|
|
375
375
|
```
|
|
@@ -384,7 +384,7 @@ class DataChain(DatasetQuery):
|
|
|
384
384
|
if (not object_name) and jmespath:
|
|
385
385
|
object_name = jmespath_to_name(jmespath)
|
|
386
386
|
if not object_name:
|
|
387
|
-
object_name =
|
|
387
|
+
object_name = meta_type
|
|
388
388
|
chain = DataChain.from_storage(path=path, type=type, **kwargs)
|
|
389
389
|
signal_dict = {
|
|
390
390
|
object_name: read_meta(
|
|
@@ -397,7 +397,67 @@ class DataChain(DatasetQuery):
|
|
|
397
397
|
nrows=nrows,
|
|
398
398
|
)
|
|
399
399
|
}
|
|
400
|
-
return chain.gen(**signal_dict) # type: ignore[arg-type]
|
|
400
|
+
return chain.gen(**signal_dict) # type: ignore[misc, arg-type]
|
|
401
|
+
|
|
402
|
+
@classmethod
|
|
403
|
+
def from_jsonl(
|
|
404
|
+
cls,
|
|
405
|
+
path,
|
|
406
|
+
type: Literal["binary", "text", "image"] = "text",
|
|
407
|
+
spec: Optional[DataType] = None,
|
|
408
|
+
schema_from: Optional[str] = "auto",
|
|
409
|
+
jmespath: Optional[str] = None,
|
|
410
|
+
object_name: Optional[str] = "",
|
|
411
|
+
model_name: Optional[str] = None,
|
|
412
|
+
show_schema: Optional[bool] = False,
|
|
413
|
+
meta_type: Optional[str] = "jsonl",
|
|
414
|
+
nrows=None,
|
|
415
|
+
**kwargs,
|
|
416
|
+
) -> "DataChain":
|
|
417
|
+
"""Get data from JSON lines. It returns the chain itself.
|
|
418
|
+
|
|
419
|
+
Parameters:
|
|
420
|
+
path : storage URI with directory. URI must start with storage prefix such
|
|
421
|
+
as `s3://`, `gs://`, `az://` or "file:///"
|
|
422
|
+
type : read file as "binary", "text", or "image" data. Default is "text".
|
|
423
|
+
spec : optional Data Model
|
|
424
|
+
schema_from : path to sample to infer spec (if schema not provided)
|
|
425
|
+
object_name : generated object column name
|
|
426
|
+
model_name : optional generated model name
|
|
427
|
+
show_schema : print auto-generated schema
|
|
428
|
+
jmespath : optional JMESPATH expression to reduce JSON
|
|
429
|
+
nrows : optional row limit for jsonl and JSON arrays
|
|
430
|
+
|
|
431
|
+
Example:
|
|
432
|
+
infer JSONl schema from data, limit parsing to 1 row
|
|
433
|
+
```py
|
|
434
|
+
chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
|
|
435
|
+
```
|
|
436
|
+
"""
|
|
437
|
+
if schema_from == "auto":
|
|
438
|
+
schema_from = path
|
|
439
|
+
|
|
440
|
+
def jmespath_to_name(s: str):
|
|
441
|
+
name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s) # type: ignore[union-attr]
|
|
442
|
+
return s[:name_end]
|
|
443
|
+
|
|
444
|
+
if (not object_name) and jmespath:
|
|
445
|
+
object_name = jmespath_to_name(jmespath)
|
|
446
|
+
if not object_name:
|
|
447
|
+
object_name = meta_type
|
|
448
|
+
chain = DataChain.from_storage(path=path, type=type, **kwargs)
|
|
449
|
+
signal_dict = {
|
|
450
|
+
object_name: read_meta(
|
|
451
|
+
schema_from=schema_from,
|
|
452
|
+
meta_type=meta_type,
|
|
453
|
+
spec=spec,
|
|
454
|
+
model_name=model_name,
|
|
455
|
+
show_schema=show_schema,
|
|
456
|
+
jmespath=jmespath,
|
|
457
|
+
nrows=nrows,
|
|
458
|
+
)
|
|
459
|
+
}
|
|
460
|
+
return chain.gen(**signal_dict) # type: ignore[misc, arg-type]
|
|
401
461
|
|
|
402
462
|
@classmethod
|
|
403
463
|
def datasets(
|
|
@@ -951,6 +1011,41 @@ class DataChain(DatasetQuery):
|
|
|
951
1011
|
|
|
952
1012
|
return ds
|
|
953
1013
|
|
|
1014
|
+
def subtract( # type: ignore[override]
|
|
1015
|
+
self,
|
|
1016
|
+
other: "DataChain",
|
|
1017
|
+
on: Optional[Union[str, Sequence[str]]] = None,
|
|
1018
|
+
) -> "Self":
|
|
1019
|
+
"""Remove rows that appear in another chain.
|
|
1020
|
+
|
|
1021
|
+
Parameters:
|
|
1022
|
+
other: chain whose rows will be removed from `self`
|
|
1023
|
+
on: columns to consider for determining row equality. If unspecified,
|
|
1024
|
+
defaults to all common columns between `self` and `other`.
|
|
1025
|
+
"""
|
|
1026
|
+
if isinstance(on, str):
|
|
1027
|
+
on = [on]
|
|
1028
|
+
if on is None:
|
|
1029
|
+
other_columns = set(other._effective_signals_schema.db_signals())
|
|
1030
|
+
signals = [
|
|
1031
|
+
c
|
|
1032
|
+
for c in self._effective_signals_schema.db_signals()
|
|
1033
|
+
if c in other_columns
|
|
1034
|
+
]
|
|
1035
|
+
if not signals:
|
|
1036
|
+
raise DataChainParamsError("subtract(): no common columns")
|
|
1037
|
+
elif not isinstance(on, Sequence):
|
|
1038
|
+
raise TypeError(
|
|
1039
|
+
f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
|
|
1040
|
+
)
|
|
1041
|
+
elif not on:
|
|
1042
|
+
raise DataChainParamsError(
|
|
1043
|
+
"'on' cannot be empty",
|
|
1044
|
+
)
|
|
1045
|
+
else:
|
|
1046
|
+
signals = self.signals_schema.resolve(*on).db_signals()
|
|
1047
|
+
return super()._subtract(other, signals)
|
|
1048
|
+
|
|
954
1049
|
@classmethod
|
|
955
1050
|
def from_values(
|
|
956
1051
|
cls,
|
|
@@ -12,7 +12,6 @@ from urllib.parse import unquote, urlparse
|
|
|
12
12
|
from urllib.request import url2pathname
|
|
13
13
|
|
|
14
14
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
15
|
-
from fsspec.implementations.local import LocalFileSystem
|
|
16
15
|
from PIL import Image
|
|
17
16
|
from pydantic import Field, field_validator
|
|
18
17
|
|
|
@@ -283,9 +282,8 @@ class File(DataModel):
|
|
|
283
282
|
def get_path(self) -> str:
|
|
284
283
|
"""Returns file path."""
|
|
285
284
|
path = unquote(self.get_uri())
|
|
286
|
-
|
|
287
|
-
if
|
|
288
|
-
# Drop file:// protocol
|
|
285
|
+
source = urlparse(self.source)
|
|
286
|
+
if source.scheme == "file":
|
|
289
287
|
path = urlparse(path).path
|
|
290
288
|
path = url2pathname(path)
|
|
291
289
|
return path
|
|
@@ -300,13 +298,10 @@ class File(DataModel):
|
|
|
300
298
|
elif placement == "etag":
|
|
301
299
|
path = f"{self.etag}{self.get_file_suffix()}"
|
|
302
300
|
elif placement == "fullpath":
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
path = (
|
|
308
|
-
Path(urlparse(self.source).netloc) / unquote(self.get_full_name())
|
|
309
|
-
).as_posix()
|
|
301
|
+
path = unquote(self.get_full_name())
|
|
302
|
+
source = urlparse(self.source)
|
|
303
|
+
if source.scheme and source.scheme != "file":
|
|
304
|
+
path = posixpath.join(source.netloc, path)
|
|
310
305
|
elif placement == "checksum":
|
|
311
306
|
raise NotImplementedError("Checksum placement not implemented yet")
|
|
312
307
|
else:
|
|
@@ -11,9 +11,9 @@ from collections.abc import Iterator
|
|
|
11
11
|
from typing import Any, Callable
|
|
12
12
|
|
|
13
13
|
import jmespath as jsp
|
|
14
|
-
from pydantic import ValidationError
|
|
14
|
+
from pydantic import Field, ValidationError # noqa: F401
|
|
15
15
|
|
|
16
|
-
from datachain.lib.data_model import
|
|
16
|
+
from datachain.lib.data_model import DataModel # noqa: F401
|
|
17
17
|
from datachain.lib.file import File
|
|
18
18
|
|
|
19
19
|
|
|
@@ -87,7 +87,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
|
|
|
87
87
|
except subprocess.CalledProcessError as e:
|
|
88
88
|
model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
|
|
89
89
|
print(f"{model_output}")
|
|
90
|
-
print("\n" +
|
|
90
|
+
print("\n" + "from datachain.lib.data_model import DataModel" + "\n")
|
|
91
|
+
print("\n" + f"DataModel.register({model_name})" + "\n")
|
|
91
92
|
print("\n" + f"spec={model_name}" + "\n")
|
|
92
93
|
return model_output
|
|
93
94
|
|
|
@@ -147,7 +148,7 @@ def read_meta( # noqa: C901
|
|
|
147
148
|
|
|
148
149
|
def parse_data(
|
|
149
150
|
file: File,
|
|
150
|
-
|
|
151
|
+
data_model=spec,
|
|
151
152
|
meta_type=meta_type,
|
|
152
153
|
jmespath=jmespath,
|
|
153
154
|
nrows=nrows,
|
|
@@ -155,7 +156,7 @@ def read_meta( # noqa: C901
|
|
|
155
156
|
def validator(json_object: dict) -> spec:
|
|
156
157
|
json_string = json.dumps(json_object)
|
|
157
158
|
try:
|
|
158
|
-
data_instance =
|
|
159
|
+
data_instance = data_model.model_validate_json(json_string)
|
|
159
160
|
yield data_instance
|
|
160
161
|
except ValidationError as e:
|
|
161
162
|
print(f"Validation error occurred in file {file.name}:", e)
|
|
@@ -25,6 +25,7 @@ from typing import (
|
|
|
25
25
|
|
|
26
26
|
import attrs
|
|
27
27
|
import sqlalchemy
|
|
28
|
+
import sqlalchemy as sa
|
|
28
29
|
from attrs import frozen
|
|
29
30
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback, TqdmCallback
|
|
30
31
|
from sqlalchemy import Column
|
|
@@ -250,7 +251,7 @@ class DatasetDiffOperation(Step):
|
|
|
250
251
|
self,
|
|
251
252
|
source_query: Select,
|
|
252
253
|
target_query: Select,
|
|
253
|
-
) ->
|
|
254
|
+
) -> sa.Selectable:
|
|
254
255
|
"""
|
|
255
256
|
Should return select query that calculates desired diff between dataset queries
|
|
256
257
|
"""
|
|
@@ -268,7 +269,7 @@ class DatasetDiffOperation(Step):
|
|
|
268
269
|
|
|
269
270
|
columns = [
|
|
270
271
|
c if isinstance(c, Column) else Column(c.name, c.type)
|
|
271
|
-
for c in source_query.
|
|
272
|
+
for c in source_query.selected_columns
|
|
272
273
|
]
|
|
273
274
|
temp_table = self.catalog.warehouse.create_dataset_rows_table(
|
|
274
275
|
temp_table_name,
|
|
@@ -292,23 +293,16 @@ class DatasetDiffOperation(Step):
|
|
|
292
293
|
|
|
293
294
|
@frozen
|
|
294
295
|
class Subtract(DatasetDiffOperation):
|
|
295
|
-
|
|
296
|
-
Calculates rows that are in a source query but are not in target query (diff)
|
|
297
|
-
This can be used to do delta updates (calculate UDF only on newly added rows)
|
|
298
|
-
Example:
|
|
299
|
-
>>> ds = DatasetQuery(name="dogs_cats") # some older dataset with embeddings
|
|
300
|
-
>>> ds_updated = (
|
|
301
|
-
DatasetQuery("gs://dvcx-datalakes/dogs-and-cats")
|
|
302
|
-
.filter(C.size > 1000) # we can also filter out source query
|
|
303
|
-
.subtract(ds)
|
|
304
|
-
.add_signals(calc_embeddings) # calculae embeddings only on new rows
|
|
305
|
-
.union(ds) # union with old dataset that's missing new rows
|
|
306
|
-
.save("dogs_cats_updated")
|
|
307
|
-
)
|
|
308
|
-
"""
|
|
296
|
+
on: Sequence[str]
|
|
309
297
|
|
|
310
|
-
def query(self, source_query: Select, target_query: Select) ->
|
|
311
|
-
|
|
298
|
+
def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
|
|
299
|
+
sq = source_query.alias("source_query")
|
|
300
|
+
tq = target_query.alias("target_query")
|
|
301
|
+
where_clause = sa.and_(
|
|
302
|
+
getattr(sq.c, col_name).is_not_distinct_from(getattr(tq.c, col_name))
|
|
303
|
+
for col_name in self.on
|
|
304
|
+
) # type: ignore[arg-type]
|
|
305
|
+
return sq.select().except_(sq.select().where(where_clause))
|
|
312
306
|
|
|
313
307
|
|
|
314
308
|
@frozen
|
|
@@ -1260,7 +1254,7 @@ class DatasetQuery:
|
|
|
1260
1254
|
def as_iterable(self, **kwargs) -> Iterator[ResultIter]:
|
|
1261
1255
|
try:
|
|
1262
1256
|
query = self.apply_steps().select()
|
|
1263
|
-
selected_columns = [c.name for c in query.
|
|
1257
|
+
selected_columns = [c.name for c in query.selected_columns]
|
|
1264
1258
|
yield ResultIter(
|
|
1265
1259
|
self.catalog.warehouse.dataset_rows_select(query, **kwargs),
|
|
1266
1260
|
selected_columns,
|
|
@@ -1564,8 +1558,12 @@ class DatasetQuery:
|
|
|
1564
1558
|
|
|
1565
1559
|
@detach
|
|
1566
1560
|
def subtract(self, dq: "DatasetQuery") -> "Self":
|
|
1561
|
+
return self._subtract(dq, on=["source", "parent", "name"])
|
|
1562
|
+
|
|
1563
|
+
@detach
|
|
1564
|
+
def _subtract(self, dq: "DatasetQuery", on: Sequence[str]) -> "Self":
|
|
1567
1565
|
query = self.clone()
|
|
1568
|
-
query.steps.append(Subtract(dq, self.catalog))
|
|
1566
|
+
query.steps.append(Subtract(dq, self.catalog, on=on))
|
|
1569
1567
|
return query
|
|
1570
1568
|
|
|
1571
1569
|
@detach
|
|
@@ -1684,7 +1682,7 @@ class DatasetQuery:
|
|
|
1684
1682
|
f.row_number().over(order_by=q._order_by_clauses).label("sys__id")
|
|
1685
1683
|
)
|
|
1686
1684
|
|
|
1687
|
-
cols = tuple(c.name for c in q.
|
|
1685
|
+
cols = tuple(c.name for c in q.selected_columns)
|
|
1688
1686
|
insert_q = sqlalchemy.insert(dr.get_table()).from_select(cols, q)
|
|
1689
1687
|
self.catalog.warehouse.db.execute(insert_q, **kwargs)
|
|
1690
1688
|
self.catalog.metastore.update_dataset_status(
|
|
@@ -5,8 +5,8 @@ from datetime import MAXYEAR, MINYEAR, datetime, timezone
|
|
|
5
5
|
from types import MappingProxyType
|
|
6
6
|
from typing import Callable, Optional
|
|
7
7
|
|
|
8
|
+
import orjson
|
|
8
9
|
import sqlalchemy as sa
|
|
9
|
-
import ujson
|
|
10
10
|
from sqlalchemy.dialects import sqlite
|
|
11
11
|
from sqlalchemy.ext.compiler import compiles
|
|
12
12
|
from sqlalchemy.sql.elements import literal
|
|
@@ -149,7 +149,7 @@ def missing_vector_function(name, exc):
|
|
|
149
149
|
|
|
150
150
|
|
|
151
151
|
def sqlite_string_split(string: str, sep: str, maxsplit: int = -1) -> str:
|
|
152
|
-
return
|
|
152
|
+
return orjson.dumps(string.split(sep, maxsplit)).decode("utf-8")
|
|
153
153
|
|
|
154
154
|
|
|
155
155
|
def register_user_defined_sql_functions() -> None:
|
|
@@ -274,7 +274,7 @@ def compile_euclidean_distance(element, compiler, **kwargs):
|
|
|
274
274
|
|
|
275
275
|
|
|
276
276
|
def py_json_array_length(arr):
|
|
277
|
-
return len(
|
|
277
|
+
return len(orjson.loads(arr))
|
|
278
278
|
|
|
279
279
|
|
|
280
280
|
def compile_array_length(element, compiler, **kwargs):
|
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import sqlite3
|
|
3
2
|
|
|
4
|
-
import
|
|
3
|
+
import orjson
|
|
5
4
|
from sqlalchemy import types
|
|
6
5
|
|
|
7
6
|
from datachain.sql.types import TypeConverter, TypeReadConverter
|
|
@@ -29,22 +28,15 @@ class Array(types.UserDefinedType):
|
|
|
29
28
|
|
|
30
29
|
|
|
31
30
|
def adapt_array(arr):
|
|
32
|
-
return
|
|
31
|
+
return orjson.dumps(arr).decode("utf-8")
|
|
33
32
|
|
|
34
33
|
|
|
35
34
|
def convert_array(arr):
|
|
36
|
-
return
|
|
35
|
+
return orjson.loads(arr)
|
|
37
36
|
|
|
38
37
|
|
|
39
38
|
def adapt_np_array(arr):
|
|
40
|
-
|
|
41
|
-
if isinstance(obj, np.ndarray):
|
|
42
|
-
return obj.tolist()
|
|
43
|
-
return obj
|
|
44
|
-
|
|
45
|
-
if np.issubdtype(arr.dtype, np.object_):
|
|
46
|
-
return json.dumps(arr.tolist(), default=_json_serialize)
|
|
47
|
-
return ujson.dumps(arr.tolist())
|
|
39
|
+
return orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY).decode("utf-8")
|
|
48
40
|
|
|
49
41
|
|
|
50
42
|
def adapt_np_generic(val):
|
|
@@ -70,5 +62,5 @@ class SQLiteTypeConverter(TypeConverter):
|
|
|
70
62
|
class SQLiteTypeReadConverter(TypeReadConverter):
|
|
71
63
|
def array(self, value, item_type, dialect):
|
|
72
64
|
if isinstance(value, str):
|
|
73
|
-
value =
|
|
65
|
+
value = orjson.loads(value)
|
|
74
66
|
return super().array(value, item_type, dialect)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.14
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -36,7 +36,7 @@ Requires-Dist: sqlalchemy>=2
|
|
|
36
36
|
Requires-Dist: multiprocess==0.70.16
|
|
37
37
|
Requires-Dist: dill==0.3.8
|
|
38
38
|
Requires-Dist: cloudpickle
|
|
39
|
-
Requires-Dist:
|
|
39
|
+
Requires-Dist: orjson>=3.10.5
|
|
40
40
|
Requires-Dist: pydantic<3,>=2
|
|
41
41
|
Requires-Dist: jmespath>=1.0
|
|
42
42
|
Requires-Dist: datamodel-code-generator>=0.25
|
|
@@ -81,7 +81,6 @@ Requires-Dist: types-python-dateutil; extra == "dev"
|
|
|
81
81
|
Requires-Dist: types-pytz; extra == "dev"
|
|
82
82
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
83
83
|
Requires-Dist: types-requests; extra == "dev"
|
|
84
|
-
Requires-Dist: types-ujson; extra == "dev"
|
|
85
84
|
|
|
86
85
|
|PyPI| |Python Version| |Codecov| |Tests|
|
|
87
86
|
|
|
@@ -17,7 +17,7 @@ sqlalchemy>=2
|
|
|
17
17
|
multiprocess==0.70.16
|
|
18
18
|
dill==0.3.8
|
|
19
19
|
cloudpickle
|
|
20
|
-
|
|
20
|
+
orjson>=3.10.5
|
|
21
21
|
pydantic<3,>=2
|
|
22
22
|
jmespath>=1.0
|
|
23
23
|
datamodel-code-generator>=0.25
|
|
@@ -33,7 +33,6 @@ types-python-dateutil
|
|
|
33
33
|
types-pytz
|
|
34
34
|
types-PyYAML
|
|
35
35
|
types-requests
|
|
36
|
-
types-ujson
|
|
37
36
|
|
|
38
37
|
[docs]
|
|
39
38
|
mkdocs>=1.5.2
|
|
@@ -90,7 +90,7 @@ def test_wds(catalog, webdataset_tars):
|
|
|
90
90
|
assert laion_wds.file.parent
|
|
91
91
|
assert laion_wds.file.name == f"{idx}.jpg"
|
|
92
92
|
assert laion_wds.file.location
|
|
93
|
-
assert laion_wds.json.
|
|
93
|
+
assert laion_wds.json.model_dump() == Laion(**data).model_dump()
|
|
94
94
|
|
|
95
95
|
assert num_rows == len(WDS_TAR_SHARDS)
|
|
96
96
|
|
|
@@ -17,12 +17,8 @@ from tests.utils import images_equal
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
@pytest.mark.parametrize("anon", [True, False])
|
|
20
|
-
def test_catalog_anon(catalog, anon):
|
|
21
|
-
chain = (
|
|
22
|
-
DataChain.from_storage("gs://dvcx-datalakes/dogs-and-cats/", anon=anon)
|
|
23
|
-
.limit(5)
|
|
24
|
-
.save("test_catalog_anon")
|
|
25
|
-
)
|
|
20
|
+
def test_catalog_anon(tmp_dir, catalog, anon):
|
|
21
|
+
chain = DataChain.from_storage(tmp_dir.as_uri(), anon=anon)
|
|
26
22
|
assert chain.catalog.client_config.get("anon", False) is anon
|
|
27
23
|
|
|
28
24
|
|
|
@@ -210,15 +210,16 @@ def test_create_dataset_from_sources_failed(listed_bucket, cloud_test_catalog, m
|
|
|
210
210
|
dataset_name = uuid.uuid4().hex
|
|
211
211
|
src_uri = cloud_test_catalog.src_uri
|
|
212
212
|
catalog = cloud_test_catalog.catalog
|
|
213
|
-
|
|
213
|
+
# Mocks are automatically undone at the end of a test.
|
|
214
|
+
mocker.patch.object(
|
|
214
215
|
catalog.warehouse.__class__,
|
|
215
216
|
"create_dataset_rows_table",
|
|
216
217
|
side_effect=RuntimeError("Error"),
|
|
217
|
-
)
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
218
|
+
)
|
|
219
|
+
with pytest.raises(RuntimeError):
|
|
220
|
+
catalog.create_dataset_from_sources(
|
|
221
|
+
dataset_name, [f"{src_uri}/dogs/*"], recursive=True
|
|
222
|
+
)
|
|
222
223
|
|
|
223
224
|
dataset = catalog.get_dataset(dataset_name)
|
|
224
225
|
dataset_version = dataset.get_version(dataset.latest_version)
|