datachain 0.7.0__tar.gz → 0.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.7.0 → datachain-0.7.2}/.github/workflows/benchmarks.yml +1 -1
- {datachain-0.7.0 → datachain-0.7.2}/.github/workflows/release.yml +1 -1
- {datachain-0.7.0 → datachain-0.7.2}/.github/workflows/tests-studio.yml +1 -1
- {datachain-0.7.0 → datachain-0.7.2}/.github/workflows/tests.yml +3 -3
- {datachain-0.7.0 → datachain-0.7.2}/.pre-commit-config.yaml +1 -1
- {datachain-0.7.0/src/datachain.egg-info → datachain-0.7.2}/PKG-INFO +2 -1
- datachain-0.7.2/docs/references/sql.md +18 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/computer_vision/openimage-detect.py +11 -17
- datachain-0.7.2/examples/computer_vision/ultralytics-bbox.py +22 -0
- datachain-0.7.2/examples/computer_vision/ultralytics-pose.py +22 -0
- datachain-0.7.2/examples/computer_vision/ultralytics-segment.py +22 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/get_started/common_sql_functions.py +4 -5
- {datachain-0.7.0 → datachain-0.7.2}/examples/multimodal/clip_inference.py +3 -4
- {datachain-0.7.0 → datachain-0.7.2}/examples/multimodal/wds.py +1 -1
- {datachain-0.7.0 → datachain-0.7.2}/examples/multimodal/wds_filtered.py +6 -10
- {datachain-0.7.0 → datachain-0.7.2}/pyproject.toml +2 -1
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/__init__.py +0 -3
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/catalog/catalog.py +8 -6
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/cli.py +1 -1
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/client/fsspec.py +9 -9
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/data_storage/schema.py +2 -2
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/data_storage/sqlite.py +5 -4
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/data_storage/warehouse.py +18 -18
- datachain-0.7.2/src/datachain/func/__init__.py +49 -0
- {datachain-0.7.0/src/datachain/lib → datachain-0.7.2/src/datachain}/func/aggregate.py +13 -11
- datachain-0.7.2/src/datachain/func/array.py +176 -0
- datachain-0.7.2/src/datachain/func/base.py +23 -0
- datachain-0.7.2/src/datachain/func/conditional.py +81 -0
- datachain-0.7.2/src/datachain/func/func.py +384 -0
- datachain-0.7.2/src/datachain/func/path.py +110 -0
- datachain-0.7.2/src/datachain/func/random.py +23 -0
- datachain-0.7.2/src/datachain/func/string.py +154 -0
- datachain-0.7.2/src/datachain/func/window.py +49 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/arrow.py +24 -12
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/data_model.py +25 -9
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/dataset_info.py +2 -2
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/dc.py +94 -56
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/hf.py +1 -1
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/signal_schema.py +1 -1
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/utils.py +1 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/webdataset_laion.py +5 -5
- datachain-0.7.2/src/datachain/model/__init__.py +6 -0
- datachain-0.7.2/src/datachain/model/bbox.py +102 -0
- datachain-0.7.2/src/datachain/model/pose.py +88 -0
- datachain-0.7.2/src/datachain/model/segment.py +47 -0
- datachain-0.7.2/src/datachain/model/ultralytics/__init__.py +27 -0
- datachain-0.7.2/src/datachain/model/ultralytics/bbox.py +147 -0
- datachain-0.7.2/src/datachain/model/ultralytics/pose.py +113 -0
- datachain-0.7.2/src/datachain/model/ultralytics/segment.py +91 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/nodes_fetcher.py +2 -2
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/query/dataset.py +57 -34
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/__init__.py +0 -2
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/selectable.py +11 -5
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/sqlite/base.py +11 -2
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/toolkit/split.py +6 -2
- {datachain-0.7.0 → datachain-0.7.2/src/datachain.egg-info}/PKG-INFO +2 -1
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain.egg-info/SOURCES.txt +22 -7
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain.egg-info/requires.txt +1 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/conftest.py +20 -20
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_datachain.py +37 -6
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_datasets.py +1 -1
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_pull.py +2 -2
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_toolkit.py +3 -3
- {datachain-0.7.0 → datachain-0.7.2}/tests/test_query_e2e.py +30 -40
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_arrow.py +34 -6
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_datachain.py +37 -22
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_hf.py +2 -2
- datachain-0.7.2/tests/unit/lib/test_models.py +142 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_sql_to_python.py +0 -3
- datachain-0.7.2/tests/unit/sql/sqlite/__init__.py +0 -0
- datachain-0.7.2/tests/unit/sql/test_array.py +73 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/sql/test_conditional.py +25 -10
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/sql/test_path.py +10 -9
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/sql/test_random.py +2 -2
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/sql/test_string.py +2 -2
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_database_engine.py +15 -4
- datachain-0.7.2/tests/unit/test_func.py +256 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_session.py +2 -1
- datachain-0.7.0/docs/references/sql.md +0 -18
- datachain-0.7.0/src/datachain/lib/func/__init__.py +0 -32
- datachain-0.7.0/src/datachain/lib/func/func.py +0 -152
- datachain-0.7.0/src/datachain/lib/models/__init__.py +0 -5
- datachain-0.7.0/src/datachain/lib/models/bbox.py +0 -45
- datachain-0.7.0/src/datachain/lib/models/pose.py +0 -37
- datachain-0.7.0/src/datachain/lib/models/yolo.py +0 -39
- datachain-0.7.0/src/datachain/sql/functions/__init__.py +0 -26
- datachain-0.7.0/tests/unit/lib/test_models.py +0 -50
- datachain-0.7.0/tests/unit/sql/test_array.py +0 -20
- {datachain-0.7.0 → datachain-0.7.2}/.cruft.json +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/.gitattributes +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/.github/codecov.yaml +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/.github/dependabot.yml +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/.gitignore +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/CONTRIBUTING.rst +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/LICENSE +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/README.rst +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/docs/assets/datachain.svg +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/docs/index.md +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/docs/overrides/main.html +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/docs/references/datachain.md +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/docs/references/datatype.md +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/docs/references/file.md +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/docs/references/index.md +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/docs/references/torch.md +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/docs/references/udf.md +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/mkdocs.yml +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/noxfile.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/setup.cfg +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/__main__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/asyn.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/cache.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/cli_utils.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/client/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/client/azure.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/client/gcs.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/client/hf.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/client/local.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/client/s3.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/config.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/dataset.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/error.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/job.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/clip.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/file.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/image.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/listing.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/settings.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/tar.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/text.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/udf.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/listing.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/node.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/progress.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/py.typed +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/query/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/query/batch.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/query/metrics.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/query/params.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/query/queue.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/query/schema.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/query/session.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/remote/studio.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.7.0/tests/benchmarks → datachain-0.7.2/src/datachain/sql/functions}/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/types.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/sql/utils.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/studio.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/telemetry.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain/utils.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/__init__.py +0 -0
- {datachain-0.7.0/tests/examples → datachain-0.7.2/tests/benchmarks}/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/data.py +0 -0
- {datachain-0.7.0/tests/func → datachain-0.7.2/tests/examples}/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/examples/test_examples.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/examples/wds_data.py +0 -0
- {datachain-0.7.0/tests/unit → datachain-0.7.2/tests/func}/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_catalog.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_client.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_listing.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_ls.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_metrics.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_pytorch.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/func/test_query.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/scripts/feature_class.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/test_atomicity.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/test_cli_e2e.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/test_cli_studio.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/test_telemetry.py +0 -0
- {datachain-0.7.0/tests/unit/lib → datachain-0.7.2/tests/unit}/__init__.py +0 -0
- {datachain-0.7.0/tests/unit/sql → datachain-0.7.2/tests/unit/lib}/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.7.0/tests/unit/sql/sqlite → datachain-0.7.2/tests/unit/sql}/__init__.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_asyn.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_cache.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_catalog.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_client.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_config.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_dataset.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_listing.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_metastore.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_query.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_query_params.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_serializer.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_utils.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.7.0 → datachain-0.7.2}/tests/utils.py +0 -0
|
@@ -28,7 +28,7 @@ jobs:
|
|
|
28
28
|
python-version: '3.9'
|
|
29
29
|
|
|
30
30
|
- name: Setup uv
|
|
31
|
-
uses: astral-sh/setup-uv@
|
|
31
|
+
uses: astral-sh/setup-uv@v4
|
|
32
32
|
with:
|
|
33
33
|
enable-cache: true
|
|
34
34
|
cache-suffix: lint
|
|
@@ -82,7 +82,7 @@ jobs:
|
|
|
82
82
|
python-version: ${{ matrix.pyv }}
|
|
83
83
|
|
|
84
84
|
- name: Setup uv
|
|
85
|
-
uses: astral-sh/setup-uv@
|
|
85
|
+
uses: astral-sh/setup-uv@v4
|
|
86
86
|
with:
|
|
87
87
|
enable-cache: true
|
|
88
88
|
cache-suffix: tests-${{ matrix.pyv }}
|
|
@@ -142,7 +142,7 @@ jobs:
|
|
|
142
142
|
python-version: ${{ matrix.pyv }}
|
|
143
143
|
|
|
144
144
|
- name: Setup uv
|
|
145
|
-
uses: astral-sh/setup-uv@
|
|
145
|
+
uses: astral-sh/setup-uv@v4
|
|
146
146
|
with:
|
|
147
147
|
enable-cache: true
|
|
148
148
|
cache-suffix: examples-${{ matrix.pyv }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.7.
|
|
3
|
+
Version: 0.7.2
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -98,6 +98,7 @@ Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
|
|
|
98
98
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
99
99
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
100
100
|
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
101
|
+
Requires-Dist: ultralytics==8.3.29; extra == "examples"
|
|
101
102
|
|
|
102
103
|
================
|
|
103
104
|
|logo| DataChain
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# SQL
|
|
2
|
+
|
|
3
|
+
Use SQL functions to operate on the underlying database storing the chain data. Useful
|
|
4
|
+
for operations like [`DataChain.filter`](datachain.md#datachain.lib.dc.DataChain.filter)
|
|
5
|
+
and [`DataChain.mutate`](datachain.md#datachain.lib.dc.DataChain.mutate). Import
|
|
6
|
+
these functions from `datachain.func`.
|
|
7
|
+
|
|
8
|
+
::: datachain.func.avg
|
|
9
|
+
::: datachain.func.count
|
|
10
|
+
::: datachain.func.greatest
|
|
11
|
+
::: datachain.func.least
|
|
12
|
+
::: datachain.func.max
|
|
13
|
+
::: datachain.func.min
|
|
14
|
+
::: datachain.func.rand
|
|
15
|
+
::: datachain.func.sum
|
|
16
|
+
::: datachain.func.array
|
|
17
|
+
::: datachain.func.path
|
|
18
|
+
::: datachain.func.string
|
|
@@ -1,17 +1,9 @@
|
|
|
1
1
|
import json
|
|
2
2
|
|
|
3
3
|
from PIL import Image
|
|
4
|
-
from pydantic import BaseModel
|
|
5
4
|
|
|
6
|
-
from datachain import C, DataChain, File
|
|
7
|
-
from datachain.
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class BBox(BaseModel):
|
|
11
|
-
x_min: int
|
|
12
|
-
x_max: int
|
|
13
|
-
y_min: int
|
|
14
|
-
y_max: int
|
|
5
|
+
from datachain import C, DataChain, File, model
|
|
6
|
+
from datachain.func import path
|
|
15
7
|
|
|
16
8
|
|
|
17
9
|
def openimage_detect(args):
|
|
@@ -30,11 +22,13 @@ def openimage_detect(args):
|
|
|
30
22
|
detections = json.load(stream_json).get("detections", [])
|
|
31
23
|
|
|
32
24
|
for i, detect in enumerate(detections):
|
|
33
|
-
bbox = BBox(
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
25
|
+
bbox = model.BBox.from_list(
|
|
26
|
+
[
|
|
27
|
+
detect["XMin"] * img.width,
|
|
28
|
+
detect["XMax"] * img.width,
|
|
29
|
+
detect["YMin"] * img.height,
|
|
30
|
+
detect["YMax"] * img.height,
|
|
31
|
+
]
|
|
38
32
|
)
|
|
39
33
|
|
|
40
34
|
fstream = File(
|
|
@@ -54,9 +48,9 @@ source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
|
|
|
54
48
|
.filter(C("file.path").glob("*.jpg") | C("file.path").glob("*.json"))
|
|
55
49
|
.agg(
|
|
56
50
|
openimage_detect,
|
|
57
|
-
partition_by=path.file_stem(
|
|
51
|
+
partition_by=path.file_stem("file.path"),
|
|
58
52
|
params=["file"],
|
|
59
|
-
output={"file": File, "bbox": BBox},
|
|
53
|
+
output={"file": File, "bbox": model.BBox},
|
|
60
54
|
)
|
|
61
55
|
.show()
|
|
62
56
|
)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
|
|
3
|
+
from PIL import Image
|
|
4
|
+
from ultralytics import YOLO
|
|
5
|
+
|
|
6
|
+
from datachain import C, DataChain, File
|
|
7
|
+
from datachain.model.ultralytics import YoloBBoxes
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def process_bboxes(yolo: YOLO, file: File) -> YoloBBoxes:
|
|
11
|
+
results = yolo(Image.open(BytesIO(file.read())))
|
|
12
|
+
return YoloBBoxes.from_results(results)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
(
|
|
16
|
+
DataChain.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
|
|
17
|
+
.filter(C("file.path").glob("*.jpg"))
|
|
18
|
+
.limit(20)
|
|
19
|
+
.setup(yolo=lambda: YOLO("yolo11n.pt"))
|
|
20
|
+
.map(boxes=process_bboxes)
|
|
21
|
+
.show()
|
|
22
|
+
)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
|
|
3
|
+
from PIL import Image
|
|
4
|
+
from ultralytics import YOLO
|
|
5
|
+
|
|
6
|
+
from datachain import C, DataChain, File
|
|
7
|
+
from datachain.model.ultralytics import YoloPoses
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def process_poses(yolo: YOLO, file: File) -> YoloPoses:
|
|
11
|
+
results = yolo(Image.open(BytesIO(file.read())))
|
|
12
|
+
return YoloPoses.from_results(results)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
(
|
|
16
|
+
DataChain.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
|
|
17
|
+
.filter(C("file.path").glob("*.jpg"))
|
|
18
|
+
.limit(20)
|
|
19
|
+
.setup(yolo=lambda: YOLO("yolo11n-pose.pt"))
|
|
20
|
+
.map(poses=process_poses)
|
|
21
|
+
.show()
|
|
22
|
+
)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
|
|
3
|
+
from PIL import Image
|
|
4
|
+
from ultralytics import YOLO
|
|
5
|
+
|
|
6
|
+
from datachain import C, DataChain, File
|
|
7
|
+
from datachain.model.ultralytics import YoloSegments
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def process_segments(yolo: YOLO, file: File) -> YoloSegments:
|
|
11
|
+
results = yolo(Image.open(BytesIO(file.read())))
|
|
12
|
+
return YoloSegments.from_results(results)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
(
|
|
16
|
+
DataChain.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
|
|
17
|
+
.filter(C("file.path").glob("*.jpg"))
|
|
18
|
+
.limit(20)
|
|
19
|
+
.setup(yolo=lambda: YOLO("yolo11n-seg.pt"))
|
|
20
|
+
.map(segments=process_segments)
|
|
21
|
+
.show()
|
|
22
|
+
)
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from datachain import C, DataChain
|
|
2
|
-
from datachain.
|
|
3
|
-
from datachain.sql.functions import array, greatest, least, path, string
|
|
2
|
+
from datachain.func import array, greatest, least, path, string
|
|
4
3
|
|
|
5
4
|
|
|
6
5
|
def num_chars_udf(file):
|
|
@@ -18,7 +17,7 @@ dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
|
|
|
18
17
|
(
|
|
19
18
|
dc.mutate(
|
|
20
19
|
length=string.length(path.name(C("file.path"))),
|
|
21
|
-
parts=string.split(path.name(C("file.path")),
|
|
20
|
+
parts=string.split(path.name(C("file.path")), "."),
|
|
22
21
|
)
|
|
23
22
|
.select("file.path", "length", "parts")
|
|
24
23
|
.show(5)
|
|
@@ -35,8 +34,8 @@ dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
|
|
|
35
34
|
|
|
36
35
|
|
|
37
36
|
chain = dc.mutate(
|
|
38
|
-
a=array.length(string.split(
|
|
39
|
-
b=array.length(string.split(path.name(
|
|
37
|
+
a=array.length(string.split("file.path", "/")),
|
|
38
|
+
b=array.length(string.split(path.name("file.path"), "0")),
|
|
40
39
|
)
|
|
41
40
|
|
|
42
41
|
(
|
|
@@ -3,8 +3,7 @@ import torch
|
|
|
3
3
|
from torch.nn.functional import cosine_similarity
|
|
4
4
|
from torch.utils.data import DataLoader
|
|
5
5
|
|
|
6
|
-
from datachain import C, DataChain
|
|
7
|
-
from datachain.sql.functions import path
|
|
6
|
+
from datachain import C, DataChain, func
|
|
8
7
|
|
|
9
8
|
source = "gs://datachain-demo/50k-laion-files/000000/00000000*"
|
|
10
9
|
|
|
@@ -18,8 +17,8 @@ def create_dataset():
|
|
|
18
17
|
)
|
|
19
18
|
return imgs.merge(
|
|
20
19
|
captions,
|
|
21
|
-
on=path.file_stem(imgs.c("file.path")),
|
|
22
|
-
right_on=path.file_stem(captions.c("file.path")),
|
|
20
|
+
on=func.path.file_stem(imgs.c("file.path")),
|
|
21
|
+
right_on=func.path.file_stem(captions.c("file.path")),
|
|
23
22
|
)
|
|
24
23
|
|
|
25
24
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
3
|
from datachain import DataChain
|
|
4
|
+
from datachain.func import path
|
|
4
5
|
from datachain.lib.webdataset import process_webdataset
|
|
5
6
|
from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
|
|
6
|
-
from datachain.sql.functions import path
|
|
7
7
|
|
|
8
8
|
IMAGE_TARS = os.getenv(
|
|
9
9
|
"IMAGE_TARS", "gs://datachain-demo/datacomp-small/shards/000000[0-5]*.tar"
|
|
@@ -1,9 +1,7 @@
|
|
|
1
1
|
import datachain.error
|
|
2
|
-
from datachain import C, DataChain
|
|
2
|
+
from datachain import C, DataChain, func
|
|
3
3
|
from datachain.lib.webdataset import process_webdataset
|
|
4
4
|
from datachain.lib.webdataset_laion import WDSLaion
|
|
5
|
-
from datachain.sql import literal
|
|
6
|
-
from datachain.sql.functions import array, greatest, least, string
|
|
7
5
|
|
|
8
6
|
name = "wds"
|
|
9
7
|
try:
|
|
@@ -20,14 +18,12 @@ except datachain.error.DatasetNotFoundError:
|
|
|
20
18
|
wds.print_schema()
|
|
21
19
|
|
|
22
20
|
filtered = (
|
|
23
|
-
wds.filter(string.length(
|
|
24
|
-
.filter(array.length(string.split(
|
|
21
|
+
wds.filter(func.string.length("laion.txt") > 5)
|
|
22
|
+
.filter(func.array.length(func.string.split("laion.txt", " ")) > 2)
|
|
23
|
+
.filter(func.least("laion.json.original_width", "laion.json.original_height") > 200)
|
|
25
24
|
.filter(
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
.filter(
|
|
29
|
-
greatest(C("laion.json.original_width"), C("laion.json.original_height"))
|
|
30
|
-
/ least(C("laion.json.original_width"), C("laion.json.original_height"))
|
|
25
|
+
func.greatest("laion.json.original_width", "laion.json.original_height")
|
|
26
|
+
/ func.least("laion.json.original_width", "laion.json.original_height")
|
|
31
27
|
< 3.0
|
|
32
28
|
)
|
|
33
29
|
.save()
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
from datachain.lib import func, models
|
|
2
1
|
from datachain.lib.data_model import DataModel, DataType, is_chain_type
|
|
3
2
|
from datachain.lib.dc import C, Column, DataChain, Sys
|
|
4
3
|
from datachain.lib.file import (
|
|
@@ -35,9 +34,7 @@ __all__ = [
|
|
|
35
34
|
"Sys",
|
|
36
35
|
"TarVFile",
|
|
37
36
|
"TextFile",
|
|
38
|
-
"func",
|
|
39
37
|
"is_chain_type",
|
|
40
38
|
"metrics",
|
|
41
|
-
"models",
|
|
42
39
|
"param",
|
|
43
40
|
]
|
|
@@ -54,7 +54,6 @@ from datachain.error import (
|
|
|
54
54
|
QueryScriptCancelError,
|
|
55
55
|
QueryScriptRunError,
|
|
56
56
|
)
|
|
57
|
-
from datachain.listing import Listing
|
|
58
57
|
from datachain.node import DirType, Node, NodeWithPath
|
|
59
58
|
from datachain.nodes_thread_pool import NodesThreadPool
|
|
60
59
|
from datachain.remote.studio import StudioClient
|
|
@@ -76,6 +75,7 @@ if TYPE_CHECKING:
|
|
|
76
75
|
from datachain.dataset import DatasetVersion
|
|
77
76
|
from datachain.job import Job
|
|
78
77
|
from datachain.lib.file import File
|
|
78
|
+
from datachain.listing import Listing
|
|
79
79
|
|
|
80
80
|
logger = logging.getLogger("datachain")
|
|
81
81
|
|
|
@@ -236,7 +236,7 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
236
236
|
class NodeGroup:
|
|
237
237
|
"""Class for a group of nodes from the same source"""
|
|
238
238
|
|
|
239
|
-
listing: Listing
|
|
239
|
+
listing: "Listing"
|
|
240
240
|
sources: list[DataSource]
|
|
241
241
|
|
|
242
242
|
# The source path within the bucket
|
|
@@ -591,8 +591,9 @@ class Catalog:
|
|
|
591
591
|
client_config=None,
|
|
592
592
|
object_name="file",
|
|
593
593
|
skip_indexing=False,
|
|
594
|
-
) -> tuple[Listing, str]:
|
|
594
|
+
) -> tuple["Listing", str]:
|
|
595
595
|
from datachain.lib.dc import DataChain
|
|
596
|
+
from datachain.listing import Listing
|
|
596
597
|
|
|
597
598
|
DataChain.from_storage(
|
|
598
599
|
source, session=self.session, update=update, object_name=object_name
|
|
@@ -660,7 +661,8 @@ class Catalog:
|
|
|
660
661
|
no_glob: bool = False,
|
|
661
662
|
client_config=None,
|
|
662
663
|
) -> list[NodeGroup]:
|
|
663
|
-
from datachain.
|
|
664
|
+
from datachain.listing import Listing
|
|
665
|
+
from datachain.query.dataset import DatasetQuery
|
|
664
666
|
|
|
665
667
|
def _row_to_node(d: dict[str, Any]) -> Node:
|
|
666
668
|
del d["file__source"]
|
|
@@ -876,7 +878,7 @@ class Catalog:
|
|
|
876
878
|
def update_dataset_version_with_warehouse_info(
|
|
877
879
|
self, dataset: DatasetRecord, version: int, rows_dropped=False, **kwargs
|
|
878
880
|
) -> None:
|
|
879
|
-
from datachain.query import DatasetQuery
|
|
881
|
+
from datachain.query.dataset import DatasetQuery
|
|
880
882
|
|
|
881
883
|
dataset_version = dataset.get_version(version)
|
|
882
884
|
|
|
@@ -1177,7 +1179,7 @@ class Catalog:
|
|
|
1177
1179
|
def ls_dataset_rows(
|
|
1178
1180
|
self, name: str, version: int, offset=None, limit=None
|
|
1179
1181
|
) -> list[dict]:
|
|
1180
|
-
from datachain.query import DatasetQuery
|
|
1182
|
+
from datachain.query.dataset import DatasetQuery
|
|
1181
1183
|
|
|
1182
1184
|
dataset = self.get_dataset(name)
|
|
1183
1185
|
|
|
@@ -957,7 +957,7 @@ def show(
|
|
|
957
957
|
schema: bool = False,
|
|
958
958
|
) -> None:
|
|
959
959
|
from datachain.lib.dc import DataChain
|
|
960
|
-
from datachain.query import DatasetQuery
|
|
960
|
+
from datachain.query.dataset import DatasetQuery
|
|
961
961
|
from datachain.utils import show_records
|
|
962
962
|
|
|
963
963
|
dataset = catalog.get_dataset(name)
|
|
@@ -28,7 +28,6 @@ from tqdm import tqdm
|
|
|
28
28
|
from datachain.cache import DataChainCache
|
|
29
29
|
from datachain.client.fileslice import FileWrapper
|
|
30
30
|
from datachain.error import ClientError as DataChainClientError
|
|
31
|
-
from datachain.lib.file import File
|
|
32
31
|
from datachain.nodes_fetcher import NodesFetcher
|
|
33
32
|
from datachain.nodes_thread_pool import NodeChunk
|
|
34
33
|
|
|
@@ -36,6 +35,7 @@ if TYPE_CHECKING:
|
|
|
36
35
|
from fsspec.spec import AbstractFileSystem
|
|
37
36
|
|
|
38
37
|
from datachain.dataset import StorageURI
|
|
38
|
+
from datachain.lib.file import File
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
logger = logging.getLogger("datachain")
|
|
@@ -45,7 +45,7 @@ DELIMITER = "/" # Path delimiter.
|
|
|
45
45
|
|
|
46
46
|
DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
|
|
47
47
|
|
|
48
|
-
ResultQueue = asyncio.Queue[Optional[Sequence[File]]]
|
|
48
|
+
ResultQueue = asyncio.Queue[Optional[Sequence["File"]]]
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
def _is_win_local_path(uri: str) -> bool:
|
|
@@ -212,7 +212,7 @@ class Client(ABC):
|
|
|
212
212
|
|
|
213
213
|
async def scandir(
|
|
214
214
|
self, start_prefix: str, method: str = "default"
|
|
215
|
-
) -> AsyncIterator[Sequence[File]]:
|
|
215
|
+
) -> AsyncIterator[Sequence["File"]]:
|
|
216
216
|
try:
|
|
217
217
|
impl = getattr(self, f"_fetch_{method}")
|
|
218
218
|
except AttributeError:
|
|
@@ -317,7 +317,7 @@ class Client(ABC):
|
|
|
317
317
|
return f"{self.PREFIX}{self.name}/{rel_path}"
|
|
318
318
|
|
|
319
319
|
@abstractmethod
|
|
320
|
-
def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...
|
|
320
|
+
def info_to_file(self, v: dict[str, Any], parent: str) -> "File": ...
|
|
321
321
|
|
|
322
322
|
def fetch_nodes(
|
|
323
323
|
self,
|
|
@@ -354,7 +354,7 @@ class Client(ABC):
|
|
|
354
354
|
copy2(src, dst)
|
|
355
355
|
|
|
356
356
|
def open_object(
|
|
357
|
-
self, file: File, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
|
|
357
|
+
self, file: "File", use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
|
|
358
358
|
) -> BinaryIO:
|
|
359
359
|
"""Open a file, including files in tar archives."""
|
|
360
360
|
if use_cache and (cache_path := self.cache.get_path(file)):
|
|
@@ -362,19 +362,19 @@ class Client(ABC):
|
|
|
362
362
|
assert not file.location
|
|
363
363
|
return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb) # type: ignore[return-value]
|
|
364
364
|
|
|
365
|
-
def download(self, file: File, *, callback: Callback = DEFAULT_CALLBACK) -> None:
|
|
365
|
+
def download(self, file: "File", *, callback: Callback = DEFAULT_CALLBACK) -> None:
|
|
366
366
|
sync(get_loop(), functools.partial(self._download, file, callback=callback))
|
|
367
367
|
|
|
368
|
-
async def _download(self, file: File, *, callback: "Callback" = None) -> None:
|
|
368
|
+
async def _download(self, file: "File", *, callback: "Callback" = None) -> None:
|
|
369
369
|
if self.cache.contains(file):
|
|
370
370
|
# Already in cache, so there's nothing to do.
|
|
371
371
|
return
|
|
372
372
|
await self._put_in_cache(file, callback=callback)
|
|
373
373
|
|
|
374
|
-
def put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
|
|
374
|
+
def put_in_cache(self, file: "File", *, callback: "Callback" = None) -> None:
|
|
375
375
|
sync(get_loop(), functools.partial(self._put_in_cache, file, callback=callback))
|
|
376
376
|
|
|
377
|
-
async def _put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
|
|
377
|
+
async def _put_in_cache(self, file: "File", *, callback: "Callback" = None) -> None:
|
|
378
378
|
assert not file.location
|
|
379
379
|
if file.etag:
|
|
380
380
|
etag = await self.get_current_etag(file)
|
|
@@ -12,7 +12,7 @@ import sqlalchemy as sa
|
|
|
12
12
|
from sqlalchemy.sql import func as f
|
|
13
13
|
from sqlalchemy.sql.expression import false, null, true
|
|
14
14
|
|
|
15
|
-
from datachain.sql.functions import path
|
|
15
|
+
from datachain.sql.functions import path as pathfunc
|
|
16
16
|
from datachain.sql.types import Int, SQLType, UInt64
|
|
17
17
|
|
|
18
18
|
if TYPE_CHECKING:
|
|
@@ -130,7 +130,7 @@ class DirExpansion:
|
|
|
130
130
|
|
|
131
131
|
def query(self, q):
|
|
132
132
|
q = self.base_select(q).cte(recursive=True)
|
|
133
|
-
parent =
|
|
133
|
+
parent = pathfunc.parent(self.c(q, "path"))
|
|
134
134
|
q = q.union_all(
|
|
135
135
|
sa.select(
|
|
136
136
|
sa.literal(-1).label("sys__id"),
|
|
@@ -122,7 +122,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
122
122
|
return cls(*cls._connect(db_file=db_file))
|
|
123
123
|
|
|
124
124
|
@staticmethod
|
|
125
|
-
def _connect(
|
|
125
|
+
def _connect(
|
|
126
|
+
db_file: Optional[str] = None,
|
|
127
|
+
) -> tuple["Engine", "MetaData", sqlite3.Connection, str]:
|
|
126
128
|
try:
|
|
127
129
|
if db_file == ":memory:":
|
|
128
130
|
# Enable multithreaded usage of the same in-memory db
|
|
@@ -130,9 +132,8 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
130
132
|
_get_in_memory_uri(), uri=True, detect_types=DETECT_TYPES
|
|
131
133
|
)
|
|
132
134
|
else:
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
)
|
|
135
|
+
db_file = db_file or DataChainDir.find().db
|
|
136
|
+
db = sqlite3.connect(db_file, detect_types=DETECT_TYPES)
|
|
136
137
|
create_user_defined_sql_functions(db)
|
|
137
138
|
engine = sqlalchemy.create_engine(
|
|
138
139
|
"sqlite+pysqlite:///", creator=lambda: db, future=True
|
|
@@ -224,28 +224,28 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
224
224
|
offset = 0
|
|
225
225
|
num_yielded = 0
|
|
226
226
|
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
if limit
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
227
|
+
# Ensure we're using a thread-local connection
|
|
228
|
+
with self.clone() as wh:
|
|
229
|
+
while True:
|
|
230
|
+
if limit is not None:
|
|
231
|
+
limit -= num_yielded
|
|
232
|
+
if limit == 0:
|
|
233
|
+
break
|
|
234
|
+
if limit < page_size:
|
|
235
|
+
paginated_query = paginated_query.limit(None).limit(limit)
|
|
236
|
+
|
|
237
237
|
# Cursor results are not thread-safe, so we convert them to a list
|
|
238
238
|
results = list(wh.dataset_rows_select(paginated_query.offset(offset)))
|
|
239
239
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
240
|
+
processed = False
|
|
241
|
+
for row in results:
|
|
242
|
+
processed = True
|
|
243
|
+
yield row
|
|
244
|
+
num_yielded += 1
|
|
245
245
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
246
|
+
if not processed:
|
|
247
|
+
break # no more results
|
|
248
|
+
offset += page_size
|
|
249
249
|
|
|
250
250
|
#
|
|
251
251
|
# Table Name Internal Functions
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from sqlalchemy import literal
|
|
2
|
+
|
|
3
|
+
from . import array, path, random, string
|
|
4
|
+
from .aggregate import (
|
|
5
|
+
any_value,
|
|
6
|
+
avg,
|
|
7
|
+
collect,
|
|
8
|
+
concat,
|
|
9
|
+
count,
|
|
10
|
+
dense_rank,
|
|
11
|
+
first,
|
|
12
|
+
max,
|
|
13
|
+
min,
|
|
14
|
+
rank,
|
|
15
|
+
row_number,
|
|
16
|
+
sum,
|
|
17
|
+
)
|
|
18
|
+
from .array import cosine_distance, euclidean_distance, length, sip_hash_64
|
|
19
|
+
from .conditional import greatest, least
|
|
20
|
+
from .random import rand
|
|
21
|
+
from .window import window
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"any_value",
|
|
25
|
+
"array",
|
|
26
|
+
"avg",
|
|
27
|
+
"collect",
|
|
28
|
+
"concat",
|
|
29
|
+
"cosine_distance",
|
|
30
|
+
"count",
|
|
31
|
+
"dense_rank",
|
|
32
|
+
"euclidean_distance",
|
|
33
|
+
"first",
|
|
34
|
+
"greatest",
|
|
35
|
+
"least",
|
|
36
|
+
"length",
|
|
37
|
+
"literal",
|
|
38
|
+
"max",
|
|
39
|
+
"min",
|
|
40
|
+
"path",
|
|
41
|
+
"rand",
|
|
42
|
+
"random",
|
|
43
|
+
"rank",
|
|
44
|
+
"row_number",
|
|
45
|
+
"sip_hash_64",
|
|
46
|
+
"string",
|
|
47
|
+
"sum",
|
|
48
|
+
"window",
|
|
49
|
+
]
|