datachain 0.14.4__tar.gz → 0.15.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain has been flagged by the registry as possibly problematic.
- {datachain-0.14.4 → datachain-0.15.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.14.4/src/datachain.egg-info → datachain-0.15.0}/PKG-INFO +3 -3
- {datachain-0.14.4 → datachain-0.15.0}/README.rst +2 -2
- {datachain-0.14.4 → datachain-0.15.0}/docs/quick-start.md +4 -4
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/datachain.md +4 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/get_started/json-csv-reader.py +2 -2
- {datachain-0.14.4 → datachain-0.15.0}/examples/multimodal/hf_pipeline.py +1 -1
- {datachain-0.14.4 → datachain-0.15.0}/examples/multimodal/wds_filtered.py +1 -1
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/__init__.py +4 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/catalog/catalog.py +13 -5
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/catalog/loader.py +11 -7
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/data_storage/schema.py +21 -23
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/data_storage/sqlite.py +1 -1
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/data_storage/warehouse.py +6 -8
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/convert/values_to_tuples.py +23 -14
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/__init__.py +4 -1
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/csv.py +3 -3
- datachain-0.15.0/src/datachain/lib/dc/database.py +151 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/datachain.py +25 -15
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/datasets.py +70 -10
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/hf.py +5 -5
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/json.py +7 -7
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/listings.py +3 -3
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/pandas.py +13 -6
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/parquet.py +3 -3
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/records.py +12 -14
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/storage.py +6 -6
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/values.py +3 -3
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/listing.py +2 -2
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/signal_schema.py +34 -10
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/listing.py +4 -4
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/query/dataset.py +10 -12
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/query/dispatch.py +7 -2
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/query/schema.py +4 -1
- {datachain-0.14.4 → datachain-0.15.0/src/datachain.egg-info}/PKG-INFO +3 -3
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain.egg-info/SOURCES.txt +2 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/conftest.py +4 -1
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_catalog.py +3 -3
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_data_storage.py +1 -1
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_datachain.py +31 -7
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_dataset_query.py +0 -60
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_hidden_field.py +1 -1
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_pull.py +9 -7
- datachain-0.15.0/tests/func/test_read_database.py +175 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/test_import_time.py +1 -1
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_datachain.py +119 -34
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_feature_utils.py +0 -5
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_signal_schema.py +2 -1
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_catalog_loader.py +21 -10
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_listing.py +1 -1
- {datachain-0.14.4 → datachain-0.15.0}/.cruft.json +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.gitattributes +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.github/codecov.yaml +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.github/dependabot.yml +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.github/workflows/release.yml +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/.gitignore +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/LICENSE +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/contributing.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/examples.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/index.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/overrides/main.html +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/func.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/index.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/remotes.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/toolkit.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/torch.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/references/udf.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/docs/tutorials.md +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/mkdocs.yml +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/noxfile.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/pyproject.toml +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/setup.cfg +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/__main__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/asyn.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cache.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/client/local.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/config.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/dataset.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/error.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/func/array.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/func/base.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/func/func.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/func/path.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/func/random.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/func/string.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/func/window.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/job.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/file.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/node.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/progress.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/py.typed +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/query/params.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/query/session.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/remote/studio.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/studio.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain/utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/data.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/examples/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_client.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_file.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_hf.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_image.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_listing.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_ls.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_query.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_session.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_video.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/test_atomicity.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/test_cli_studio.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/test_telemetry.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_client.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_config.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_func.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_query.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_session.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.14.4 → datachain-0.15.0}/tests/utils.py +0 -0
PKG-INFO +3 -3

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.14.4
+Version: 0.15.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0

@@ -171,7 +171,7 @@ high confidence scores.

 import datachain as dc

-meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json",
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
 images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)

 images_id = images.map(id=lambda file: file.path.split('.')[-2])

@@ -213,7 +213,7 @@ Python code:
 return result.lower().startswith("success")

 chain = (
-dc.read_storage("gs://datachain-demo/chatbot-KiT/",
+dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
 .settings(parallel=4, cache=True)
 .map(is_success=eval_dialogue)
 .save("mistral_files")
{datachain-0.14.4 → datachain-0.15.0}/README.rst +2 -2

@@ -60,7 +60,7 @@ high confidence scores.

 import datachain as dc

-meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json",
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
 images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)

 images_id = images.map(id=lambda file: file.path.split('.')[-2])

@@ -102,7 +102,7 @@ Python code:
 return result.lower().startswith("success")

 chain = (
-dc.read_storage("gs://datachain-demo/chatbot-KiT/",
+dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
 .settings(parallel=4, cache=True)
 .map(is_success=eval_dialogue)
 .save("mistral_files")
{datachain-0.14.4 → datachain-0.15.0}/docs/quick-start.md +4 -4

@@ -39,7 +39,7 @@ using JSON metadata:
 ``` py
 import datachain as dc

-meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json",
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
 images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)

 images_id = images.map(id=lambda file: file.path.split('.')[-2])

@@ -78,7 +78,7 @@ def is_positive_dialogue_ending(file) -> bool:

 chain = (
 dc.read_storage("gs://datachain-demo/chatbot-KiT/",
-
+column="file", type="text", anon=True)
 .settings(parallel=8, cache=True)
 .map(is_positive=is_positive_dialogue_ending)
 .save("file_response")

@@ -132,7 +132,7 @@ def eval_dialogue(file: dc.File) -> bool:
 return result.lower().startswith("success")

 chain = (
-dc.read_storage("gs://datachain-demo/chatbot-KiT/",
+dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
 .map(is_success=eval_dialogue)
 .save("mistral_files")
 )

@@ -177,7 +177,7 @@ def eval_dialog(file: dc.File) -> ChatCompletionResponse:
 {"role": "user", "content": file.read()}])

 chain = (
-dc.read_storage("gs://datachain-demo/chatbot-KiT/",
+dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
 .settings(parallel=4, cache=True)
 .map(response=eval_dialog)
 .map(status=lambda response: response.choices[0].message.content.lower()[:7])
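All of the documentation hunks above make the same API change: the keyword that names the created signal column is now `column=`, replacing the `object_name=` keyword that the removed lines in the catalog and warehouse hunks further below still show. A minimal sketch of the 0.15.0-style calls, taken directly from the updated docs:

```python
import datachain as dc

# 0.15.0 keyword: `column=` names the signal the reader creates
# (0.14.x used `object_name=` for the same purpose).
meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
files = dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
```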
{datachain-0.14.4 → datachain-0.15.0}/docs/references/datachain.md +4 -0

@@ -31,6 +31,10 @@ for examples of how to create a chain.

 ::: datachain.lib.dc.values.read_values

+::: datachain.lib.dc.database.read_database
+
+::: datachain.lib.dc.database.ConnectionType
+
 ::: datachain.lib.dc.DataChain

 ::: datachain.lib.utils.DataChainError
{datachain-0.14.4 → datachain-0.15.0}/examples/get_started/json-csv-reader.py +2 -2

@@ -63,13 +63,13 @@ def main():

 # Static CSV with header schema test parsing 3.5K objects
 uri = "gs://datachain-demo/chatbot-csv/"
-static_csv_ds = dc.read_csv(uri, output=ChatDialog,
+static_csv_ds = dc.read_csv(uri, output=ChatDialog, column="chat", anon="True")
 static_csv_ds.print_schema()
 static_csv_ds.show()

 # Dynamic CSV with header schema test parsing 3/3M objects
 uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
-dynamic_csv_ds = dc.read_csv(uri,
+dynamic_csv_ds = dc.read_csv(uri, column="laion", nrows=3, anon="True")
 dynamic_csv_ds.print_schema()
 dynamic_csv_ds.show()
{datachain-0.14.4 → datachain-0.15.0}/src/datachain/__init__.py +4 -0

@@ -5,8 +5,10 @@ from datachain.lib.dc import (
 DataChain,
 Sys,
 datasets,
+delete_dataset,
 listings,
 read_csv,
+read_database,
 read_dataset,
 read_hf,
 read_json,

@@ -61,11 +63,13 @@ __all__ = [
 "VideoFragment",
 "VideoFrame",
 "datasets",
+"delete_dataset",
 "is_chain_type",
 "listings",
 "metrics",
 "param",
 "read_csv",
+"read_database",
 "read_dataset",
 "read_hf",
 "read_json",
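Both new names are re-exported at the package root. A quick hedged demo of using them from the `dc` namespace; the dataset name is made up, and the exact `delete_dataset` parameters live in `lib/dc/datasets.py`, whose hunks are not part of this excerpt:

```python
import datachain as dc

# Save a tiny dataset, then remove it with the newly exported delete_dataset.
dc.read_values(num=[1, 2, 3]).save("tmp_numbers")   # "tmp_numbers" is a made-up name
dc.delete_dataset("tmp_numbers")  # assumed usage: delete by name; full signature is
                                  # defined in lib/dc/datasets.py (not shown here)
```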
{datachain-0.14.4 → datachain-0.15.0}/src/datachain/catalog/catalog.py +13 -5

@@ -580,15 +580,13 @@ class Catalog:
 source: str,
 update=False,
 client_config=None,
-
+column="file",
 skip_indexing=False,
 ) -> tuple[Optional["Listing"], "Client", str]:
 from datachain import read_storage
 from datachain.listing import Listing

-read_storage(
-source, session=self.session, update=update, object_name=object_name
-).exec()
+read_storage(source, session=self.session, update=update, column=column).exec()

 list_ds_name, list_uri, list_path, _ = get_listing(
 source, self.session, update=update

@@ -602,7 +600,7 @@ class Catalog:
 self.warehouse.clone(),
 client,
 dataset_name=list_ds_name,
-
+column=column,
 )

 return lst, client, list_path

@@ -1301,7 +1299,17 @@ class Catalog:
 name: str,
 version: Optional[int] = None,
 force: Optional[bool] = False,
+studio: Optional[bool] = False,
 ):
+from datachain.remote.studio import StudioClient
+
+if studio:
+client = StudioClient()
+response = client.rm_dataset(name, version=version, force=force)
+if not response.ok:
+raise DataChainError(response.message)
+return
+
 dataset = self.get_dataset(name)
 if not version and not force:
 raise ValueError(f"Missing dataset version from input for dataset {name}")
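The last hunk adds a `studio` flag that short-circuits local removal and delegates the call to Studio. A standalone sketch of that branch, written as a hypothetical helper since the enclosing method's name sits outside the hunk:

```python
from datachain.lib.utils import DataChainError
from datachain.remote.studio import StudioClient


def remove_dataset_via_studio(name: str, version=None, force=False):
    """Hypothetical helper mirroring the studio branch added above."""
    client = StudioClient()
    response = client.rm_dataset(name, version=version, force=force)
    if not response.ok:
        # Surface the Studio error message instead of failing silently.
        raise DataChainError(response.message)
```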
{datachain-0.14.4 → datachain-0.15.0}/src/datachain/catalog/loader.py +11 -7

@@ -1,4 +1,5 @@
 import os
+import sys
 from importlib import import_module
 from typing import TYPE_CHECKING, Any, Optional

@@ -15,6 +16,7 @@ METASTORE_ARG_PREFIX = "DATACHAIN_METASTORE_ARG_"
 WAREHOUSE_SERIALIZED = "DATACHAIN__WAREHOUSE"
 WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
 WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
+DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
 DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"

 IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"

@@ -100,19 +102,21 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
 return warehouse_class(**warehouse_args)


-def get_udf_distributor_class() -> type["AbstractUDFDistributor"]:
-distributed_import_path
+def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
+if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
+return None

-if not distributed_import_path:
-raise RuntimeError(
-f"{DISTRIBUTED_IMPORT_PATH} import path is required "
-"for distributed UDF processing."
-)
 # Distributed class paths are specified as (for example): module.classname
 if "." not in distributed_import_path:
 raise RuntimeError(
 f"Invalid {DISTRIBUTED_IMPORT_PATH} import path: {distributed_import_path}"
 )
+
+# Optional: set the Python path to look for the module
+distributed_import_pythonpath = os.environ.get(DISTRIBUTED_IMPORT_PYTHONPATH)
+if distributed_import_pythonpath and distributed_import_pythonpath not in sys.path:
+sys.path.insert(0, distributed_import_pythonpath)
+
 module_name, _, class_name = distributed_import_path.rpartition(".")
 distributed = import_module(module_name)
 return getattr(distributed, class_name)
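Taken together, `get_udf_distributor_class` now returns `None` when no distributor is configured and can extend `sys.path` before importing the configured class. A hedged sketch of how the two environment variables interact; the class path and directory in the comments are invented for illustration:

```python
import os

from datachain.catalog.loader import get_udf_distributor_class

# With DATACHAIN_DISTRIBUTED unset, the loader now returns None instead of raising.
os.environ.pop("DATACHAIN_DISTRIBUTED", None)
assert get_udf_distributor_class() is None

# To enable a distributor, point DATACHAIN_DISTRIBUTED at "module.ClassName". The new
# DATACHAIN_DISTRIBUTED_PYTHONPATH variable is optional and is inserted into sys.path
# before the import, so the module can live outside the installed environment.
# Example values (made up; the import only succeeds if the module actually exists):
#   os.environ["DATACHAIN_DISTRIBUTED"] = "my_plugins.MyUDFDistributor"
#   os.environ["DATACHAIN_DISTRIBUTED_PYTHONPATH"] = "/opt/datachain-plugins"
#   distributor_cls = get_udf_distributor_class()
```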
{datachain-0.14.4 → datachain-0.15.0}/src/datachain/data_storage/schema.py +21 -23

@@ -30,8 +30,8 @@ if TYPE_CHECKING:
 DEFAULT_DELIMITER = "__"


-def col_name(name: str,
-return f"{
+def col_name(name: str, column: str = "file") -> str:
+return f"{column}{DEFAULT_DELIMITER}{name}"


 def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:

@@ -84,19 +84,19 @@ def convert_rows_custom_column_types(


 class DirExpansion:
-def __init__(self,
-self.
+def __init__(self, column: str):
+self.column = column

-def col_name(self, name: str,
-
-return col_name(name,
+def col_name(self, name: str, column: Optional[str] = None) -> str:
+column = column or self.column
+return col_name(name, column)

-def c(self, query, name: str,
-return getattr(query.c, self.col_name(name,
+def c(self, query, name: str, column: Optional[str] = None) -> str:
+return getattr(query.c, self.col_name(name, column=column))

 def base_select(self, q):
 return sa.select(
-self.c(q, "id",
+self.c(q, "id", column="sys"),
 false().label(self.col_name("is_dir")),
 self.c(q, "source"),
 self.c(q, "path"),

@@ -153,12 +153,12 @@ class DataTable:
 name: str,
 engine: "DatabaseEngine",
 column_types: Optional[dict[str, SQLType]] = None,
-
+column: str = "file",
 ):
 self.name: str = name
 self.engine = engine
 self.column_types: dict[str, SQLType] = column_types or {}
-self.
+self.column = column

 @staticmethod
 def copy_column(

@@ -224,18 +224,16 @@ class DataTable:
 def columns(self) -> "ReadOnlyColumnCollection[str, sa.Column[Any]]":
 return self.table.columns

-def col_name(self, name: str,
-
-return col_name(name,
+def col_name(self, name: str, column: Optional[str] = None) -> str:
+column = column or self.column
+return col_name(name, column)

-def without_object(
-
-
-object_name = object_name or self.object_name
-return column_name.removeprefix(f"{object_name}{DEFAULT_DELIMITER}")
+def without_object(self, column_name: str, column: Optional[str] = None) -> str:
+column = column or self.column
+return column_name.removeprefix(f"{column}{DEFAULT_DELIMITER}")

-def c(self, name: str,
-return getattr(self.columns, self.col_name(name,
+def c(self, name: str, column: Optional[str] = None):
+return getattr(self.columns, self.col_name(name, column=column))

 @property
 def table(self) -> "sa.Table":

@@ -275,7 +273,7 @@ class DataTable:
 ]

 def dir_expansion(self):
-return DirExpansion(self.
+return DirExpansion(self.column)


 PARTITION_COLUMN_ID = "partition_id"
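The helper renamed here is what produces the flattened `<column>__<field>` names used throughout the warehouse layer (for example the `sys__id` column referenced in the hunks below). A minimal standalone sketch of the updated function:

```python
DEFAULT_DELIMITER = "__"


def col_name(name: str, column: str = "file") -> str:
    # Same body as the updated helper above: prefix a field with its signal column.
    return f"{column}{DEFAULT_DELIMITER}{name}"


assert col_name("path") == "file__path"
assert col_name("id", column="sys") == "sys__id"
```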
{datachain-0.14.4 → datachain-0.15.0}/src/datachain/data_storage/sqlite.py +1 -1

@@ -489,7 +489,7 @@ class SQLiteWarehouse(AbstractWarehouse):
 self, dataset: DatasetRecord, version: int
 ) -> list[StorageURI]:
 dr = self.dataset_rows(dataset, version)
-query = dr.select(dr.c("source",
+query = dr.select(dr.c("source", column="file")).distinct()
 cur = self.db.cursor()
 cur.row_factory = sqlite3.Row # type: ignore[assignment]
{datachain-0.14.4 → datachain-0.15.0}/src/datachain/data_storage/warehouse.py +6 -8

@@ -179,7 +179,7 @@ class AbstractWarehouse(ABC, Serializable):
 self,
 dataset: DatasetRecord,
 version: Optional[int] = None,
-
+column: str = "file",
 ):
 version = version or dataset.latest_version

@@ -188,7 +188,7 @@ class AbstractWarehouse(ABC, Serializable):
 table_name,
 self.db,
 dataset.get_schema(version),
-
+column=column,
 )

 @property

@@ -487,7 +487,7 @@ class AbstractWarehouse(ABC, Serializable):
 dataset_rows: "DataTable",
 path_list: list[str],
 glob_name: str,
-
+column="file",
 ) -> Iterator[Node]:
 """Finds all Nodes that correspond to GLOB like path pattern."""
 dr = dataset_rows

@@ -521,7 +521,7 @@ class AbstractWarehouse(ABC, Serializable):
 de = dr.dir_expansion()
 q = de.query(
 dr.select().where(dr.c("is_latest") == true()).subquery(),
-
+column=dr.column,
 ).subquery()
 q = self.expand_query(de, q, dr)

@@ -597,12 +597,10 @@ class AbstractWarehouse(ABC, Serializable):
 with_default(dr.c("is_latest")),
 dr.c("last_modified"),
 with_default(dr.c("size")),
-with_default(dr.c("rand",
+with_default(dr.c("rand", column="sys")),
 dr.c("location"),
 de.c(q, "source"),
-).select_from(
-q.outerjoin(dr.table, q.c.sys__id == dr.c("id", object_name="sys"))
-)
+).select_from(q.outerjoin(dr.table, q.c.sys__id == dr.c("id", column="sys")))

 def get_node_by_path(self, dataset_rows: "DataTable", path: str) -> Node:
 """Gets node that corresponds to some path"""
{datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/convert/values_to_tuples.py +23 -14

@@ -1,5 +1,6 @@
+import itertools
 from collections.abc import Sequence
-from typing import Any, Union
+from typing import Any, Optional, Union

 from datachain.lib.data_model import (
 DataType,

@@ -66,21 +67,29 @@ def values_to_tuples(  # noqa: C901, PLR0912
 f"signal '{k}' is not present in the output",
 )
 else:
-
-
-
-
-
-
-raise ValuesToTupleError(
-ds_name,
-f"signal '{k}' has unsupported type '{typ.__name__}'."
-f" Please use DataModel types: {DataTypeNames}",
+# FIXME: Stops as soon as it finds the first non-None value.
+# If a non-None value appears early, it won't check the remaining items for
+# `None` values.
+try:
+pos, first_not_none_element = next(
+itertools.dropwhile(lambda pair: pair[1] is None, enumerate(v))
 )
-
-
+except StopIteration:
+typ = str # default to str if all values are None or has length 0
+nullable = True
 else:
-
+nullable = pos > 0
+typ = type(first_not_none_element) # type: ignore[assignment]
+if not is_chain_type(typ):
+raise ValuesToTupleError(
+ds_name,
+f"signal '{k}' has unsupported type '{typ.__name__}'."
+f" Please use DataModel types: {DataTypeNames}",
+)
+if isinstance(first_not_none_element, list):
+typ = list[type(first_not_none_element[0])] # type: ignore[assignment, misc]
+
+types_map[k] = Optional[typ] if nullable else typ # type: ignore[assignment]

 if length < 0:
 length = len_
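The replacement logic infers each signal's type from the first non-None value and marks the signal Optional when leading values are None (the FIXME in the added lines notes that later None values are not checked). A standalone sketch of that rule, using a hypothetical `infer_type` helper for illustration:

```python
import itertools
from typing import Optional


def infer_type(values):
    """Illustrative only: mirrors the inference rule added above for one signal."""
    try:
        # Skip leading None values; take the first concrete element and its position.
        pos, first = next(itertools.dropwhile(lambda p: p[1] is None, enumerate(values)))
    except StopIteration:
        return Optional[str]  # all values are None: default to Optional[str]
    typ = list[type(first[0])] if isinstance(first, list) else type(first)
    return Optional[typ] if pos > 0 else typ


print(infer_type([None, 3, 5]))  # typing.Optional[int]
print(infer_type(["a", "b"]))    # <class 'str'>
print(infer_type([None, None]))  # typing.Optional[str]
```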
{datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/__init__.py +4 -1

@@ -1,6 +1,7 @@
 from .csv import read_csv
+from .database import read_database
 from .datachain import C, Column, DataChain
-from .datasets import datasets, read_dataset
+from .datasets import datasets, delete_dataset, read_dataset
 from .hf import read_hf
 from .json import read_json
 from .listings import listings

@@ -19,8 +20,10 @@ __all__ = [
 "DatasetPrepareError",
 "Sys",
 "datasets",
+"delete_dataset",
 "listings",
 "read_csv",
+"read_database",
 "read_dataset",
 "read_hf",
 "read_json",
{datachain-0.14.4 → datachain-0.15.0}/src/datachain/lib/dc/csv.py +3 -3

@@ -21,7 +21,7 @@ def read_csv(
 delimiter: Optional[str] = None,
 header: bool = True,
 output: OutputType = None,
-
+column: str = "",
 model_name: str = "",
 source: bool = True,
 nrows=None,

@@ -42,7 +42,7 @@ def read_csv(
 output : Dictionary or feature class defining column names and their
 corresponding types. List of column names is also accepted, in which
 case types will be inferred.
-
+column : Created column name.
 model_name : Generated model name.
 source : Whether to include info about the source file.
 nrows : Optional row limit.

@@ -119,7 +119,7 @@ def read_csv(
 )
 return chain.parse_tabular(
 output=output,
-
+column=column,
 model_name=model_name,
 source=source,
 nrows=nrows,
datachain-0.15.0/src/datachain/lib/dc/database.py +151 -0 (new file)

import contextlib
import itertools
import os
import sqlite3
from typing import TYPE_CHECKING, Any, Optional, Union

import sqlalchemy

if TYPE_CHECKING:
    from collections.abc import Iterator, Mapping, Sequence

    import sqlalchemy.orm # noqa: TC004

    from datachain.lib.data_model import DataType
    from datachain.query import Session

    from .datachain import DataChain

ConnectionType = Union[
    str,
    sqlalchemy.engine.URL,
    sqlalchemy.engine.interfaces.Connectable,
    sqlalchemy.engine.Engine,
    sqlalchemy.engine.Connection,
    sqlalchemy.orm.Session,
    sqlite3.Connection,
]


@contextlib.contextmanager
def _connect(
    connection: "ConnectionType",
) -> "Iterator[Union[sqlalchemy.engine.Connection, sqlalchemy.orm.Session]]":
    import sqlalchemy.orm

    with contextlib.ExitStack() as stack:
        engine_kwargs = {"echo": bool(os.environ.get("DEBUG_SHOW_SQL_QUERIES"))}
        if isinstance(connection, (str, sqlalchemy.URL)):
            engine = sqlalchemy.create_engine(connection, **engine_kwargs)
            stack.callback(engine.dispose)
            yield stack.enter_context(engine.connect())
        elif isinstance(connection, sqlite3.Connection):
            engine = sqlalchemy.create_engine(
                "sqlite://", creator=lambda: connection, **engine_kwargs
            )
            # do not close the connection, as it is managed by the caller
            yield engine.connect()
        elif isinstance(connection, sqlalchemy.Engine):
            yield stack.enter_context(connection.connect())
        elif isinstance(connection, (sqlalchemy.Connection, sqlalchemy.orm.Session)):
            # do not close the connection, as it is managed by the caller
            yield connection
        else:
            raise TypeError(f"Unsupported connection type: {type(connection).__name__}")


def _infer_schema(
    result: "sqlalchemy.engine.Result",
    to_infer: list[str],
    infer_schema_length: Optional[int] = 100,
) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
    from datachain.lib.convert.values_to_tuples import values_to_tuples

    if not to_infer:
        return [], {}

    rows = list(itertools.islice(result, infer_schema_length))
    values = {col: [row._mapping[col] for row in rows] for col in to_infer}
    _, output_schema, _ = values_to_tuples("", **values)
    return rows, output_schema


def read_database(
    query: Union[str, "sqlalchemy.sql.expression.Executable"],
    connection: "ConnectionType",
    params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
    *,
    output: Optional["dict[str, DataType]"] = None,
    session: Optional["Session"] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    infer_schema_length: Optional[int] = 100,
) -> "DataChain":
    """
    Read the results of a SQL query into a DataChain, using a given database connection.

    Args:
        query:
            The SQL query to execute. Can be a raw SQL string or a SQLAlchemy
            `Executable` object.
        connection: SQLAlchemy connectable, str, or a sqlite3 connection
            Using SQLAlchemy makes it possible to use any DB supported by that
            library. If a DBAPI2 object, only sqlite3 is supported. The user is
            responsible for engine disposal and connection closure for the
            SQLAlchemy connectable; str connections are closed automatically.
        params: Parameters to pass to execute method.
        output: A dictionary mapping column names to types, used to override the
            schema inferred from the query results.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        in_memory: If True, creates an in-memory session. Defaults to False.
        infer_schema_length:
            The maximum number of rows to scan for inferring schema.
            If set to `None`, the full data may be scanned.
            The rows used for schema inference are stored in memory,
            so large values can lead to high memory usage.
            Only applies if the `output` parameter is not set for the given column.

    Examples:
        Reading from a SQL query against a user-supplied connection:
        ```python
        query = "SELECT key, value FROM tbl"
        chain = dc.read_database(query, connection, output={"value": float})
        ```

        Load data from a SQLAlchemy driver/engine:
        ```python
        from sqlalchemy import create_engine
        engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
        chain = dc.read_database("select * from tbl", engine)
        ```

        Load data from a parameterized SQLAlchemy query:
        ```python
        query = "SELECT key, value FROM tbl WHERE value > :value"
        dc.read_database(query, engine, params={"value": 50})
        ```

    Notes:
        This function works with a variety of databases — including, but not limited to,
        SQLite, DuckDB, PostgreSQL, and Snowflake, provided the appropriate driver is
        installed.
    """
    from datachain.lib.dc.records import read_records

    output = output or {}
    if isinstance(query, str):
        query = sqlalchemy.text(query)
    kw = {"execution_options": {"stream_results": True}} # use server-side cursors
    with _connect(connection) as conn, conn.execute(query, params, **kw) as result:
        cols = result.keys()
        to_infer = [k for k in cols if k not in output] # preserve the order
        rows, inferred_schema = _infer_schema(result, to_infer, infer_schema_length)
        records = (row._asdict() for row in itertools.chain(rows, result))
        return read_records(
            records,
            session=session,
            settings=settings,
            in_memory=in_memory,
            schema=inferred_schema | output,
        )
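As a worked example of the new entry point, here is a hedged end-to-end sketch that exercises the sqlite3 branch of `_connect`; the table name and rows are invented for illustration:

```python
import sqlite3

import datachain as dc

# Build a throwaway sqlite3 database (made-up table and data).
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE tbl (key TEXT, value REAL)")
conn.executemany("INSERT INTO tbl VALUES (?, ?)", [("a", 1.5), ("b", 42.0)])

chain = dc.read_database(
    "SELECT key, value FROM tbl WHERE value > :value",
    conn,
    params={"value": 1.0},
    output={"value": float},    # override inference for this column
    infer_schema_length=100,    # rows scanned to infer types for the other columns
    in_memory=True,             # keep the DataChain session in memory
)
chain.show()
conn.close()  # DBAPI connections are managed by the caller, per the docstring above
```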