datachain 0.21.1__tar.gz → 0.23.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.21.1 → datachain-0.23.0}/.github/workflows/tests-studio.yml +1 -0
- {datachain-0.21.1 → datachain-0.23.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.21.1 → datachain-0.23.0}/PKG-INFO +2 -2
- datachain-0.23.0/docs/guide/db_migrations.md +114 -0
- datachain-0.23.0/docs/guide/env.md +22 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/guide/index.md +3 -0
- datachain-0.23.0/docs/guide/namespaces.md +161 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/get_started/json-csv-reader.py +1 -1
- {datachain-0.21.1 → datachain-0.23.0}/examples/incremental_processing/delta.py +1 -1
- {datachain-0.21.1 → datachain-0.23.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +15 -5
- {datachain-0.21.1 → datachain-0.23.0}/mkdocs.yml +3 -0
- {datachain-0.21.1 → datachain-0.23.0}/pyproject.toml +3 -2
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/__init__.py +2 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cache.py +2 -2
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/catalog/catalog.py +213 -65
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/__init__.py +0 -7
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/commands/datasets.py +35 -26
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/commands/ls.py +2 -2
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/parser/__init__.py +1 -35
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/client/fsspec.py +5 -3
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/client/hf.py +10 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/client/local.py +4 -4
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/data_storage/metastore.py +433 -37
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/data_storage/sqlite.py +140 -7
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/data_storage/warehouse.py +26 -7
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/dataset.py +128 -12
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/delta.py +11 -7
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/error.py +36 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/func/func.py +1 -1
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/arrow.py +3 -3
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dataset_info.py +4 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/datachain.py +253 -91
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/datasets.py +103 -50
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/listings.py +3 -3
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/records.py +2 -1
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/storage.py +38 -40
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/file.py +77 -23
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/listing.py +3 -1
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/meta_formats.py +1 -1
- datachain-0.23.0/src/datachain/lib/namespaces.py +71 -0
- datachain-0.23.0/src/datachain/lib/projects.py +86 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/pytorch.py +1 -1
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/settings.py +10 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/signal_schema.py +8 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/tar.py +1 -2
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/udf.py +1 -1
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/udf_signature.py +1 -1
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/webdataset.py +30 -20
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/listing.py +3 -1
- datachain-0.23.0/src/datachain/namespace.py +65 -0
- datachain-0.23.0/src/datachain/project.py +78 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/query/dataset.py +71 -46
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/query/session.py +1 -1
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/remote/studio.py +61 -26
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/studio.py +23 -6
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain.egg-info/PKG-INFO +2 -2
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain.egg-info/SOURCES.txt +9 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.21.1 → datachain-0.23.0}/tests/conftest.py +86 -4
- {datachain-0.21.1 → datachain-0.23.0}/tests/examples/test_examples.py +2 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/examples/test_wds_e2e.py +5 -5
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/functions/test_aggregate.py +7 -9
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/functions/test_array.py +20 -21
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/functions/test_conditional.py +6 -7
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/functions/test_numeric.py +4 -5
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/functions/test_path.py +6 -8
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/functions/test_random.py +3 -6
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/functions/test_string.py +6 -7
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_batching.py +5 -5
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_datachain.py +77 -36
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_dataset_query.py +20 -2
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_datasets.py +113 -81
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_delta.py +15 -29
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_file.py +33 -7
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_listing.py +1 -1
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_metastore.py +30 -10
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_pull.py +68 -18
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_retry.py +6 -8
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_toolkit.py +2 -2
- {datachain-0.21.1 → datachain-0.23.0}/tests/test_atomicity.py +3 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/test_cli_e2e.py +43 -10
- {datachain-0.21.1 → datachain-0.23.0}/tests/test_cli_studio.py +40 -29
- {datachain-0.21.1 → datachain-0.23.0}/tests/test_import_time.py +2 -2
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_datachain.py +341 -110
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_datachain_bootstrap.py +3 -3
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_datachain_merge.py +11 -11
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_diff.py +43 -45
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_feature_utils.py +2 -2
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_file.py +50 -8
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_listing_info.py +7 -2
- datachain-0.23.0/tests/unit/lib/test_namespace.py +79 -0
- datachain-0.23.0/tests/unit/lib/test_project.py +157 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_schema.py +1 -4
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_dataset.py +43 -1
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_func.py +149 -125
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_listing.py +20 -4
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_metastore.py +35 -3
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_session.py +31 -9
- {datachain-0.21.1 → datachain-0.23.0}/tests/utils.py +2 -2
- {datachain-0.21.1 → datachain-0.23.0}/.cruft.json +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/.gitattributes +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/.github/codecov.yaml +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/.github/dependabot.yml +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/.github/workflows/release.yml +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/.gitignore +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/LICENSE +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/README.rst +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/commands/auth/login.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/commands/auth/logout.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/commands/auth/team.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/commands/auth/token.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/commands/index.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/commands/job/cancel.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/commands/job/clusters.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/commands/job/logs.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/commands/job/ls.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/commands/job/run.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/contributing.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/examples.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/guide/delta.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/guide/processing.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/guide/remotes.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/guide/retry.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/index.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/overrides/main.html +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/quick-start.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/datachain.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/func.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/index.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/toolkit.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/torch.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/references/udf.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/docs/tutorials.md +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/noxfile.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/setup.cfg +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/__main__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/asyn.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/config.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/func/array.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/func/base.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/func/path.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/func/random.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/func/string.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/func/window.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/job.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/node.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/progress.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/py.typed +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/query/params.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/semver.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain/utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/data.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/examples/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/functions/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_client.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_hf.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_image.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_ls.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_query.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_read_database.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_session.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_video.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/test_telemetry.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_client.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_config.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_query.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_semver.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.21.1 → datachain-0.23.0}/tests/unit/test_warehouse.py +0 -0
|
@@ -98,6 +98,7 @@ jobs:
|
|
|
98
98
|
- name: Run tests
|
|
99
99
|
# Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
|
|
100
100
|
run: >
|
|
101
|
+
DATACHAIN_METASTORE_ARG_PROJECT=john
|
|
101
102
|
PYTHONPATH="$(pwd)/..:${PYTHONPATH}"
|
|
102
103
|
pytest
|
|
103
104
|
--config-file=pyproject.toml -rs
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.21.1
|
|
3
|
+
Version: 0.23.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -94,7 +94,7 @@ Requires-Dist: scipy; extra == "tests"
|
|
|
94
94
|
Requires-Dist: ultralytics; extra == "tests"
|
|
95
95
|
Provides-Extra: dev
|
|
96
96
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
97
|
-
Requires-Dist: mypy==1.16.0; extra == "dev"
|
|
97
|
+
Requires-Dist: mypy==1.16.1; extra == "dev"
|
|
98
98
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
99
99
|
Requires-Dist: types-pytz; extra == "dev"
|
|
100
100
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# Handling Local Database Migrations (CLI)
|
|
2
|
+
|
|
3
|
+
When using the DataChain CLI, datasets are stored in a local SQLite database located at:
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
.datachain/db
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
Unlike the SaaS version (Studio), the CLI does **not** support automatic database migrations. This means that after upgrading the DataChain CLI, the local database schema may become incompatible with the updated codebase.
|
|
10
|
+
|
|
11
|
+
## Schema Mismatch Detection
|
|
12
|
+
|
|
13
|
+
The CLI automatically checks for schema compatibility. If a mismatch is detected, you’ll see an error like:
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
OutdatedDatabaseSchemaError: You have an old version of the database schema. Please refer to the documentation for more information.
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
This typically happens after upgrading the CLI to a newer version.
|
|
20
|
+
|
|
21
|
+
## How to Fix It
|
|
22
|
+
|
|
23
|
+
The recommended fix is to **delete the local database** and let the CLI recreate it. To avoid losing datasets, you should **export them before removing the database**.
|
|
24
|
+
|
|
25
|
+
Before deleting the file, we strongly recommend making a backup of your current database:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
cp .datachain/db .datachain/db.backup
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
This allows you to recover data manually if needed later.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Exporting and Re-Importing All Local Datasets
|
|
36
|
+
|
|
37
|
+
**Important:** Exporting datasets must be done **before upgrading** to a new DataChain version. Export with the old version to avoid the `OutdatedDatabaseSchemaError` during export. After deleting the database file, upgrade/install the new DataChain version.
|
|
38
|
+
|
|
39
|
+
### Step 1: Export All Datasets to Parquet
|
|
40
|
+
|
|
41
|
+
Export all datasets into a folder named `exported_datasets` (created if it doesn't exist). Each dataset will be saved to a file in the format:
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
<dataset_name>.<dataset_version>.parquet
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Example: `metrics.1.0.1.parquet`
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import os
|
|
51
|
+
import datachain as dc
|
|
52
|
+
|
|
53
|
+
export_dir = "exported_datasets"
|
|
54
|
+
os.makedirs(export_dir, exist_ok=True)
|
|
55
|
+
|
|
56
|
+
# dc.datasets() returns a chain of DatasetInfo objects
|
|
57
|
+
for ds_info in dc.datasets(column="dataset").to_values("dataset"):
|
|
58
|
+
ds = dc.read_dataset(ds_info.name, version=ds_info.version)
|
|
59
|
+
filename = f"{ds_info.name}.{ds_info.version}.parquet"
|
|
60
|
+
filepath = os.path.join(export_dir, filename)
|
|
61
|
+
ds.to_parquet(filepath)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Step 2: Delete Local Database
|
|
65
|
+
|
|
66
|
+
Make sure you've backed it up (see above), then:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
rm .datachain/db
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Step 3: Re-import All Datasets from Parquet (In Correct Version Order)
|
|
73
|
+
|
|
74
|
+
To avoid import errors due to semantic versioning constraints, datasets must be imported in ascending order by version for each dataset name.
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import os
|
|
78
|
+
import datachain as dc
|
|
79
|
+
from packaging.version import Version
|
|
80
|
+
|
|
81
|
+
import_dir = "exported_datasets"
|
|
82
|
+
|
|
83
|
+
# Gather all dataset files
|
|
84
|
+
datasets = []
|
|
85
|
+
|
|
86
|
+
for fname in os.listdir(import_dir):
|
|
87
|
+
if not fname.endswith(".parquet"):
|
|
88
|
+
continue
|
|
89
|
+
base = fname[:-8] # remove '.parquet'
|
|
90
|
+
name, version = base.split('.', 1) # split on first dot
|
|
91
|
+
filepath = os.path.join(import_dir, fname)
|
|
92
|
+
datasets.append((name, Version(version), filepath))
|
|
93
|
+
|
|
94
|
+
# Sort by dataset name and then by version ascending
|
|
95
|
+
datasets.sort(key=lambda x: (x[0], x[1]))
|
|
96
|
+
|
|
97
|
+
# Import datasets in order
|
|
98
|
+
for name, version, filepath in datasets:
|
|
99
|
+
dc.read_parquet(filepath).save(name, version=str(version))
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**Note:** While exporting and importing datasets to Parquet files preserves the datasets and their data, some metadata — such as dataset dependencies — will **not** be preserved. This information will be lost during this process.
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Notes
|
|
107
|
+
|
|
108
|
+
- This limitation only applies to the **CLI**, which uses a local SQLite database.
|
|
109
|
+
- The **Studio (SaaS)** version handles all schema migrations automatically — no manual steps are required.
|
|
110
|
+
- The CLI only supports the default namespace/project: `local.local`.
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
This export/import workflow is the recommended way to preserve your datasets during local CLI upgrades that involve database schema changes.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Environment Variables
|
|
2
|
+
|
|
3
|
+
List of environment variables used to configure DataChain behavior.
|
|
4
|
+
|
|
5
|
+
### Core Configuration
|
|
6
|
+
|
|
7
|
+
- `DATACHAIN_ROOT_DIR` – Specifies the root directory where DataChain will create the `.datachain` folder to store its internal data (default: the current working directory).
|
|
8
|
+
- `DATACHAIN_SYSTEM_CONFIG_DIR` – Overrides the system-wide configuration directory (default depends on the platform).
|
|
9
|
+
- `DATACHAIN_GLOBAL_CONFIG_DIR` – Overrides the user's global configuration directory (default depends on the platform).
|
|
10
|
+
- `DATACHAIN_NO_ANALYTICS` – Disables telemetry.
|
|
11
|
+
|
|
12
|
+
### Studio Integration
|
|
13
|
+
|
|
14
|
+
- `DATACHAIN_STUDIO_URL` – Custom Studio URL.
|
|
15
|
+
- `DATACHAIN_STUDIO_TOKEN` – Authentication token for Studio.
|
|
16
|
+
- `DATACHAIN_STUDIO_TEAM` – Studio team name.
|
|
17
|
+
|
|
18
|
+
### Namespaces and Projects
|
|
19
|
+
- `DATACHAIN_NAMESPACE` – Namespace name to use as default.
|
|
20
|
+
- `DATACHAIN_PROJECT` – Project name to use as default, or a namespace name and project name separated by `.` (for example, `DATACHAIN_PROJECT=dev.analytics`).
|
|
21
|
+
|
|
22
|
+
Note: Some environment variables are used internally and may not be documented here. For the most up-to-date list, refer to the source code.
|
|
@@ -10,3 +10,6 @@ Welcome to the DataChain User Guide! This section provides comprehensive documen
|
|
|
10
10
|
- [Data Processing Overview](./processing.md) - Discover DataChain's specialized data processing features.
|
|
11
11
|
- [Delta Processing](./delta.md) - Incremental data processing to efficiently handle large datasets that change over time.
|
|
12
12
|
- [Error Handling and Retries](./retry.md) - Learn how to handle processing errors and selectively reprocess problematic records.
|
|
13
|
+
- [Environment Variables](./env.md) - Configure DataChain's behavior using environment variables.
|
|
14
|
+
- [Namespaces](./namespaces.md) - Learn more about namespaces and projects.
|
|
15
|
+
- [Local DB Migrations](./db_migrations.md) - Learn how to handle local DB migrations after upgrading DataChain.
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# Organizing Datasets with Namespace and Project
|
|
2
|
+
|
|
3
|
+
DataChain allows you to organize datasets using namespaces and projects. These provide an additional structure for managing data across different workflows, use cases, or organizational structures.
|
|
4
|
+
|
|
5
|
+
A dataset in DataChain is organized as:
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
<namespace>.<project>.<dataset>
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
For example:
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
dev.analytics.metrics
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Default Namespace and Project
|
|
18
|
+
|
|
19
|
+
If no namespace or project is specified, DataChain uses defaults depending on whether you're using **Studio** or the **CLI**.
|
|
20
|
+
|
|
21
|
+
### Studio
|
|
22
|
+
|
|
23
|
+
- **Namespace:** `users`
|
|
24
|
+
- **Project:** your username (e.g. `jondoe`)
|
|
25
|
+
- Saving without namespace/project:
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
dc.read_values(scores=[1.2, 3.4, 2.5]).save("metrics")
|
|
29
|
+
# Saved as users.jondoe.metrics
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### CLI
|
|
33
|
+
|
|
34
|
+
- **Namespace:** `local`
|
|
35
|
+
- **Project:** `local`
|
|
36
|
+
- Saving without namespace/project:
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
dc.read_values(scores=[2.0, 2.2, 2.8]).save("metrics")
|
|
40
|
+
# Saved as local.local.metrics
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
In the CLI, you cannot create or use any namespaces or projects other than the default `local.local`.
|
|
44
|
+
|
|
45
|
+
## Creating a Project (Studio only)
|
|
46
|
+
|
|
47
|
+
In Studio, you can explicitly create a project and namespace using:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import datachain as dc
|
|
51
|
+
|
|
52
|
+
dc.create_project("dev", "analytics")
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
This creates the `dev` namespace (if it doesn't exist) and a project called `analytics` inside it.
|
|
56
|
+
|
|
57
|
+
**Note:** Creating custom namespaces and projects is only supported in **Studio**. In the **CLI**, only the default `local` namespace and `local` project are available.
|
|
58
|
+
|
|
59
|
+
## Saving a Dataset Using a Fully Qualified Name
|
|
60
|
+
|
|
61
|
+
You can implicitly create and use namespaces and projects by saving a dataset using a fully qualified name:
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
dc.read_values(scores=[1.2, 3.4, 2.5]).save("dev.analytics.metrics")
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
In Studio, this automatically creates the namespace and project if they don’t already exist.
|
|
68
|
+
|
|
69
|
+
In CLI, only `local.local.<dataset>` is supported. Using any other namespace or project will result in an error.
|
|
70
|
+
|
|
71
|
+
## Using `.settings()` to Set Namespace and Project
|
|
72
|
+
|
|
73
|
+
You can also set the namespace and project using `.settings()`:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
dc.read_values(scores=[1.2, 3.4, 2.5])
|
|
77
|
+
.settings(namespace="dev", project="analytics")
|
|
78
|
+
.save("metrics")
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
This is equivalent to saving to `dev.analytics.metrics`.
|
|
82
|
+
|
|
83
|
+
In CLI, `.settings()` is only supported when both `namespace` and `project` are set to `"local"`.
|
|
84
|
+
|
|
85
|
+
## Setting Namespace and Project via Environment Variables
|
|
86
|
+
|
|
87
|
+
In addition to using `.settings()`, you can configure the namespace and project using environment variables:
|
|
88
|
+
|
|
89
|
+
- `DATACHAIN_NAMESPACE` sets the namespace.
|
|
90
|
+
- `DATACHAIN_PROJECT` sets the project name, or both the namespace and project using the format `namespace.project`.
|
|
91
|
+
|
|
92
|
+
### Examples
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
# Set namespace only
|
|
96
|
+
export DATACHAIN_NAMESPACE=dev
|
|
97
|
+
|
|
98
|
+
# Set project only
|
|
99
|
+
export DATACHAIN_PROJECT=analytics
|
|
100
|
+
|
|
101
|
+
# Set both namespace and project
|
|
102
|
+
export DATACHAIN_PROJECT=dev.analytics
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## How Namespace and Project Are Resolved
|
|
106
|
+
|
|
107
|
+
When determining which namespace and project to use, DataChain applies the following precedence:
|
|
108
|
+
|
|
109
|
+
1. **Fully qualified dataset name**
|
|
110
|
+
If the dataset name includes both the namespace and project, these values take highest precedence.
|
|
111
|
+
```python
|
|
112
|
+
dc.read_dataset("dev.analytics.metrics")
```
|
|
113
|
+
|
|
114
|
+
2. **Explicit settings in code**
|
|
115
|
+
Values provided via `.settings()` or passed directly to `read_dataset()` or similar methods.
|
|
116
|
+
```python
|
|
117
|
+
dc.settings(namespace="dev", project="analytics")
|
|
118
|
+
dc.read_dataset("metrics", namespace="dev", project="analytics")
|
|
119
|
+
```
|
|
120
|
+
3. **Environment variables**
|
|
121
|
+
Namespace and project set using environment variables:
|
|
122
|
+
```console
|
|
123
|
+
export DATACHAIN_PROJECT=dev.analytics
|
|
124
|
+
```
|
|
125
|
+
4. **Defaults**
|
|
126
|
+
If none of the above are provided, DataChain falls back to the default namespace and project.
|
|
127
|
+
|
|
128
|
+
## Reading a Dataset from a Project
|
|
129
|
+
|
|
130
|
+
To read a dataset from a specific namespace and project:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
ds = dc.read_dataset("dev.analytics.metrics")
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
In CLI, this only works for datasets saved in the default `local.local` project.
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
## Example (Studio)
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
import datachain as dc
|
|
143
|
+
|
|
144
|
+
dc.create_project("prod", "analytics")
|
|
145
|
+
|
|
146
|
+
dc.read_csv("gs://bucket/metrics.csv") \
|
|
147
|
+
.save("prod.analytics.metrics")
|
|
148
|
+
|
|
149
|
+
ds = dc.read_dataset("prod.analytics.metrics")
|
|
150
|
+
ds.show()
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Example (CLI – default only)
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
import datachain as dc
|
|
157
|
+
|
|
158
|
+
dc.read_values(scores=[0.8, 1.5, 2.1]).save("metrics")
|
|
159
|
+
|
|
160
|
+
ds = dc.read_dataset("local.local.metrics")
|
|
161
|
+
ds.show()
|
|
@@ -48,7 +48,7 @@ def main():
|
|
|
48
48
|
|
|
49
49
|
# Print JSON schema in Pydantic format from main COCO annotation
|
|
50
50
|
chain = dc.read_storage(uri, anon="True").filter(dc.C("file.path").glob("*.json"))
|
|
51
|
-
file =
|
|
51
|
+
file = chain.limit(1).to_values("file")[0]
|
|
52
52
|
print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))
|
|
53
53
|
|
|
54
54
|
# Static JSON schema test parsing 3/7 objects
|
|
@@ -47,7 +47,7 @@ def process_files_with_delta():
|
|
|
47
47
|
print("\nDataset versions:")
|
|
48
48
|
test_dataset = dc.datasets().filter(C("name") == "test_files")
|
|
49
49
|
|
|
50
|
-
for version in test_dataset.
|
|
50
|
+
for version in test_dataset.to_iter("version"):
|
|
51
51
|
print(f"- Version: {version}")
|
|
52
52
|
|
|
53
53
|
# Show the last 3 records to demonstrate the incremental processing
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
1
3
|
from huggingface_hub import InferenceClient
|
|
2
4
|
from requests import HTTPError
|
|
3
5
|
|
|
@@ -23,6 +25,7 @@ def eval_dialog(
|
|
|
23
25
|
) -> DialogEval:
|
|
24
26
|
try:
|
|
25
27
|
completion = client.chat_completion(
|
|
28
|
+
model="meta-llama/Llama-3.3-70B-Instruct",
|
|
26
29
|
messages=[
|
|
27
30
|
{
|
|
28
31
|
"role": "user",
|
|
@@ -31,9 +34,10 @@ def eval_dialog(
|
|
|
31
34
|
],
|
|
32
35
|
response_format={"type": "json", "value": DialogEval.model_json_schema()},
|
|
33
36
|
)
|
|
34
|
-
except HTTPError:
|
|
37
|
+
except HTTPError as e:
|
|
35
38
|
return DialogEval(
|
|
36
|
-
result="Error",
|
|
39
|
+
result="Error",
|
|
40
|
+
reason=f"Error while interacting with the Hugging Face API. {e}",
|
|
37
41
|
)
|
|
38
42
|
|
|
39
43
|
message = completion.choices[0].message
|
|
@@ -48,9 +52,15 @@ def eval_dialog(
|
|
|
48
52
|
# Save to HF as Parquet. Dataset can be previewed here:
|
|
49
53
|
# https://huggingface.co/datasets/dvcorg/test-datachain-llm-eval/viewer
|
|
50
54
|
(
|
|
51
|
-
dc.read_csv(
|
|
52
|
-
|
|
53
|
-
|
|
55
|
+
dc.read_csv(
|
|
56
|
+
"hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv", source=False
|
|
57
|
+
)
|
|
58
|
+
.settings(parallel=True)
|
|
59
|
+
.setup(
|
|
60
|
+
client=lambda: InferenceClient(
|
|
61
|
+
provider="hf-inference", api_key=os.environ["HF_TOKEN"]
|
|
62
|
+
)
|
|
63
|
+
)
|
|
54
64
|
.map(response=eval_dialog)
|
|
55
65
|
.to_parquet("hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet")
|
|
56
66
|
)
|
|
@@ -105,6 +105,9 @@ nav:
|
|
|
105
105
|
- Overview: guide/processing.md
|
|
106
106
|
- Delta Processing: guide/delta.md
|
|
107
107
|
- Error Handling and Retries: guide/retry.md
|
|
108
|
+
- Environment Variables: guide/env.md
|
|
109
|
+
- Namespaces: guide/namespaces.md
|
|
110
|
+
- Local DB Migrations: guide/db_migrations.md
|
|
108
111
|
- 🤝 Contributing: contributing.md
|
|
109
112
|
|
|
110
113
|
- DataChain Website ↗: https://datachain.ai" target="_blank"
|
|
@@ -108,7 +108,7 @@ tests = [
|
|
|
108
108
|
]
|
|
109
109
|
dev = [
|
|
110
110
|
"datachain[docs,tests]",
|
|
111
|
-
"mypy==1.16.
|
|
111
|
+
"mypy==1.16.1",
|
|
112
112
|
"types-python-dateutil",
|
|
113
113
|
"types-pytz",
|
|
114
114
|
"types-PyYAML",
|
|
@@ -221,7 +221,8 @@ ignore = [
|
|
|
221
221
|
"PERF203", # perflint - try-except-in-loop, irrelevant for Python>=3.11
|
|
222
222
|
"PERF401",
|
|
223
223
|
"D100", # undocumented-public-module
|
|
224
|
-
"D205" # one-blank-line-after-class
|
|
224
|
+
"D205", # blank-line-after-summary
|
|
225
|
+
"PLC0415" # import-outside-top-level
|
|
225
226
|
]
|
|
226
227
|
select = [
|
|
227
228
|
"B", # flake8-bugbear
|
|
@@ -32,6 +32,7 @@ from datachain.lib.file import (
|
|
|
32
32
|
VideoFrame,
|
|
33
33
|
)
|
|
34
34
|
from datachain.lib.model_store import ModelStore
|
|
35
|
+
from datachain.lib.projects import create as create_project
|
|
35
36
|
from datachain.lib.udf import Aggregator, Generator, Mapper
|
|
36
37
|
from datachain.lib.utils import AbstractUDF, DataChainError
|
|
37
38
|
from datachain.query import metrics, param
|
|
@@ -62,6 +63,7 @@ __all__ = [
|
|
|
62
63
|
"VideoFile",
|
|
63
64
|
"VideoFragment",
|
|
64
65
|
"VideoFrame",
|
|
66
|
+
"create_project",
|
|
65
67
|
"datasets",
|
|
66
68
|
"delete_dataset",
|
|
67
69
|
"is_chain_type",
|
|
@@ -39,7 +39,7 @@ def temporary_cache(
|
|
|
39
39
|
cache.destroy()
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
class Cache:
|
|
42
|
+
class Cache: # noqa: PLW1641
|
|
43
43
|
def __init__(self, cache_dir: str, tmp_dir: str):
|
|
44
44
|
self.odb = LocalHashFileDB(
|
|
45
45
|
LocalFileSystem(),
|
|
@@ -76,9 +76,9 @@ class Cache:
|
|
|
76
76
|
async def download(
|
|
77
77
|
self, file: "File", client: "Client", callback: Optional[Callback] = None
|
|
78
78
|
) -> None:
|
|
79
|
-
from_path = f"{file.source}/{file.path}"
|
|
80
79
|
from dvc_objects.fs.utils import tmp_fname
|
|
81
80
|
|
|
81
|
+
from_path = file.get_uri()
|
|
82
82
|
odb_fs = self.odb.fs
|
|
83
83
|
tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
|
|
84
84
|
size = file.size
|