datachain 0.32.0__tar.gz → 0.32.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.32.0 → datachain-0.32.2}/.pre-commit-config.yaml +1 -1
- {datachain-0.32.0 → datachain-0.32.2}/PKG-INFO +12 -24
- {datachain-0.32.0 → datachain-0.32.2}/README.rst +10 -22
- {datachain-0.32.0 → datachain-0.32.2}/docs/api_hooks.py +7 -0
- datachain-0.32.2/docs/assets/webhook_dialog.png +0 -0
- datachain-0.32.2/docs/assets/webhook_list.png +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/guide/namespaces.md +23 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/datachain.md +2 -0
- datachain-0.32.2/docs/studio/webhooks.md +276 -0
- {datachain-0.32.0 → datachain-0.32.2}/mkdocs.yml +1 -0
- {datachain-0.32.0 → datachain-0.32.2}/pyproject.toml +1 -1
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/__init__.py +1 -1
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/dataset.py +2 -2
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/convert/python_to_sql.py +18 -4
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/parquet.py +20 -5
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/storage.py +12 -6
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/storage_pattern.py +50 -99
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/namespaces.py +4 -5
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain.egg-info/PKG-INFO +12 -24
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain.egg-info/SOURCES.txt +3 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_metastore.py +1 -1
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_storage_pattern.py +61 -5
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_audio.py +1 -1
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_datachain.py +5 -5
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_namespace.py +8 -1
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_project.py +1 -1
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_python_to_sql.py +19 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_storage_pattern.py +88 -22
- {datachain-0.32.0 → datachain-0.32.2}/.cruft.json +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.gitattributes +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.github/codecov.yaml +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.github/dependabot.yml +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.github/workflows/release.yml +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.github/workflows/tests.yml +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/.gitignore +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/LICENSE +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/assets/datachain.svg +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/commands/auth/login.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/commands/auth/logout.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/commands/auth/team.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/commands/auth/token.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/commands/index.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/commands/job/cancel.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/commands/job/clusters.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/commands/job/logs.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/commands/job/ls.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/commands/job/run.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/contributing.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/examples.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/guide/db_migrations.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/guide/delta.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/guide/env.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/guide/index.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/guide/processing.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/guide/remotes.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/guide/retry.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/index.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/overrides/main.html +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/quick-start.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/data-types/file.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/data-types/index.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/data-types/pose.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/data-types/segment.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/func.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/functions/aggregate.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/functions/array.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/functions/conditional.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/functions/numeric.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/functions/path.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/functions/random.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/functions/string.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/functions/window.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/index.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/toolkit.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/torch.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/references/udf.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/studio/api/.gitkeep +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/templates/main.dot +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/templates/operation.dot +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/templates/responses.def +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/docs/tutorials.md +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/get_started/nested_datamodel.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/multimodal/wds.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/noxfile.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/setup.cfg +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/__main__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/asyn.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cache.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/cli/utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/client/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/client/azure.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/client/gcs.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/client/hf.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/client/local.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/client/s3.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/config.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/delta.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/error.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/fs/reference.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/fs/utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/func/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/func/array.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/func/base.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/func/conditional.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/func/func.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/func/numeric.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/func/path.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/func/random.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/func/string.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/func/window.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/job.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/audio.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/clip.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/datachain.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/file.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/hf.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/image.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/listing.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/projects.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/settings.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/tar.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/text.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/udf.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/video.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/listing.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/model/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/model/bbox.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/model/pose.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/model/segment.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/model/utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/namespace.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/node.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/progress.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/project.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/py.typed +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/query/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/query/batch.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/query/dataset.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/query/metrics.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/query/params.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/query/queue.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/query/schema.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/query/session.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/query/udf.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/query/utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/remote/studio.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/script_meta.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/semver.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/postgresql_dialect.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/postgresql_types.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/types.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/sql/utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/studio.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/telemetry.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain/utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/conftest.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/data.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/examples/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/examples/test_examples.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/examples/wds_data.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/data/lena.jpg +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/functions/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/functions/test_array.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/functions/test_path.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/functions/test_random.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/functions/test_string.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/model/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_audio.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_batching.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_catalog.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_client.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_data_storage.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_datachain.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_datasets.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_delta.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_file.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_hf.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_image.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_listing.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_ls.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_metrics.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_mutate.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_pull.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_pytorch.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_query.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_read_database.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_retry.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_session.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_studio_datetime_parsing.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_to_database.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_toolkit.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_video.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/func/test_warehouse.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/scripts/feature_class.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/test_atomicity.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/test_cli_e2e.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/test_cli_studio.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/test_import_time.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/test_query_e2e.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/test_telemetry.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_settings.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/model/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_asyn.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_cache.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_catalog.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_cli_datasets.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_client.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_config.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_dataset.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_func.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_listing.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_metastore.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_query.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_query_params.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_semver.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_serializer.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_session.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_utils.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.32.0 → datachain-0.32.2}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.32.0
|
|
3
|
+
Version: 0.32.2
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -102,7 +102,7 @@ Requires-Dist: scipy; extra == "tests"
|
|
|
102
102
|
Requires-Dist: ultralytics; extra == "tests"
|
|
103
103
|
Provides-Extra: dev
|
|
104
104
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
105
|
-
Requires-Dist: mypy==1.
|
|
105
|
+
Requires-Dist: mypy==1.18.1; extra == "dev"
|
|
106
106
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
107
107
|
Requires-Dist: types-dateparser; extra == "dev"
|
|
108
108
|
Requires-Dist: types-pytz; extra == "dev"
|
|
@@ -210,45 +210,33 @@ datasets that evolve over time and may occasionally have processing errors.
|
|
|
210
210
|
.. code:: py
|
|
211
211
|
|
|
212
212
|
import datachain as dc
|
|
213
|
-
from datachain import C, File
|
|
214
213
|
|
|
215
|
-
def process_file(file: File):
|
|
216
|
-
"""
|
|
214
|
+
def process_file(file: dc.File) -> tuple[str, str, str]:
|
|
215
|
+
"""Analyze a file, may occasionally fail."""
|
|
217
216
|
try:
|
|
218
217
|
# Your processing logic here
|
|
219
218
|
content = file.read_text()
|
|
220
|
-
result =
|
|
221
|
-
return
|
|
222
|
-
"content": content,
|
|
223
|
-
"result": result,
|
|
224
|
-
"error": None # No error
|
|
225
|
-
}
|
|
219
|
+
result = content.upper()
|
|
220
|
+
return content, result, "" # No error
|
|
226
221
|
except Exception as e:
|
|
227
222
|
# Return an error that will trigger reprocessing next time
|
|
228
|
-
return
|
|
229
|
-
"content": None,
|
|
230
|
-
"result": None,
|
|
231
|
-
"error": str(e) # Error field will trigger retry
|
|
232
|
-
}
|
|
223
|
+
return "", "", str(e) # Error field will trigger retry
|
|
233
224
|
|
|
234
225
|
# Process files efficiently with delta and retry
|
|
226
|
+
# Run it many times, keep adding files, to see delta and retry in action
|
|
235
227
|
chain = (
|
|
236
228
|
dc.read_storage(
|
|
237
229
|
"data/",
|
|
238
230
|
update=True,
|
|
239
231
|
delta=True, # Process only new/changed files
|
|
240
232
|
delta_on="file.path", # Identify files by path
|
|
241
|
-
|
|
233
|
+
delta_retry="error", # Process files with error again
|
|
242
234
|
)
|
|
243
|
-
.map(
|
|
244
|
-
.
|
|
245
|
-
content=C("processed_result.content"),
|
|
246
|
-
result=C("processed_result.result"),
|
|
247
|
-
error=C("processed_result.error")
|
|
248
|
-
)
|
|
249
|
-
.save(name="processed_data")
|
|
235
|
+
.map(process_file, output=("content", "result", "error"))
|
|
236
|
+
.save("processed-data")
|
|
250
237
|
)
|
|
251
238
|
|
|
239
|
+
|
|
252
240
|
Example: LLM based text-file evaluation
|
|
253
241
|
---------------------------------------
|
|
254
242
|
|
|
@@ -89,45 +89,33 @@ datasets that evolve over time and may occasionally have processing errors.
|
|
|
89
89
|
.. code:: py
|
|
90
90
|
|
|
91
91
|
import datachain as dc
|
|
92
|
-
from datachain import C, File
|
|
93
92
|
|
|
94
|
-
def process_file(file: File):
|
|
95
|
-
"""
|
|
93
|
+
def process_file(file: dc.File) -> tuple[str, str, str]:
|
|
94
|
+
"""Analyze a file, may occasionally fail."""
|
|
96
95
|
try:
|
|
97
96
|
# Your processing logic here
|
|
98
97
|
content = file.read_text()
|
|
99
|
-
result =
|
|
100
|
-
return
|
|
101
|
-
"content": content,
|
|
102
|
-
"result": result,
|
|
103
|
-
"error": None # No error
|
|
104
|
-
}
|
|
98
|
+
result = content.upper()
|
|
99
|
+
return content, result, "" # No error
|
|
105
100
|
except Exception as e:
|
|
106
101
|
# Return an error that will trigger reprocessing next time
|
|
107
|
-
return
|
|
108
|
-
"content": None,
|
|
109
|
-
"result": None,
|
|
110
|
-
"error": str(e) # Error field will trigger retry
|
|
111
|
-
}
|
|
102
|
+
return "", "", str(e) # Error field will trigger retry
|
|
112
103
|
|
|
113
104
|
# Process files efficiently with delta and retry
|
|
105
|
+
# Run it many times, keep adding files, to see delta and retry in action
|
|
114
106
|
chain = (
|
|
115
107
|
dc.read_storage(
|
|
116
108
|
"data/",
|
|
117
109
|
update=True,
|
|
118
110
|
delta=True, # Process only new/changed files
|
|
119
111
|
delta_on="file.path", # Identify files by path
|
|
120
|
-
|
|
112
|
+
delta_retry="error", # Process files with error again
|
|
121
113
|
)
|
|
122
|
-
.map(
|
|
123
|
-
.
|
|
124
|
-
content=C("processed_result.content"),
|
|
125
|
-
result=C("processed_result.result"),
|
|
126
|
-
error=C("processed_result.error")
|
|
127
|
-
)
|
|
128
|
-
.save(name="processed_data")
|
|
114
|
+
.map(process_file, output=("content", "result", "error"))
|
|
115
|
+
.save("processed-data")
|
|
129
116
|
)
|
|
130
117
|
|
|
118
|
+
|
|
131
119
|
Example: LLM based text-file evaluation
|
|
132
120
|
---------------------------------------
|
|
133
121
|
|
|
@@ -4,6 +4,13 @@ def on_pre_build(**kwargs):
|
|
|
4
4
|
|
|
5
5
|
import requests
|
|
6
6
|
|
|
7
|
+
# Skip if files already exist
|
|
8
|
+
if os.path.exists("docs/openapi.json") and os.path.exists(
|
|
9
|
+
"docs/studio/api/index.md"
|
|
10
|
+
):
|
|
11
|
+
print("API docs already exist, skipping generation")
|
|
12
|
+
return
|
|
13
|
+
|
|
7
14
|
# Download OpenAPI spec
|
|
8
15
|
response = requests.get(
|
|
9
16
|
"https://studio.datachain.ai/api/openapi.json",
|
|
Binary file
|
|
Binary file
|
|
@@ -159,3 +159,26 @@ dc.read_values(scores=[0.8, 1.5, 2.1]).save("metrics")
|
|
|
159
159
|
|
|
160
160
|
ds = dc.read_dataset("local.local.metrics")
|
|
161
161
|
ds.show()
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## Removing Namespaces and Projects
|
|
165
|
+
|
|
166
|
+
Use `delete_namespace` to remove an empty namespace or an empty project within a namespace. Delete will fail if the target is not empty.
|
|
167
|
+
|
|
168
|
+
### Signature
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
def delete_namespace(name: str, session: Optional[Session]) -> None:
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
- **`<namespace>`** — deletes the namespace (must contain no projects or datasets).
|
|
175
|
+
- **`<namespace>.<project>`** — deletes the project (must contain no datasets).
|
|
176
|
+
|
|
177
|
+
### Examples
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
import datachain as dc
|
|
181
|
+
|
|
182
|
+
dc.delete_namespace("dev.my-project") # delete project
|
|
183
|
+
dc.delete_namespace("dev") # delete namespace
|
|
184
|
+
```
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
# Webhooks in Studio
|
|
2
|
+
|
|
3
|
+
## About webhooks
|
|
4
|
+
|
|
5
|
+
Webhooks provide a way for notifications to be delivered to an external web server whenever certain events occur in [Studio](https://studio.datachain.ai). With webhooks, you configure once which events or activities you want to hear about.
|
|
6
|
+
|
|
7
|
+
When you create a webhook, you specify a URL, any additional information you want us to send along, and the events that you want to listen for in Datachain. When one of those events occurs, Datachain Studio will send an HTTP request with the data about the event to the URL that you specified. If your server is set up to listen for webhook deliveries at that URL, it can take action when it receives one.
|
|
8
|
+
|
|
9
|
+
For example, you can subscribe your webhook to events that occur when a job is created, starts running, completes, fails, and so on. You can then use this webhook to monitor whenever a job fails.
|
|
10
|
+
|
|
11
|
+
### Alternative
|
|
12
|
+
As an alternative to webhooks, you can use the [CLI command](../commands/index.md) to get the job information, or some of our available [API endpoints](api/index.md). However, webhooks require less effort than polling an API, since they deliver near real-time updates.
|
|
13
|
+
|
|
14
|
+
## Available event type
|
|
15
|
+
As of now, your server can receive two different types of events.
|
|
16
|
+
|
|
17
|
+
### JOB
|
|
18
|
+
|
|
19
|
+
Whenever a job is created or its status changes, you will receive the JOB webhook event. The payload you get with the job webhook is as follows:
|
|
20
|
+
|
|
21
|
+
Header: `http-x-datachain-event`: `JOB`
|
|
22
|
+
|
|
23
|
+
Payload:
|
|
24
|
+
```json
|
|
25
|
+
{
|
|
26
|
+
"action": "job_status",
|
|
27
|
+
"job": {
|
|
28
|
+
"id": "da59df47-d121-4eb6-aa76-dc452755544e",
|
|
29
|
+
"status": "COMPLETE",
|
|
30
|
+
"error_message": "",
|
|
31
|
+
"name": "job_query.py",
|
|
32
|
+
"created_at": "2021-07-27T16:02:08.070557",
|
|
33
|
+
"updated_at": "2021-07-27T16:22:08.070557",
|
|
34
|
+
"finished_at": "2021-07-27T16:22:08.070557",
|
|
35
|
+
"url": "https://studio.datachain.ai/team/TeamName/datasets/jobs/da59df47-d121-4eb6-aa76-dc452755544e"
|
|
36
|
+
},
|
|
37
|
+
"timestamp": "2021-07-27T16:22:08.070557",
|
|
38
|
+
"text": "Job job_query.py (da59df47-d121-4eb6-aa76-dc452755544e) changed its status to COMPLETE"
|
|
39
|
+
}
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### PING
|
|
43
|
+
Whenever you add your webhook to your team, Studio sends a PING event to check the delivery to the server. You can check the recent deliveries to check if the webhook is successfully connected.
|
|
44
|
+
|
|
45
|
+
Header: `http-x-datachain-event`: `PING`.
|
|
46
|
+
|
|
47
|
+
Payload:
|
|
48
|
+
```json
|
|
49
|
+
{
|
|
50
|
+
"action": "PING",
|
|
51
|
+
"message": "Webhook connection test successful"
|
|
52
|
+
}
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
## Creating webhooks
|
|
57
|
+
|
|
58
|
+
You must have admin access to a team to create webhooks in that team. To create a webhook, go to the team settings and, under the Webhooks section, click Add new Webhook.
|
|
59
|
+

|
|
60
|
+
|
|
61
|
+
Enter the necessary information to create the webhooks.
|
|
62
|
+
|
|
63
|
+
- **URL:** Enter the valid URL where you’d like to receive the webhook payload.
|
|
64
|
+
- **Secret:** A string to use as a secret key. You should choose a random string of text with high entropy. You can use the webhook secret to [validate incoming requests](#validating-webhook-deliveries) to those only originating from Datachain Studio.
|
|
65
|
+
- **Events:** Under events, select the events you would like to trigger the webhook.
|
|
66
|
+
- **JOB:**
|
|
67
|
+
- CREATED: When a job is created but not yet scheduled to run
|
|
68
|
+
- SCHEDULED: Job has been scheduled to run
|
|
69
|
+
- QUEUED: Job is in the queue waiting to be processed
|
|
70
|
+
- INIT: Job is initializing (starting up)
|
|
71
|
+
- RUNNING: When a job starts running
|
|
72
|
+
- COMPLETE: Job has completed successfully
|
|
73
|
+
- FAILED: Job failed with error
|
|
74
|
+
- CANCELED: Job has been canceled successfully
|
|
75
|
+
- CANCELING: Job has been scheduled to cancel
|
|
76
|
+
- TASK: A scheduled task is created.
|
|
77
|
+
|
|
78
|
+
- SSL Verification: By default, we verify SSL certificates when delivering payloads. SSL verification helps ensure that hook payloads are delivered to your URL endpoint securely, keeping your data away from prying eyes. Disabling this option is **not recommended**.
|
|
79
|
+
- HTTP Method: By default, we make a `POST` request, but you can specify another HTTP method if necessary.
|
|
80
|
+
- Content Type: Optionally, select the data format you want to receive the webhook payload in
|
|
81
|
+
- **application/json** will deliver the JSON payload directly as the body of the `POST` request.
|
|
82
|
+
- **application/x-www-form-urlencoded** will send the JSON payload as a form parameter called `payload`.
|
|
83
|
+
|
|
84
|
+

|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
## Handling webhook deliveries
|
|
88
|
+
|
|
89
|
+
When you create a webhook, you specify a URL and subscribe to event types. When any event that your webhook is subscribed to occurs, Datachain Studio will send an HTTP request with the data about the event to the URL that you specified. If your server is set up to listen at that URL, it can take action when it receives one.
|
|
90
|
+
|
|
91
|
+
### Setup
|
|
92
|
+
|
|
93
|
+
In order to test your webhook locally, you can use a webhook proxy URL to forward the webhooks from Studio to your computer or codespace. We are using [smee.io](http://smee.io) to provide a webhook proxy url and forward webhooks.
|
|
94
|
+
|
|
95
|
+
1. Go to [smee.io](http://smee.io)
|
|
96
|
+
2. Start a new channel
|
|
97
|
+
3. Copy the full URL under the webhook proxy URL. We will use this URL in the following setup steps.
|
|
98
|
+
4. Install smee-client if it is not already installed using `npm install --global smee-client`
|
|
99
|
+
5. To receive forwarded webhooks from smee.io, run the following command in your terminal. Replace the `WEBHOOK_PROXY_URL` with your webhook proxy URL from earlier.
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
smee --path /webhook --port 3000 --url WEBHOOK_PROXY_URL
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
6. Keep this running while you test out your webhook. When you want to stop forwarding the webhooks, enter Ctrl + C
|
|
106
|
+
7. Create a webhook using the steps described above, or edit an existing one, using the webhook proxy URL from earlier.
|
|
107
|
+
8. Write code to handle webhook deliveries
|
|
108
|
+
1. Initialize your server to listen for requests to your webhook URL
|
|
109
|
+
2. Read HTTP headers and body from request
|
|
110
|
+
3. Take desired action in response to the request.
|
|
111
|
+
|
|
112
|
+
You can use any programming language that you can run on your server.
|
|
113
|
+
|
|
114
|
+
### Example Code
|
|
115
|
+
|
|
116
|
+
#### Python
|
|
117
|
+
|
|
118
|
+
This example uses Python and the Flask library to handle the routes and HTTP requests.
|
|
119
|
+
|
|
120
|
+
To use this, you must install the Flask library in your project. For example:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
pip install Flask
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Create a Python file with the following contents. Modify the code to handle only the event types that your webhook is subscribed to, as well as the ping event that Studio sends when you create a webhook. This example handles job and ping events.
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
# You installed the `flask` library earlier.
|
|
130
|
+
from flask import Flask, request
|
|
131
|
+
|
|
132
|
+
# This defines the port where your server should listen.
|
|
133
|
+
# 3000 matches the port that you specified for webhook forwarding.
|
|
134
|
+
#
|
|
135
|
+
# Once you deploy your code to a server,
|
|
136
|
+
# Change this to match the port where your server is listening.
|
|
137
|
+
port = 3000
|
|
138
|
+
secret = "secretString"
|
|
139
|
+
|
|
140
|
+
# This initializes a new Flask application.
|
|
141
|
+
app = Flask(__name__)
|
|
142
|
+
|
|
143
|
+
# This defines a POST route at the `/webhook` path.
|
|
144
|
+
# It matches the path you specified for the smee.io forwarding.
|
|
145
|
+
#
|
|
146
|
+
# Once you deploy your code to a server and update your webhook URL,
|
|
147
|
+
# Change this to match the path portion of the URL for your webhook.
|
|
148
|
+
@app.route('/webhook', methods=['POST'])
|
|
149
|
+
def webhook():
|
|
150
|
+
# Respond to indicate that delivery was successfully received.
|
|
151
|
+
# Your server should respond with a 2XX response
|
|
152
|
+
# within 10 seconds of receiving a webhook delivery.
|
|
153
|
+
# If your server takes longer than that to respond,
|
|
154
|
+
# then Studio terminates the connection.
|
|
155
|
+
|
|
156
|
+
# Check `http-x-datachain-event` header for the event type.
|
|
157
|
+
datachain_event = request.headers.get('http-x-datachain-event')
|
|
158
|
+
|
|
159
|
+
# You should add logic to handle each event type
|
|
160
|
+
# that your webhook is subscribed to.
|
|
161
|
+
# For example, this code handles the `JOB` and `PING` events.
|
|
162
|
+
if datachain_event == 'JOB':
|
|
163
|
+
data = request.get_json()
|
|
164
|
+
action = data.get('action')
|
|
165
|
+
if action == 'job_status':
|
|
166
|
+
print(
|
|
167
|
+
f"Job status for job {data['job']['id']} was" \
|
|
168
|
+
" changed to {data['job']['status']}"
|
|
169
|
+
)
|
|
170
|
+
else:
|
|
171
|
+
print(f"Unhandled action for the job event: {action}")
|
|
172
|
+
elif datachain_event == 'PING':
|
|
173
|
+
print('Ping event received')
|
|
174
|
+
else:
|
|
175
|
+
print(f"Unhandled event: {datachain_event}")
|
|
176
|
+
|
|
177
|
+
return '', 202 # 202 Accepted status code
|
|
178
|
+
|
|
179
|
+
# This starts the server.
|
|
180
|
+
if __name__ == '__main__':
|
|
181
|
+
app.run(host='0.0.0.0', port=port, debug=True)
|
|
182
|
+
print(f"Server is running on port {port}")
|
|
183
|
+
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
To test the code, run the file using `python FILENAME`. Make sure that you are forwarding the webhooks in a separate terminal.
|
|
187
|
+
|
|
188
|
+
When you run a job in Studio, you will see output similar to the following:
|
|
189
|
+
|
|
190
|
+
```prolog
|
|
191
|
+
Ping event received
|
|
192
|
+
Job status for job a852ee4a-091a-456f-ba1a-c809f7e804f3 was changed to CREATED
|
|
193
|
+
Job status for job a852ee4a-091a-456f-ba1a-c809f7e804f3 was changed to SCHEDULED
|
|
194
|
+
Job status for job a852ee4a-091a-456f-ba1a-c809f7e804f3 was changed to QUEUED
|
|
195
|
+
Job status for job a852ee4a-091a-456f-ba1a-c809f7e804f3 was changed to INIT
|
|
196
|
+
Job status for job a852ee4a-091a-456f-ba1a-c809f7e804f3 was changed to RUNNING
|
|
197
|
+
Job status for job a852ee4a-091a-456f-ba1a-c809f7e804f3 was changed to COMPLETE
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
## Validating webhook deliveries
|
|
201
|
+
|
|
202
|
+
Once your server is configured to receive payloads, it will listen for any delivery that’s sent to the endpoint you configured. To ensure that your server only processes webhook deliveries that were sent by Datachain Studio, and that the delivery was not tampered with, you should validate the webhook signature before processing the delivery further.
|
|
203
|
+
|
|
204
|
+
Studio will use the secret you added when creating your webhook to create a hash signature that’s sent to you with each payload. The hash signature will appear in each delivery as the value of `X-datachain-signature-256` header.
|
|
205
|
+
|
|
206
|
+
In your code that handles webhook deliveries, you should calculate a hash using your secret token and compare the hash Studio sent with the expected hash that you calculate and ensure they match.
|
|
207
|
+
|
|
208
|
+
Notes:
|
|
209
|
+
|
|
210
|
+
- Studio uses HMAC hex digest to compute the hash
|
|
211
|
+
- The hash signature always starts with `sha256=`
|
|
212
|
+
- The hash signature is generated using webhook’s secret token and payload contents.
|
|
213
|
+
- Never use a plain `==` operator to compare signatures. Instead, use a method like Python's [`hmac.compare_digest`](https://docs.python.org/3/library/hmac.html#hmac.compare_digest), [`secure_compare`](https://www.rubydoc.info/gems/rack/Rack%2FUtils:secure_compare) or [`crypto.timingSafeEqual`](https://nodejs.org/api/crypto.html#cryptotimingsafeequala-b), which performs a "constant time" string comparison to help mitigate certain timing attacks against regular equality operators, or regular loops in JIT-optimized languages.
|
|
214
|
+
|
|
215
|
+
Updating the example above:
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
import hashlib
|
|
219
|
+
import hmac
|
|
220
|
+
from flask import abort
|
|
221
|
+
|
|
222
|
+
def verify_signature(payload_body, secret_token, signature_header):
|
|
223
|
+
"""Verify the payload was sent from Studio by validating SHA256.
|
|
224
|
+
|
|
225
|
+
Raise and return 403 if not authorized.
|
|
226
|
+
|
|
227
|
+
Args:
|
|
228
|
+
payload_body: raw request body bytes to verify (request.get_data())
|
|
229
|
+
secret_token: Studio webhook token (WEBHOOK_SECRET)
|
|
230
|
+
signature_header: header (x-datachain-signature-256)
|
|
231
|
+
"""
|
|
232
|
+
if not signature_header:
|
|
233
|
+
abort(403, "X-datachain-signature-256 is missing!")
|
|
234
|
+
hash_object = hmac.new(
|
|
235
|
+
secret_token.encode('utf-8'),
|
|
236
|
+
msg=payload_body,
|
|
237
|
+
digestmod=hashlib.sha256
|
|
238
|
+
)
|
|
239
|
+
expected_signature = "sha256=" + hash_object.hexdigest()
|
|
240
|
+
if not hmac.compare_digest(
|
|
241
|
+
expected_signature, signature_header
|
|
242
|
+
):
|
|
243
|
+
abort(403, "Request signatures didn't match!")
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
Add the following call in the api receiver.
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
# Get the signature header
|
|
250
|
+
signature = request.headers.get('X-Datachain-Signature-256')
|
|
251
|
+
|
|
252
|
+
# Re-enable signature verification with improved JSON handling
|
|
253
|
+
if signature:
|
|
254
|
+
verify_signature(request.get_data(), secret, signature)
|
|
255
|
+
else:
|
|
256
|
+
print("Warning: No signature header found")
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
## Slack Integration
|
|
260
|
+
|
|
261
|
+
You can also use the webhook feature to send messages to Slack. To integrate Slack with Studio:
|
|
262
|
+
|
|
263
|
+
1. Using the guide as described in [Slack documentation](https://docs.slack.dev/messaging/sending-messages-using-incoming-webhooks/) , create an incoming webhook and copy the webhook address in the following format `https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX` .
|
|
264
|
+
2. Use the address to create a webhook in Studio as described [above](#creating-webhooks)
|
|
265
|
+
3. You should now be able to view the messages in the connected Slack channel.
|
|
266
|
+
|
|
267
|
+
## Best practices for using Webhooks
|
|
268
|
+
|
|
269
|
+
1. You should only subscribe to the webhook events that you need. This will reduce the amount of work your server needs to do.
|
|
270
|
+
2. The webhook secret should be a random string of text with high entropy. You should securely store your webhook secret in a way that your server can access.
|
|
271
|
+
3. You should ensure that your server uses an HTTPS connection. By default, Studio will verify SSL certificates when delivering webhooks. Studio recommends that you leave SSL verification enabled.
|
|
272
|
+
4. Your server should respond with a 2XX response within 10 seconds of receiving a webhook delivery. If your server takes longer than that to respond, then Studio terminates the connection and considers the delivery a failure.
|
|
273
|
+
5. Check the event header and action type before processing the event.
|
|
274
|
+
6. Make sure the endpoints are idempotent, meaning that if multiple requests for the same event are received, the server handles them correctly.
|
|
275
|
+
7. Datachain Studio may deliver webhooks in a different order than the order in which the events took place. If you need to know when the event occurred relative to another event, you should use the timestamps that are included in the delivery payload.
|
|
276
|
+
8. Ten consecutive delivery failures will disable deliveries for the webhook.
|
|
@@ -37,7 +37,7 @@ from datachain.lib.file import (
|
|
|
37
37
|
VideoFrame,
|
|
38
38
|
)
|
|
39
39
|
from datachain.lib.model_store import ModelStore
|
|
40
|
-
from datachain.lib.namespaces import
|
|
40
|
+
from datachain.lib.namespaces import delete_namespace
|
|
41
41
|
from datachain.lib.projects import create as create_project
|
|
42
42
|
from datachain.lib.udf import Aggregator, Generator, Mapper
|
|
43
43
|
from datachain.lib.utils import AbstractUDF, DataChainError
|
|
@@ -619,7 +619,7 @@ class DatasetRecord:
|
|
|
619
619
|
if not self.versions:
|
|
620
620
|
return "1.0.0"
|
|
621
621
|
|
|
622
|
-
major,
|
|
622
|
+
major, _, _ = semver.parse(self.latest_version)
|
|
623
623
|
return semver.create(major + 1, 0, 0)
|
|
624
624
|
|
|
625
625
|
@property
|
|
@@ -630,7 +630,7 @@ class DatasetRecord:
|
|
|
630
630
|
if not self.versions:
|
|
631
631
|
return "1.0.0"
|
|
632
632
|
|
|
633
|
-
major, minor,
|
|
633
|
+
major, minor, _ = semver.parse(self.latest_version)
|
|
634
634
|
return semver.create(major, minor + 1, 0)
|
|
635
635
|
|
|
636
636
|
@property
|
|
@@ -1,8 +1,14 @@
|
|
|
1
1
|
import inspect
|
|
2
|
+
import sys
|
|
2
3
|
from datetime import datetime
|
|
3
4
|
from enum import Enum
|
|
4
5
|
from typing import Annotated, Literal, Union, get_args, get_origin
|
|
5
6
|
|
|
7
|
+
if sys.version_info >= (3, 10):
|
|
8
|
+
from types import UnionType
|
|
9
|
+
else:
|
|
10
|
+
UnionType = None
|
|
11
|
+
|
|
6
12
|
from pydantic import BaseModel
|
|
7
13
|
from typing_extensions import Literal as LiteralEx
|
|
8
14
|
|
|
@@ -34,6 +40,13 @@ PYTHON_TO_SQL = {
|
|
|
34
40
|
}
|
|
35
41
|
|
|
36
42
|
|
|
43
|
+
def _is_union(orig) -> bool:
|
|
44
|
+
if orig == Union:
|
|
45
|
+
return True
|
|
46
|
+
# some code is unreachable in python<3.10
|
|
47
|
+
return UnionType is not None and orig is UnionType # type: ignore[unreachable]
|
|
48
|
+
|
|
49
|
+
|
|
37
50
|
def python_to_sql(typ): # noqa: PLR0911
|
|
38
51
|
if inspect.isclass(typ):
|
|
39
52
|
if issubclass(typ, SQLType):
|
|
@@ -69,9 +82,10 @@ def python_to_sql(typ): # noqa: PLR0911
|
|
|
69
82
|
if inspect.isclass(orig) and issubclass(dict, orig):
|
|
70
83
|
return JSON
|
|
71
84
|
|
|
72
|
-
if orig
|
|
85
|
+
if _is_union(orig):
|
|
73
86
|
if len(args) == 2 and (type(None) in args):
|
|
74
|
-
|
|
87
|
+
non_none_arg = args[0] if args[0] is not type(None) else args[1]
|
|
88
|
+
return python_to_sql(non_none_arg)
|
|
75
89
|
|
|
76
90
|
if _is_union_str_literal(orig, args):
|
|
77
91
|
return String
|
|
@@ -95,7 +109,7 @@ def list_of_args_to_type(args) -> SQLType:
|
|
|
95
109
|
|
|
96
110
|
|
|
97
111
|
def _is_json_inside_union(orig, args) -> bool:
|
|
98
|
-
if orig
|
|
112
|
+
if _is_union(orig) and len(args) >= 2:
|
|
99
113
|
# List in JSON: Union[dict, list[dict]]
|
|
100
114
|
args_no_nones = [arg for arg in args if arg != type(None)] # noqa: E721
|
|
101
115
|
if len(args_no_nones) == 2:
|
|
@@ -112,6 +126,6 @@ def _is_json_inside_union(orig, args) -> bool:
|
|
|
112
126
|
|
|
113
127
|
|
|
114
128
|
def _is_union_str_literal(orig, args) -> bool:
|
|
115
|
-
if orig
|
|
129
|
+
if not _is_union(orig):
|
|
116
130
|
return False
|
|
117
131
|
return all(arg is str or get_origin(arg) in (Literal, LiteralEx) for arg in args)
|
|
@@ -26,8 +26,14 @@ def read_parquet(
|
|
|
26
26
|
"""Generate chain from parquet files.
|
|
27
27
|
|
|
28
28
|
Parameters:
|
|
29
|
-
path: Storage
|
|
30
|
-
|
|
29
|
+
path: Storage path(s) or URI(s). Can be a local path or start with a
|
|
30
|
+
storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
|
|
31
|
+
Supports glob patterns:
|
|
32
|
+
- `*` : wildcard
|
|
33
|
+
- `**` : recursive wildcard
|
|
34
|
+
- `?` : single character
|
|
35
|
+
- `{a,b}` : brace expansion list
|
|
36
|
+
- `{1..9}` : brace numeric or alphabetic range
|
|
31
37
|
partitioning: Any pyarrow partitioning schema.
|
|
32
38
|
output: Dictionary defining column names and their corresponding types.
|
|
33
39
|
column: Created column name.
|
|
@@ -43,10 +49,19 @@ def read_parquet(
|
|
|
43
49
|
dc.read_parquet("s3://mybucket/file.parquet")
|
|
44
50
|
```
|
|
45
51
|
|
|
46
|
-
|
|
52
|
+
All files from a directory:
|
|
47
53
|
```py
|
|
48
|
-
|
|
49
|
-
|
|
54
|
+
dc.read_parquet("s3://mybucket/dir/")
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Only parquet files from a directory, and all its subdirectories:
|
|
58
|
+
```py
|
|
59
|
+
dc.read_parquet("s3://mybucket/dir/**/*.parquet")
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Using filename patterns - numeric, list, starting with zeros:
|
|
63
|
+
```py
|
|
64
|
+
dc.read_parquet("s3://mybucket/202{1..4}/{yellow,green}-{01..12}.parquet")
|
|
50
65
|
```
|
|
51
66
|
"""
|
|
52
67
|
from .storage import read_storage
|
|
@@ -51,7 +51,8 @@ def read_storage(
|
|
|
51
51
|
- `*` : wildcard
|
|
52
52
|
- `**` : recursive wildcard
|
|
53
53
|
- `?` : single character
|
|
54
|
-
- `{a,b}` : brace expansion
|
|
54
|
+
- `{a,b}` : brace expansion list
|
|
55
|
+
- `{1..9}` : brace numeric or alphabetic range
|
|
55
56
|
type: read file as "binary", "text", or "image" data. Default is "binary".
|
|
56
57
|
recursive: search recursively for the given path.
|
|
57
58
|
column: Column name that will contain File objects. Default is "file".
|
|
@@ -88,27 +89,32 @@ def read_storage(
|
|
|
88
89
|
Simple call from s3:
|
|
89
90
|
```python
|
|
90
91
|
import datachain as dc
|
|
91
|
-
|
|
92
|
+
dc.read_storage("s3://my-bucket/my-dir")
|
|
92
93
|
```
|
|
93
94
|
|
|
94
95
|
Match all .json files recursively using glob pattern
|
|
95
96
|
```py
|
|
96
|
-
|
|
97
|
+
dc.read_storage("gs://bucket/meta/**/*.json")
|
|
97
98
|
```
|
|
98
99
|
|
|
99
100
|
Match image file extensions for directories with pattern
|
|
100
101
|
```py
|
|
101
|
-
|
|
102
|
+
dc.read_storage("s3://bucket/202?/**/*.{jpg,jpeg,png}")
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
By ranges in filenames:
|
|
106
|
+
```py
|
|
107
|
+
dc.read_storage("s3://bucket/202{1..4}/**/*.{jpg,jpeg,png}")
|
|
102
108
|
```
|
|
103
109
|
|
|
104
110
|
Multiple URIs:
|
|
105
111
|
```python
|
|
106
|
-
|
|
112
|
+
dc.read_storage(["s3://my-bkt/dir1", "s3://bucket2/dir2/dir3"])
|
|
107
113
|
```
|
|
108
114
|
|
|
109
115
|
With AWS S3-compatible storage:
|
|
110
116
|
```python
|
|
111
|
-
|
|
117
|
+
dc.read_storage(
|
|
112
118
|
"s3://my-bucket/my-dir",
|
|
113
119
|
client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
|
|
114
120
|
)
|