datachain 0.33.0__tar.gz → 0.33.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.33.0 → datachain-0.33.1}/PKG-INFO +2 -2
- {datachain-0.33.0 → datachain-0.33.1}/pyproject.toml +1 -1
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/catalog/catalog.py +58 -22
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/data_storage/job.py +1 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/diff/__init__.py +7 -13
- datachain-0.33.1/src/datachain/hash_utils.py +147 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/datachain.py +9 -1
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/signal_schema.py +7 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/udf.py +20 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/query/dataset.py +107 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/utils.py +6 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain.egg-info/PKG-INFO +2 -2
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain.egg-info/SOURCES.txt +4 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.33.0 → datachain-0.33.1}/tests/conftest.py +20 -2
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_diff.py +41 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_signal_schema.py +24 -0
- datachain-0.33.1/tests/unit/test_datachain_hash.py +173 -0
- datachain-0.33.1/tests/unit/test_hash_utils.py +109 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_query.py +22 -3
- datachain-0.33.1/tests/unit/test_query_steps_hash.py +505 -0
- {datachain-0.33.0 → datachain-0.33.1}/.cruft.json +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.gitattributes +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.github/codecov.yaml +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.github/dependabot.yml +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.github/workflows/release.yml +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.github/workflows/tests.yml +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.gitignore +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/.pre-commit-config.yaml +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/LICENSE +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/README.rst +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/api_hooks.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/assets/webhook_dialog.png +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/assets/webhook_list.png +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/commands/auth/login.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/commands/auth/logout.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/commands/auth/team.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/commands/auth/token.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/commands/index.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/commands/job/cancel.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/commands/job/clusters.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/commands/job/logs.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/commands/job/ls.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/commands/job/run.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/contributing.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/examples.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/guide/db_migrations.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/guide/delta.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/guide/env.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/guide/index.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/guide/namespaces.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/guide/processing.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/guide/remotes.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/guide/retry.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/index.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/overrides/main.html +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/quick-start.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/data-types/file.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/data-types/index.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/data-types/pose.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/data-types/segment.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/datachain.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/func.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/functions/aggregate.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/functions/array.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/functions/conditional.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/functions/numeric.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/functions/path.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/functions/random.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/functions/string.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/functions/window.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/index.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/toolkit.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/torch.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/references/udf.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/studio/api/.gitkeep +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/studio/webhooks.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/templates/main.dot +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/templates/operation.dot +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/templates/responses.def +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/docs/tutorials.md +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/get_started/nested_datamodel.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/mkdocs.yml +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/noxfile.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/setup.cfg +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/__main__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/asyn.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cache.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/checkpoint.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/cli/utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/client/http.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/client/local.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/config.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/dataset.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/delta.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/error.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/fs/reference.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/fs/utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/func/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/func/array.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/func/base.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/func/conditional.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/func/func.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/func/numeric.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/func/path.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/func/random.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/func/string.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/func/window.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/job.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/audio.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/storage_pattern.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/file.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/listing.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/projects.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/video.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/listing.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/model/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/model/bbox.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/model/pose.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/model/segment.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/model/utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/namespace.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/node.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/progress.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/project.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/py.typed +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/query/batch.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/query/params.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/query/schema.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/query/session.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/query/udf.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/query/utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/script_meta.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/semver.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/postgresql_dialect.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/postgresql_types.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/studio.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/data.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/examples/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/data/lena.jpg +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/functions/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/functions/test_array.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/functions/test_path.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/functions/test_random.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/functions/test_string.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/model/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_audio.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_batching.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_client.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_data_storage.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_datachain.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_datasets.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_delta.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_file.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_hf.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_image.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_listing.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_ls.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_metastore.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_mutate.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_pull.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_query.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_read_database.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_retry.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_session.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_storage_pattern.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_studio_datetime_parsing.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_to_database.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_toolkit.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_video.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/func/test_warehouse.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/test_atomicity.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/test_cli_studio.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/test_import_time.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/test_telemetry.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_audio.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_settings.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_storage_pattern.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/model/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_cli_datasets.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_client.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_client_http.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_config.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_func.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_semver.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_session.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.33.0 → datachain-0.33.1}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.33.
|
|
3
|
+
Version: 0.33.1
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -103,7 +103,7 @@ Requires-Dist: scipy; extra == "tests"
|
|
|
103
103
|
Requires-Dist: ultralytics; extra == "tests"
|
|
104
104
|
Provides-Extra: dev
|
|
105
105
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
106
|
-
Requires-Dist: mypy==1.18.
|
|
106
|
+
Requires-Dist: mypy==1.18.2; extra == "dev"
|
|
107
107
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
108
108
|
Requires-Dist: types-dateparser; extra == "dev"
|
|
109
109
|
Requires-Dist: types-pytz; extra == "dev"
|
|
@@ -144,19 +144,26 @@ def shutdown_process(
|
|
|
144
144
|
return proc.wait()
|
|
145
145
|
|
|
146
146
|
|
|
147
|
-
def
|
|
147
|
+
def process_output(stream: IO[bytes], callback: Callable[[str], None]) -> None:
    """
    Feed ``stream`` to ``callback`` one decoded chunk at a time.

    The stream is consumed a single byte at a time; every time a newline or a
    carriage return is read, the bytes collected so far (terminator included)
    are decoded as UTF-8 with replacement of invalid sequences and handed to
    ``callback``. Bytes left over when the stream ends are delivered as a
    final chunk without a terminator.

    The stream is closed on the way out, whether or not reading or the
    callback failed; errors raised while closing are ignored.
    """
    line_terminators = (b"\n", b"\r")
    pending = bytearray()

    def emit() -> None:
        # Deliver what has been collected, then start over for the next line
        callback(pending.decode("utf-8", errors="replace"))
        pending.clear()

    try:
        while byt := stream.read(1):  # Read one byte at a time
            pending += byt
            if byt in line_terminators:
                emit()

        if pending:  # Trailing data that was never terminated
            emit()
    finally:
        try:
            stream.close()  # Ensure output is closed
        except Exception:  # noqa: BLE001, S110
            pass
|
|
160
167
|
|
|
161
168
|
|
|
162
169
|
class DatasetRowsFetcher(NodesThreadPool):
|
|
@@ -1760,13 +1767,13 @@ class Catalog:
|
|
|
1760
1767
|
recursive=recursive,
|
|
1761
1768
|
)
|
|
1762
1769
|
|
|
1770
|
+
@staticmethod
|
|
1763
1771
|
def query(
|
|
1764
|
-
self,
|
|
1765
1772
|
query_script: str,
|
|
1766
1773
|
env: Optional[Mapping[str, str]] = None,
|
|
1767
1774
|
python_executable: str = sys.executable,
|
|
1768
|
-
|
|
1769
|
-
|
|
1775
|
+
stdout_callback: Optional[Callable[[str], None]] = None,
|
|
1776
|
+
stderr_callback: Optional[Callable[[str], None]] = None,
|
|
1770
1777
|
params: Optional[dict[str, str]] = None,
|
|
1771
1778
|
job_id: Optional[str] = None,
|
|
1772
1779
|
interrupt_timeout: Optional[int] = None,
|
|
@@ -1781,13 +1788,18 @@ class Catalog:
|
|
|
1781
1788
|
},
|
|
1782
1789
|
)
|
|
1783
1790
|
popen_kwargs: dict[str, Any] = {}
|
|
1784
|
-
|
|
1785
|
-
|
|
1791
|
+
|
|
1792
|
+
if stdout_callback is not None:
|
|
1793
|
+
popen_kwargs = {"stdout": subprocess.PIPE}
|
|
1794
|
+
if stderr_callback is not None:
|
|
1795
|
+
popen_kwargs["stderr"] = subprocess.PIPE
|
|
1786
1796
|
|
|
1787
1797
|
def raise_termination_signal(sig: int, _: Any) -> NoReturn:
|
|
1788
1798
|
raise TerminationSignal(sig)
|
|
1789
1799
|
|
|
1790
|
-
|
|
1800
|
+
stdout_thread: Optional[Thread] = None
|
|
1801
|
+
stderr_thread: Optional[Thread] = None
|
|
1802
|
+
|
|
1791
1803
|
with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc: # noqa: S603
|
|
1792
1804
|
logger.info("Starting process %s", proc.pid)
|
|
1793
1805
|
|
|
@@ -1801,10 +1813,20 @@ class Catalog:
|
|
|
1801
1813
|
orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
|
|
1802
1814
|
signal.signal(signal.SIGTERM, raise_termination_signal)
|
|
1803
1815
|
try:
|
|
1804
|
-
if
|
|
1805
|
-
|
|
1806
|
-
|
|
1807
|
-
|
|
1816
|
+
if stdout_callback is not None:
|
|
1817
|
+
stdout_thread = Thread(
|
|
1818
|
+
target=process_output,
|
|
1819
|
+
args=(proc.stdout, stdout_callback),
|
|
1820
|
+
daemon=True,
|
|
1821
|
+
)
|
|
1822
|
+
stdout_thread.start()
|
|
1823
|
+
if stderr_callback is not None:
|
|
1824
|
+
stderr_thread = Thread(
|
|
1825
|
+
target=process_output,
|
|
1826
|
+
args=(proc.stderr, stderr_callback),
|
|
1827
|
+
daemon=True,
|
|
1828
|
+
)
|
|
1829
|
+
stderr_thread.start()
|
|
1808
1830
|
|
|
1809
1831
|
proc.wait()
|
|
1810
1832
|
except TerminationSignal as exc:
|
|
@@ -1822,8 +1844,22 @@ class Catalog:
|
|
|
1822
1844
|
finally:
|
|
1823
1845
|
signal.signal(signal.SIGTERM, orig_sigterm_handler)
|
|
1824
1846
|
signal.signal(signal.SIGINT, orig_sigint_handler)
|
|
1825
|
-
|
|
1826
|
-
|
|
1847
|
+
# wait for the reader thread
|
|
1848
|
+
thread_join_timeout_seconds = 30
|
|
1849
|
+
if stdout_thread is not None:
|
|
1850
|
+
stdout_thread.join(timeout=thread_join_timeout_seconds)
|
|
1851
|
+
if stdout_thread.is_alive():
|
|
1852
|
+
logger.warning(
|
|
1853
|
+
"stdout thread is still alive after %s seconds",
|
|
1854
|
+
thread_join_timeout_seconds,
|
|
1855
|
+
)
|
|
1856
|
+
if stderr_thread is not None:
|
|
1857
|
+
stderr_thread.join(timeout=thread_join_timeout_seconds)
|
|
1858
|
+
if stderr_thread.is_alive():
|
|
1859
|
+
logger.warning(
|
|
1860
|
+
"stderr thread is still alive after %s seconds",
|
|
1861
|
+
thread_join_timeout_seconds,
|
|
1862
|
+
)
|
|
1827
1863
|
|
|
1828
1864
|
logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
|
|
1829
1865
|
if proc.returncode in (
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import random
|
|
2
|
-
import string
|
|
3
1
|
from collections.abc import Sequence
|
|
4
2
|
from enum import Enum
|
|
5
3
|
from typing import TYPE_CHECKING, Optional, Union
|
|
@@ -11,16 +9,12 @@ from datachain.query.schema import Column
|
|
|
11
9
|
if TYPE_CHECKING:
|
|
12
10
|
from datachain.lib.dc import DataChain
|
|
13
11
|
|
|
14
|
-
|
|
15
12
|
C = Column
|
|
16
13
|
|
|
17
14
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
random.choice(string.ascii_letters) # noqa: S311
|
|
22
|
-
for _ in range(10)
|
|
23
|
-
)
|
|
15
|
+
STATUS_COL_NAME = "diff_7aeed3aa17ba4d50b8d1c368c76e16a6"
|
|
16
|
+
LEFT_DIFF_COL_NAME = "diff_95f95344064a4b819c8625cd1a5cfc2b"
|
|
17
|
+
RIGHT_DIFF_COL_NAME = "diff_5808838a49b54849aa461d7387376d34"
|
|
24
18
|
|
|
25
19
|
|
|
26
20
|
class CompareStatus(str, Enum):
|
|
@@ -101,9 +95,9 @@ def _compare( # noqa: C901, PLR0912
|
|
|
101
95
|
compare = right_compare = [c for c in cols if c in right_cols and c not in on] # type: ignore[misc]
|
|
102
96
|
|
|
103
97
|
# get diff column names
|
|
104
|
-
diff_col = status_col or
|
|
105
|
-
ldiff_col =
|
|
106
|
-
rdiff_col =
|
|
98
|
+
diff_col = status_col or STATUS_COL_NAME
|
|
99
|
+
ldiff_col = LEFT_DIFF_COL_NAME
|
|
100
|
+
rdiff_col = RIGHT_DIFF_COL_NAME
|
|
107
101
|
|
|
108
102
|
# adding helper diff columns, which will be removed after
|
|
109
103
|
left = left.mutate(**{ldiff_col: 1})
|
|
@@ -227,7 +221,7 @@ def compare_and_split(
|
|
|
227
221
|
)
|
|
228
222
|
```
|
|
229
223
|
"""
|
|
230
|
-
status_col =
|
|
224
|
+
status_col = STATUS_COL_NAME
|
|
231
225
|
|
|
232
226
|
res = _compare(
|
|
233
227
|
left,
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import inspect
|
|
3
|
+
import json
|
|
4
|
+
import textwrap
|
|
5
|
+
from collections.abc import Sequence
|
|
6
|
+
from typing import TypeVar, Union
|
|
7
|
+
|
|
8
|
+
from sqlalchemy.sql.elements import (
|
|
9
|
+
BinaryExpression,
|
|
10
|
+
BindParameter,
|
|
11
|
+
ColumnElement,
|
|
12
|
+
Label,
|
|
13
|
+
Over,
|
|
14
|
+
UnaryExpression,
|
|
15
|
+
)
|
|
16
|
+
from sqlalchemy.sql.functions import Function
|
|
17
|
+
|
|
18
|
+
T = TypeVar("T", bound=ColumnElement)
|
|
19
|
+
ColumnLike = Union[str, T]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def serialize_column_element(expr: Union[str, ColumnElement]) -> dict:  # noqa: PLR0911
    """
    Recursively serialize a SQLAlchemy ColumnElement into a deterministic structure.

    The result is a nested dict tagged with a ``"type"`` key (``binary``,
    ``unary``, ``function``, ``window``, ``label``, ``bind``, ``column`` or
    ``other``). Anything that is not recognised, including a plain string
    without a ``name`` attribute, is stored as ``str(expr)`` under ``other``.
    """

    # Binary operations: col > 5, col1 + col2, etc.
    if isinstance(expr, BinaryExpression):
        return {
            "type": "binary",
            "op": getattr(expr.operator, "__name__", str(expr.operator)),
            "left": serialize_column_element(expr.left),
            "right": serialize_column_element(expr.right),
        }

    # Unary operations: -col, NOT col, col.desc(), etc.
    if isinstance(expr, UnaryExpression):
        serialized = {
            "type": "unary",
            # operator is None for modifier-only expressions such as col.desc()
            "op": getattr(expr.operator, "__name__", str(expr.operator)),
            "element": serialize_column_element(expr.element),  # type: ignore[arg-type]
        }
        # asc()/desc()/nulls_first()/nulls_last() keep their meaning in
        # ``modifier``; without it they would all serialize identically.
        # The key is only added when present so other unary expressions keep
        # the same serialized form as before.
        modifier = getattr(expr, "modifier", None)
        if modifier is not None:
            serialized["modifier"] = getattr(modifier, "__name__", str(modifier))
        return serialized

    # Function calls: func.lower(col), func.count(col), etc.
    if isinstance(expr, Function):
        return {
            "type": "function",
            "name": expr.name,
            "clauses": [serialize_column_element(c) for c in expr.clauses],
        }

    # Window functions: func.row_number().over(partition_by=..., order_by=...)
    if isinstance(expr, Over):
        # Both attributes exist but are None when the clause was not given,
        # so a getattr default alone does not protect the iteration below.
        partition_by = getattr(expr, "partition_by", None)
        order_by = getattr(expr, "order_by", None)
        return {
            "type": "window",
            "function": serialize_column_element(expr.element),
            "partition_by": [
                serialize_column_element(p)
                for p in (partition_by if partition_by is not None else ())
            ],
            "order_by": [
                serialize_column_element(o)
                for o in (order_by if order_by is not None else ())
            ],
        }

    # Labeled expressions: col.label("alias")
    if isinstance(expr, Label):
        return {
            "type": "label",
            "name": expr.name,
            "element": serialize_column_element(expr.element),
        }

    # Bound values (constants)
    if isinstance(expr, BindParameter):
        return {"type": "bind", "value": expr.value}

    # Plain columns
    if hasattr(expr, "name"):
        return {"type": "column", "name": expr.name}

    # Fallback: stringify unknown nodes
    return {"type": "other", "repr": str(expr)}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def hash_column_elements(columns: Sequence[ColumnLike]) -> str:
    """
    Return a SHA-256 hex digest for an ordered collection of column expressions.

    Each element is serialized with ``serialize_column_element`` and the
    resulting list is dumped as JSON with sorted keys, so the digest does not
    depend on a SQL dialect. The position of each element matters, which is
    why only ordered iterables (list, tuple) should be passed.
    """
    payload = json.dumps(
        [serialize_column_element(column) for column in columns],
        sort_keys=True,  # stable JSON
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _code_fingerprint(code) -> bytes:
    """Digest a code object from its opcodes plus the names and constants they use."""
    parts = [code.co_code, repr(code.co_names).encode()]
    for const in code.co_consts:
        if inspect.iscode(const):
            # Nested lambdas/comprehensions: repr() would embed a memory address
            parts.append(_code_fingerprint(const))
        elif isinstance(const, frozenset):
            # Set iteration order is not guaranteed to be stable between runs
            parts.append(repr(sorted(const, key=repr)).encode())
        else:
            parts.append(repr(const).encode())

    digest = hashlib.sha256()
    for part in parts:
        # Length prefix keeps adjacent parts from running into each other
        digest.update(len(part).to_bytes(8, "big"))
        digest.update(part)
    return digest.digest()


def hash_callable(func):
    """
    Calculate a hash from a callable.
    Rules:
    - Named functions (def) → use source code for stable, cross-version hashing
    - Lambdas → use bytecode (deterministic in same Python runtime) together
      with the names and constants the bytecode refers to; raw opcodes only
      hold indexes, so ``lambda x: x + "a"`` and ``lambda x: x + "b"`` would
      otherwise hash the same
    - Callable instances → hash the ``__call__`` implementation of their class
      (instance state is not part of the hash)

    Returns the SHA256 hex digest as a string.

    Raises:
        TypeError: if ``func`` is not callable, or no Python code object can
            be found for it (e.g. builtins implemented in C).
    """
    if not callable(func):
        raise TypeError("Expected a callable")

    if not hasattr(func, "__code__"):
        # Not a function or method: fall back to the class-level __call__
        call_impl = getattr(type(func), "__call__", None)
        if not hasattr(call_impl, "__code__"):
            raise TypeError(
                f"Cannot hash callable of type {type(func).__name__!r}: "
                "no Python code object available"
            )
        func = call_impl

    payload = None
    if func.__name__ != "<lambda>":
        # Try to get exact source of named function
        try:
            lines, _ = inspect.getsourcelines(func)
            payload = textwrap.dedent("".join(lines)).strip()
        except (OSError, TypeError):
            payload = None

    if payload is None:
        # Lambdas, and named functions whose source is not available
        payload = _code_fingerprint(func.__code__)

    # Normalize annotations
    annotations = {
        k: getattr(v, "__name__", str(v)) for k, v in func.__annotations__.items()
    }

    # Extras to distinguish functions with same code but different metadata
    extras = {
        "name": func.__name__,
        "defaults": func.__defaults__,
        "annotations": annotations,
    }

    # Compute SHA256
    h = hashlib.sha256()
    h.update(payload.encode() if isinstance(payload, str) else payload)
    h.update(str(extras).encode())
    return h.hexdigest()
|
|
@@ -209,6 +209,14 @@ class DataChain:
|
|
|
209
209
|
self.print_schema(file=file)
|
|
210
210
|
return file.getvalue()
|
|
211
211
|
|
|
212
|
+
def hash(self) -> str:
    """
    Return the SHA hash of this chain.

    The value comes from the underlying query. It is fast to compute and
    consistent, and reflects every step added to the chain together with
    the inputs of each step. Steps are order-sensitive.
    """
    query_hash = self._query.hash()
    return query_hash
|
|
219
|
+
|
|
212
220
|
def _as_delta(
|
|
213
221
|
self,
|
|
214
222
|
on: Optional[Union[str, Sequence[str]]] = None,
|
|
@@ -682,7 +690,7 @@ class DataChain:
|
|
|
682
690
|
|
|
683
691
|
if job_id := os.getenv("DATACHAIN_JOB_ID"):
|
|
684
692
|
catalog.metastore.create_checkpoint(
|
|
685
|
-
job_id,
|
|
693
|
+
job_id,
|
|
686
694
|
_hash=hashlib.sha256( # TODO this will be replaced with self.hash()
|
|
687
695
|
str(uuid4()).encode()
|
|
688
696
|
).hexdigest(),
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import copy
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
2
4
|
import warnings
|
|
3
5
|
from collections.abc import Iterator, Sequence
|
|
4
6
|
from dataclasses import dataclass
|
|
@@ -257,6 +259,11 @@ class SignalSchema:
|
|
|
257
259
|
signals["_custom_types"] = custom_types
|
|
258
260
|
return signals
|
|
259
261
|
|
|
262
|
+
def hash(self) -> str:
    """Return the SHA-256 hex digest of the serialized schema.

    The schema is serialized, dumped as compact JSON with sorted keys, and
    the UTF-8 bytes of that JSON are hashed.
    """
    canonical = json.dumps(self.serialize(), separators=(",", ":"), sort_keys=True)
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()
|
|
266
|
+
|
|
260
267
|
@staticmethod
|
|
261
268
|
def _split_subtypes(type_name: str) -> list[str]:
|
|
262
269
|
"""This splits a list of subtypes, including proper square bracket handling."""
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import hashlib
|
|
1
2
|
import sys
|
|
2
3
|
import traceback
|
|
3
4
|
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
|
|
@@ -12,6 +13,7 @@ from pydantic import BaseModel
|
|
|
12
13
|
from datachain.asyn import AsyncMapper
|
|
13
14
|
from datachain.cache import temporary_cache
|
|
14
15
|
from datachain.dataset import RowDict
|
|
16
|
+
from datachain.hash_utils import hash_callable
|
|
15
17
|
from datachain.lib.convert.flatten import flatten
|
|
16
18
|
from datachain.lib.file import DataModel, File
|
|
17
19
|
from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
|
|
@@ -61,6 +63,9 @@ class UDFAdapter:
|
|
|
61
63
|
batch_size: Optional[int] = None
|
|
62
64
|
batch: int = 1
|
|
63
65
|
|
|
66
|
+
def hash(self) -> str:
    """Return the hash of the wrapped UDF (``self.inner``)."""
    inner_hash = self.inner.hash()
    return inner_hash
|
|
68
|
+
|
|
64
69
|
def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
|
|
65
70
|
if use_partitioning:
|
|
66
71
|
return Partition()
|
|
@@ -151,6 +156,21 @@ class UDFBase(AbstractUDF):
|
|
|
151
156
|
self.output = None
|
|
152
157
|
self._func = None
|
|
153
158
|
|
|
159
|
+
def hash(self) -> str:
    """
    Creates SHA hash of this UDF function. It takes into account function,
    inputs and outputs.

    ``_func`` and ``output`` are both initialised to None, so neither is
    assumed to be set: when ``_func`` is unset the ``process`` method is
    hashed instead, and a missing ``params`` or ``output`` schema contributes
    nothing to the digest.
    """
    # NOTE(review): falling back to ``process`` assumes that is where the
    # logic lives when no function was attached; confirm against how UDFs
    # are constructed.
    func = self._func or self.process

    parts = [
        hash_callable(func),
        self.params.hash() if self.params else "",
        self.output.hash() if self.output is not None else "",
    ]

    # Each part is a hex digest (or empty); hash their raw bytes together
    return hashlib.sha256(
        b"".join([bytes.fromhex(part) for part in parts])
    ).hexdigest()
|
|
173
|
+
|
|
154
174
|
def process(self, *args, **kwargs):
|
|
155
175
|
"""Processing function that needs to be defined by user"""
|
|
156
176
|
if not self._func:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import contextlib
|
|
2
|
+
import hashlib
|
|
2
3
|
import inspect
|
|
3
4
|
import logging
|
|
4
5
|
import os
|
|
@@ -44,6 +45,7 @@ from datachain.data_storage.schema import (
|
|
|
44
45
|
from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
|
|
45
46
|
from datachain.error import DatasetNotFoundError, QueryScriptCancelError
|
|
46
47
|
from datachain.func.base import Function
|
|
48
|
+
from datachain.hash_utils import hash_column_elements
|
|
47
49
|
from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
|
|
48
50
|
from datachain.lib.signal_schema import SignalSchema
|
|
49
51
|
from datachain.lib.udf import UDFAdapter, _get_cache
|
|
@@ -57,6 +59,7 @@ from datachain.sql.types import SQLType
|
|
|
57
59
|
from datachain.utils import (
|
|
58
60
|
determine_processes,
|
|
59
61
|
determine_workers,
|
|
62
|
+
ensure_sequence,
|
|
60
63
|
filtered_cloudpickle_dumps,
|
|
61
64
|
get_datachain_executable,
|
|
62
65
|
safe_closing,
|
|
@@ -167,6 +170,18 @@ class Step(ABC):
|
|
|
167
170
|
) -> "StepResult":
|
|
168
171
|
"""Apply the processing step."""
|
|
169
172
|
|
|
173
|
+
@abstractmethod
def hash_inputs(self) -> str:
    """Return a hash of the inputs that configure this step."""
|
|
176
|
+
|
|
177
|
+
def hash(self) -> str:
    """
    Calculates hash for step which includes step name and hash of its inputs
    """
    step_name = self.__class__.__name__
    fingerprint = f"{step_name}|{self.hash_inputs()}"
    return hashlib.sha256(fingerprint.encode()).hexdigest()
|
|
184
|
+
|
|
170
185
|
|
|
171
186
|
@frozen
|
|
172
187
|
class QueryStep:
|
|
@@ -186,6 +201,11 @@ class QueryStep:
|
|
|
186
201
|
q, dr.columns, dependencies=[(self.dataset, self.dataset_version)]
|
|
187
202
|
)
|
|
188
203
|
|
|
204
|
+
def hash(self) -> str:
    """Return the SHA-256 hex digest of the dataset URI for the pinned version."""
    dataset_uri = self.dataset.uri(self.dataset_version)
    return hashlib.sha256(dataset_uri.encode()).hexdigest()
|
|
208
|
+
|
|
189
209
|
|
|
190
210
|
def generator_then_call(generator, func: Callable):
|
|
191
211
|
"""
|
|
@@ -256,6 +276,13 @@ class DatasetDiffOperation(Step):
|
|
|
256
276
|
class Subtract(DatasetDiffOperation):
|
|
257
277
|
on: Sequence[tuple[str, str]]
|
|
258
278
|
|
|
279
|
+
def hash_inputs(self) -> str:
    """Hash the subtracted query together with the ``on`` column pairs.

    Pairs are sorted before hashing, so the order in which they were given
    does not affect the result.
    """
    ordered_pairs = sorted(self.on, key=lambda pair: (pair[0], pair[1]))
    on_bytes = "".join(f"{left}:{right}" for left, right in ordered_pairs).encode()

    digest = hashlib.sha256(bytes.fromhex(self.dq.hash()))
    digest.update(on_bytes)
    return digest.hexdigest()
|
|
285
|
+
|
|
259
286
|
def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
|
|
260
287
|
sq = source_query.alias("source_query")
|
|
261
288
|
tq = target_query.alias("target_query")
|
|
@@ -393,6 +420,16 @@ class UDFStep(Step, ABC):
|
|
|
393
420
|
min_task_size: Optional[int] = None
|
|
394
421
|
batch_size: Optional[int] = None
|
|
395
422
|
|
|
423
|
+
def hash_inputs(self) -> str:
    """Hash the UDF, its partitioning columns and the generator flag."""
    # Compare against None instead of truth-testing: SQLAlchemy expressions
    # define their own truthiness (most raise TypeError when used in a
    # boolean context), so ``partition_by or []`` is unsafe when a single
    # expression is passed instead of a sequence.
    partition_by = ensure_sequence(
        self.partition_by if self.partition_by is not None else []
    )
    parts = [
        bytes.fromhex(self.udf.hash()),
        bytes.fromhex(hash_column_elements(partition_by)),
        str(self.is_generator).encode(),
    ]

    return hashlib.sha256(b"".join(parts)).hexdigest()
|
|
432
|
+
|
|
396
433
|
@abstractmethod
|
|
397
434
|
def create_udf_table(self, query: Select) -> "Table":
|
|
398
435
|
"""Method that creates a table where temp udf results will be saved"""
|
|
@@ -790,6 +827,9 @@ class SQLClause(Step, ABC):
|
|
|
790
827
|
class SQLSelect(SQLClause):
|
|
791
828
|
args: tuple[Union[Function, ColumnElement], ...]
|
|
792
829
|
|
|
830
|
+
def hash_inputs(self) -> str:
    """Hash the selected column expressions, in the order they were given."""
    selected = self.args
    return hash_column_elements(selected)
|
|
832
|
+
|
|
793
833
|
def apply_sql_clause(self, query) -> Select:
|
|
794
834
|
subquery = query.subquery()
|
|
795
835
|
args = [
|
|
@@ -806,6 +846,9 @@ class SQLSelect(SQLClause):
|
|
|
806
846
|
class SQLSelectExcept(SQLClause):
|
|
807
847
|
args: tuple[Union[Function, ColumnElement], ...]
|
|
808
848
|
|
|
849
|
+
def hash_inputs(self) -> str:
    """Hash the excluded column expressions, in the order they were given."""
    excluded = self.args
    return hash_column_elements(excluded)
|
|
851
|
+
|
|
809
852
|
def apply_sql_clause(self, query: Select) -> Select:
|
|
810
853
|
subquery = query.subquery()
|
|
811
854
|
args = [c for c in subquery.c if c.name not in set(self.parse_cols(self.args))]
|
|
@@ -817,6 +860,9 @@ class SQLMutate(SQLClause):
|
|
|
817
860
|
args: tuple[Label, ...]
|
|
818
861
|
new_schema: SignalSchema
|
|
819
862
|
|
|
863
|
+
def hash_inputs(self) -> str:
    """Hash the labelled expressions in ``args``; ``new_schema`` is not included."""
    labelled = self.args
    return hash_column_elements(labelled)
|
|
865
|
+
|
|
820
866
|
def apply_sql_clause(self, query: Select) -> Select:
|
|
821
867
|
original_subquery = query.subquery()
|
|
822
868
|
to_mutate = {c.name for c in self.args}
|
|
@@ -846,6 +892,9 @@ class SQLMutate(SQLClause):
|
|
|
846
892
|
class SQLFilter(SQLClause):
|
|
847
893
|
expressions: tuple[Union[Function, ColumnElement], ...]
|
|
848
894
|
|
|
895
|
+
def hash_inputs(self) -> str:
    """Hash the filter expressions, in the order they were given."""
    conditions = self.expressions
    return hash_column_elements(conditions)
|
|
897
|
+
|
|
849
898
|
def __and__(self, other):
|
|
850
899
|
expressions = self.parse_cols(self.expressions)
|
|
851
900
|
return self.__class__(expressions + other)
|
|
@@ -859,6 +908,9 @@ class SQLFilter(SQLClause):
|
|
|
859
908
|
class SQLOrderBy(SQLClause):
|
|
860
909
|
args: tuple[Union[Function, ColumnElement], ...]
|
|
861
910
|
|
|
911
|
+
def hash_inputs(self) -> str:
    """Hash the ordering expressions, in the order they were given."""
    ordering = self.args
    return hash_column_elements(ordering)
|
|
913
|
+
|
|
862
914
|
def apply_sql_clause(self, query: Select) -> Select:
|
|
863
915
|
args = self.parse_cols(self.args)
|
|
864
916
|
return query.order_by(*args)
|
|
@@ -868,6 +920,9 @@ class SQLOrderBy(SQLClause):
|
|
|
868
920
|
class SQLLimit(SQLClause):
|
|
869
921
|
n: int
|
|
870
922
|
|
|
923
|
+
def hash_inputs(self) -> str:
    """Return the SHA-256 hex digest of the limit value rendered as text."""
    limit_text = str(self.n)
    return hashlib.sha256(limit_text.encode()).hexdigest()
|
|
925
|
+
|
|
871
926
|
def apply_sql_clause(self, query: Select) -> Select:
|
|
872
927
|
return query.limit(self.n)
|
|
873
928
|
|
|
@@ -876,12 +931,18 @@ class SQLLimit(SQLClause):
|
|
|
876
931
|
class SQLOffset(SQLClause):
|
|
877
932
|
offset: int
|
|
878
933
|
|
|
934
|
+
def hash_inputs(self) -> str:
    """Return the SHA-256 hex digest of the offset value rendered as text."""
    offset_text = str(self.offset)
    return hashlib.sha256(offset_text.encode()).hexdigest()
|
|
936
|
+
|
|
879
937
|
def apply_sql_clause(self, query: "GenerativeSelect"):
|
|
880
938
|
return query.offset(self.offset)
|
|
881
939
|
|
|
882
940
|
|
|
883
941
|
@frozen
|
|
884
942
|
class SQLCount(SQLClause):
|
|
943
|
+
def hash_inputs(self) -> str:
    """Return an empty string: this step has no inputs to hash."""
    no_inputs = ""
    return no_inputs
|
|
945
|
+
|
|
885
946
|
def apply_sql_clause(self, query):
|
|
886
947
|
return sqlalchemy.select(f.count(1)).select_from(query.subquery())
|
|
887
948
|
|
|
@@ -891,6 +952,9 @@ class SQLDistinct(SQLClause):
|
|
|
891
952
|
args: tuple[ColumnElement, ...]
|
|
892
953
|
dialect: str
|
|
893
954
|
|
|
955
|
+
def hash_inputs(self) -> str:
    """Hash the distinct-on column expressions; ``dialect`` is not included."""
    distinct_on = self.args
    return hash_column_elements(distinct_on)
|
|
957
|
+
|
|
894
958
|
def apply_sql_clause(self, query):
|
|
895
959
|
if self.dialect == "sqlite":
|
|
896
960
|
return query.group_by(*self.args)
|
|
@@ -903,6 +967,11 @@ class SQLUnion(Step):
|
|
|
903
967
|
query1: "DatasetQuery"
|
|
904
968
|
query2: "DatasetQuery"
|
|
905
969
|
|
|
970
|
+
def hash_inputs(self) -> str:
    """Hash both sides of the union, ``query1`` first and ``query2`` second."""
    digest = hashlib.sha256(bytes.fromhex(self.query1.hash()))
    digest.update(bytes.fromhex(self.query2.hash()))
    return digest.hexdigest()
|
|
974
|
+
|
|
906
975
|
def apply(
|
|
907
976
|
self, query_generator: QueryGenerator, temp_tables: list[str]
|
|
908
977
|
) -> StepResult:
|
|
@@ -939,6 +1008,20 @@ class SQLJoin(Step):
|
|
|
939
1008
|
full: bool
|
|
940
1009
|
rname: str
|
|
941
1010
|
|
|
1011
|
+
def hash_inputs(self) -> str:
    """
    Hash both joined queries, the join predicates, the ``inner`` / ``full``
    flags and the right-side name prefix.
    """
    # Compare against None instead of truth-testing: SQLAlchemy expressions
    # define their own truthiness. A single ``a == b`` predicate can evaluate
    # to False, which would silently drop it from the hash, and other
    # expressions raise TypeError in a boolean context.
    predicates = ensure_sequence(
        self.predicates if self.predicates is not None else []
    )

    parts = [
        bytes.fromhex(self.query1.hash()),
        bytes.fromhex(self.query2.hash()),
        bytes.fromhex(hash_column_elements(predicates)),
        str(self.inner).encode(),
        str(self.full).encode(),
        self.rname.encode("utf-8"),
    ]

    return hashlib.sha256(b"".join(parts)).hexdigest()
|
|
1024
|
+
|
|
942
1025
|
def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
|
|
943
1026
|
query = dq.apply_steps().select()
|
|
944
1027
|
temp_tables.extend(dq.temp_table_names)
|
|
@@ -1060,6 +1143,13 @@ class SQLGroupBy(SQLClause):
|
|
|
1060
1143
|
cols: Sequence[Union[str, Function, ColumnElement]]
|
|
1061
1144
|
group_by: Sequence[Union[str, Function, ColumnElement]]
|
|
1062
1145
|
|
|
1146
|
+
def hash_inputs(self) -> str:
    """Hash the selected columns followed by the group-by columns."""
    cols_digest = hash_column_elements(self.cols)
    group_by_digest = hash_column_elements(self.group_by)
    combined = bytes.fromhex(cols_digest + group_by_digest)
    return hashlib.sha256(combined).hexdigest()
|
|
1152
|
+
|
|
1063
1153
|
def apply_sql_clause(self, query) -> Select:
|
|
1064
1154
|
if not self.cols:
|
|
1065
1155
|
raise ValueError("No columns to select")
|
|
@@ -1213,6 +1303,23 @@ class DatasetQuery:
|
|
|
1213
1303
|
def __or__(self, other):
|
|
1214
1304
|
return self.union(other)
|
|
1215
1305
|
|
|
1306
|
+
def hash(self) -> str:
    """
    Calculates hash of this class taking into account hash of starting step
    and hashes of each following step. Ordering is important.

    When there is no starting step, the listing dataset name is used as the
    starting point instead.
    """
    if self.starting_step:
        seed = self.starting_step.hash()
    else:
        assert self.list_ds_name
        seed = self.list_ds_name

    hasher = hashlib.sha256(seed.encode("utf-8"))
    for step in self.steps:
        hasher.update(step.hash().encode("utf-8"))

    return hasher.hexdigest()
|
|
1322
|
+
|
|
1216
1323
|
@staticmethod
|
|
1217
1324
|
def get_table() -> "TableClause":
|
|
1218
1325
|
table_name = "".join(
|