datachain 0.18.1__tar.gz → 0.18.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.18.1 → datachain-0.18.3}/.pre-commit-config.yaml +1 -1
- {datachain-0.18.1/src/datachain.egg-info → datachain-0.18.3}/PKG-INFO +1 -1
- datachain-0.18.3/examples/incremental_processing/delta.py +64 -0
- datachain-0.18.3/examples/incremental_processing/utils.py +41 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/func/array.py +120 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/func/func.py +14 -12
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/file.py +1 -1
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/query/dataset.py +18 -4
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/query/session.py +8 -2
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/functions/array.py +22 -1
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/sqlite/base.py +33 -0
- {datachain-0.18.1 → datachain-0.18.3/src/datachain.egg-info}/PKG-INFO +1 -1
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain.egg-info/SOURCES.txt +2 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/examples/test_examples.py +11 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_datachain.py +19 -0
- datachain-0.18.3/tests/func/test_func.py +223 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/sql/test_array.py +42 -0
- datachain-0.18.1/tests/func/test_func.py +0 -124
- {datachain-0.18.1 → datachain-0.18.3}/.cruft.json +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.gitattributes +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.github/codecov.yaml +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.github/dependabot.yml +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.github/workflows/release.yml +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.github/workflows/tests.yml +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/.gitignore +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/LICENSE +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/README.rst +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/assets/datachain.svg +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/commands/auth/login.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/commands/auth/logout.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/commands/auth/team.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/commands/auth/token.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/commands/index.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/commands/job/cancel.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/commands/job/logs.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/commands/job/ls.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/commands/job/run.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/contributing.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/examples.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/index.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/overrides/main.html +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/quick-start.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/data-types/file.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/data-types/index.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/data-types/pose.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/data-types/segment.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/datachain.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/func.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/index.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/remotes.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/toolkit.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/torch.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/references/udf.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/docs/tutorials.md +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/multimodal/wds.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/mkdocs.yml +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/noxfile.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/pyproject.toml +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/setup.cfg +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/__main__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/asyn.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cache.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/cli/utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/client/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/client/azure.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/client/gcs.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/client/hf.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/client/local.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/client/s3.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/config.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/dataset.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/delta.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/error.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/fs/reference.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/fs/utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/func/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/func/base.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/func/conditional.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/func/numeric.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/func/path.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/func/random.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/func/string.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/func/window.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/job.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/clip.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/datachain.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/hf.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/image.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/listing.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/settings.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/tar.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/text.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/udf.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/video.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/listing.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/model/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/model/bbox.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/model/pose.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/model/segment.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/model/utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/node.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/progress.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/py.typed +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/query/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/query/batch.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/query/metrics.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/query/params.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/query/queue.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/query/schema.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/query/udf.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/query/utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/remote/studio.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/script_meta.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/semver.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/types.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/sql/utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/studio.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/telemetry.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain/utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/conftest.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/data.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/examples/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/examples/wds_data.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/data/lena.jpg +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/model/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_batching.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_catalog.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_client.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_data_storage.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_datasets.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_delta.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_file.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_hf.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_image.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_listing.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_ls.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_metrics.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_pull.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_pytorch.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_query.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_read_database.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_session.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_toolkit.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_video.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/func/test_warehouse.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/scripts/feature_class.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/test_atomicity.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/test_cli_e2e.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/test_cli_studio.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/test_import_time.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/test_query_e2e.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/test_telemetry.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/model/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_asyn.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_cache.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_catalog.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_client.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_config.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_dataset.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_func.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_listing.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_metastore.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_query.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_query_params.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_semver.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_serializer.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_session.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_utils.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.18.1 → datachain-0.18.3}/tests/utils.py +0 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
File Generator Script using DataChain Delta
|
|
4
|
+
|
|
5
|
+
This script demonstrates:
|
|
6
|
+
1. Creating numbered text files in a 'test' directory
|
|
7
|
+
2. Using DataChain's delta flag for incremental dataset processing
|
|
8
|
+
|
|
9
|
+
Each execution:
|
|
10
|
+
- Creates a new numbered file in the 'test' directory
|
|
11
|
+
- Updates a DataChain dataset to track these files incrementally
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
import time
|
|
16
|
+
|
|
17
|
+
from utils import generate_next_file
|
|
18
|
+
|
|
19
|
+
import datachain as dc
|
|
20
|
+
from datachain import C, File
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def extract_file_number(file: File) -> int:
    """
    Extract the sequence number from a ``file-<N>.txt`` file name.

    Args:
        file: File whose ``name`` attribute is inspected.

    Returns:
        int: ``N`` when the name is exactly ``file-<N>.txt``, otherwise ``-1``.
    """
    # fullmatch (rather than search) so names that merely contain the pattern,
    # e.g. "profile-7.txt.bak", are not mistaken for numbered files.
    match = re.fullmatch(r"file-(\d+)\.txt", file.name)
    if match is None:
        return -1
    return int(match.group(1))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def process_files_with_delta():
    """
    Run the delta-mode DataChain pipeline over the 'test' directory.

    The storage listing is read with ``delta=True`` (keyed on ``file.path``),
    restricted to ``*.txt`` files, enriched with the file number, the file
    content and a processing timestamp, and saved as the "test_files" dataset.
    This demonstrates incremental processing - only new files are processed.

    Afterwards a short report is printed: the total record count, the versions
    of the "test_files" dataset, and the three records with the highest
    file number.
    """
    dataset_name = "test_files"

    # Stage 1: list the storage in delta mode and keep only text files.
    listing = dc.read_storage("test/", update=True, delta=True, delta_on="file.path")
    text_files = listing.filter(C("file.path").glob("*.txt"))

    # Stage 2: add the derived columns.
    numbered = text_files.map(file_number=extract_file_number)
    with_content = numbered.map(content=lambda file: file.read_text())
    stamped = with_content.map(
        processed_at=lambda: time.strftime("%Y-%m-%d %H:%M:%S")
    )

    # Stage 3: persist the result as a dataset.
    chain = stamped.save(name=dataset_name)

    # Show information about the dataset
    total = chain.count()
    print(f"\nProcessed files. Total records: {total}")
    print("\nDataset versions:")
    test_dataset = dc.datasets().filter(C("name") == dataset_name)
    for version in test_dataset.collect("version"):
        print(f"- Version: {version}")

    # Show the last 3 records to demonstrate the incremental processing
    print("\nLatest files processed:")
    latest = chain.order_by("file_number", descending=True).limit(3)
    latest.show()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
if __name__ == "__main__":
    # Add one more numbered file so this run has something new to pick up.
    created_path = generate_next_file()
    print(f"Created new file: {created_path}")

    # Process the files with a delta update.
    process_files_with_delta()
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
File Generator Helper
|
|
4
|
+
|
|
5
|
+
This helper creates numbered text files in a 'test' directory each time it runs.
|
|
6
|
+
The files follow the naming pattern: file-0.txt, file-1.txt, file-2.txt, etc.
|
|
7
|
+
|
|
8
|
+
Each execution, the script:
|
|
9
|
+
|
|
10
|
+
1. Creates the 'test' directory if it doesn't exist
|
|
11
|
+
2. Finds the highest numbered file currently present
|
|
12
|
+
3. Creates a new file with the next number in sequence
|
|
13
|
+
4. Adds timestamped content to the file
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
import time
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def generate_next_file(directory: "str | Path" = "test") -> Path:
    """
    Create the next numbered text file in ``directory`` and return its path.

    Files are named ``file-<N>.txt``. The new file gets the number that follows
    the highest ``N`` currently present (``0`` when there is none) and contains
    its number plus a creation timestamp.

    Args:
        directory: Directory to create the file in. It is created (including
            missing parents) when it does not exist. Defaults to ``"test"``.

    Returns:
        Path: Path of the newly created file.
    """
    test_dir = Path(directory)
    test_dir.mkdir(parents=True, exist_ok=True)

    # Match the whole file name: the glob below also accepts names such as
    # "file-1-copy-file-9.txt", which must not contribute a number.
    name_pattern = re.compile(r"file-(\d+)\.txt")
    numbers = [
        int(match.group(1))
        for path in test_dir.glob("file-*.txt")
        if path.is_file() and (match := name_pattern.fullmatch(path.name))
    ]

    next_num = max(numbers, default=-1) + 1
    new_file_path = test_dir / f"file-{next_num}.txt"
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    content = f"This is file number {next_num}\nCreated at: {timestamp}\n"
    new_file_path.write_text(content, encoding="utf-8")

    return new_file_path
|
|
@@ -178,6 +178,126 @@ def contains(arr: Union[str, Sequence, Func], elem: Any) -> Func:
|
|
|
178
178
|
return Func("contains", inner=inner, cols=cols, args=args, result_type=int)
|
|
179
179
|
|
|
180
180
|
|
|
181
|
+
def slice(
    arr: Union[str, Sequence, Func],
    offset: int,
    length: Optional[int] = None,
) -> Func:
    """
    Returns a slice of the array.

    Args:
        arr (str | Sequence | Func): Array to take the slice from.
            If a string is provided, it is assumed to be the name of the array column.
            If a sequence is provided, it is assumed to be an array of values.
            If a Func is provided, it is assumed to be a function returning an array.
        offset (int): Offset to start the slice from.
        length (int, optional): Length of the slice. If not provided, the slice will
            continue to the end of the array.

    Returns:
        Func: A Func object that represents the slice function. Result of the
            function will be a slice of the array starting from the offset
            and with the given length.

    Example:
        ```py
        dc.mutate(
            slice1=func.array.slice("signal.values", 3),
            slice2=func.array.slice([1, 2, 3, 4, 5], 1, 3),
        )
        ```
    """

    def inner(arg):
        # Only pass `length` through when the caller supplied one.
        if length is not None:
            return array.slice(arg, offset, length)
        return array.slice(arg, offset)

    def element_type(el):
        # Derive the (possibly nested) list type from the first element.
        if isinstance(el, list):
            try:
                return list[element_type(el[0])]
            except IndexError:
                # if the array is empty, return list[str] as default type
                return list[str]
        return type(el)

    def type_from_args(arr, *_):
        # A literal list determines the result type; anything else yields None.
        return element_type(arr) if isinstance(arr, list) else None

    if isinstance(arr, (str, Func)):
        cols = [arr]
        args = None
    else:
        cols = None
        args = [arr]

    return Func(
        "slice",
        inner=inner,
        cols=cols,
        args=args,
        from_array=True,
        is_array=True,
        type_from_args=type_from_args,
    )
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def join(
    arr: Union[str, Sequence, Func],
    sep: str = "",
) -> Func:
    """
    Returns a string that is the concatenation of the elements of the array,
    separated by the given separator.

    Args:
        arr (str | Sequence | Func): Array whose elements are joined.
            If a string is provided, it is assumed to be the name of the array column.
            If a sequence is provided, it is assumed to be an array of values.
            If a Func is provided, it is assumed to be a function returning an array.
        sep (str): Separator to use for the concatenation. Default is an empty string.

    Returns:
        Func: A Func object that represents the join function. Result of the
            function will be a string that is the concatenation of the elements
            of the array, separated by the given separator.

    Example:
        ```py
        dc.mutate(
            joined1=func.array.join("signal.values", ":"),
            joined2=func.array.join(["1", "2", "3", "4", "5"], "/"),
        )
        ```
    """

    def inner(arg):
        return array.join(arg, sep)

    # Column names and Funcs are passed as columns; literal sequences as args.
    if isinstance(arr, (str, Func)):
        cols = [arr]
        args = None
    else:
        cols = None
        args = [arr]

    return Func(
        "join",
        inner=inner,
        cols=cols,
        args=args,
        from_array=True,
        result_type=str,
    )
|
|
299
|
+
|
|
300
|
+
|
|
181
301
|
def get_element(arg: Union[str, Sequence, Func], index: int) -> Func:
|
|
182
302
|
"""
|
|
183
303
|
Returns the element at the given index from the array.
|
|
@@ -108,18 +108,20 @@ class Func(Function):
|
|
|
108
108
|
)
|
|
109
109
|
|
|
110
110
|
if self.from_array:
|
|
111
|
-
if get_origin(col_type) is list:
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
111
|
+
if get_origin(col_type) is not list:
|
|
112
|
+
raise DataChainColumnError(
|
|
113
|
+
str(self),
|
|
114
|
+
"Array column must be of type list",
|
|
115
|
+
)
|
|
116
|
+
if self.is_array:
|
|
117
|
+
return col_type
|
|
118
|
+
col_args = get_args(col_type)
|
|
119
|
+
if len(col_args) != 1:
|
|
120
|
+
raise DataChainColumnError(
|
|
121
|
+
str(self),
|
|
122
|
+
"Array column must have a single type argument",
|
|
123
|
+
)
|
|
124
|
+
return col_args[0]
|
|
123
125
|
|
|
124
126
|
return list[col_type] if self.is_array else col_type # type: ignore[valid-type]
|
|
125
127
|
|
|
@@ -237,7 +237,7 @@ class File(DataModel):
|
|
|
237
237
|
@field_validator("path", mode="before")
@classmethod
def validate_path(cls, path):
    """Normalize `path` to POSIX form; a falsy value becomes an empty string."""
    if not path:
        return ""
    return Path(path).as_posix()
|
|
241
241
|
|
|
242
242
|
def model_dump_custom(self):
|
|
243
243
|
res = self.model_dump()
|
|
@@ -1675,13 +1675,27 @@ class DatasetQuery:
|
|
|
1675
1675
|
return query
|
|
1676
1676
|
|
|
1677
1677
|
def _add_dependencies(self, dataset: "DatasetRecord", version: str):
    """
    Record the dependencies of this query for `dataset` at `version`.

    Temporary session datasets are not recorded themselves; their own direct
    dependencies are recorded in their place.
    """
    # (name, version) pairs; a set so each dependency is registered once.
    dependencies: set[tuple[str, str]] = set()
    for dep_name, dep_version in self.dependencies:
        if not Session.is_temp_dataset(dep_name):
            dependencies.add((dep_name, dep_version))
            continue

        # Temp datasets are created for optimization and are removed
        # afterwards. Therefore, we should not record them as dependencies,
        # but their own direct dependencies instead.
        for dep in self.catalog.get_dataset_dependencies(
            dep_name, dep_version, indirect=False
        ):
            if dep:
                dependencies.add((dep.name, dep.version))

    for dep_name, dep_version in dependencies:
        self.catalog.metastore.add_dataset_dependency(
            dataset.name,
            version,
            dep_name,
            dep_version,
        )
|
|
1686
1700
|
|
|
1687
1701
|
def exec(self) -> "Self":
|
|
@@ -195,5 +195,11 @@ class Session:
|
|
|
195
195
|
Session.GLOBAL_SESSION_CTX.__exit__(None, None, None)
|
|
196
196
|
|
|
197
197
|
for obj in gc.get_objects(): # Get all tracked objects
|
|
198
|
-
|
|
199
|
-
obj
|
|
198
|
+
try:
|
|
199
|
+
if isinstance(obj, Session):
|
|
200
|
+
# Cleanup temp dataset for session variables.
|
|
201
|
+
obj.__exit__(None, None, None)
|
|
202
|
+
except ReferenceError:
|
|
203
|
+
continue # Object has been finalized already
|
|
204
|
+
except Exception as e: # noqa: BLE001
|
|
205
|
+
logger.error(f"Exception while cleaning up session: {e}") # noqa: G004
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from sqlalchemy.sql.functions import GenericFunction
|
|
2
2
|
|
|
3
|
-
from datachain.sql.types import Boolean, Float, Int64
|
|
3
|
+
from datachain.sql.types import Boolean, Float, Int64, String
|
|
4
4
|
from datachain.sql.utils import compiler_not_implemented
|
|
5
5
|
|
|
6
6
|
|
|
@@ -48,6 +48,27 @@ class contains(GenericFunction): # noqa: N801
|
|
|
48
48
|
inherit_cache = True
|
|
49
49
|
|
|
50
50
|
|
|
51
|
+
class slice(GenericFunction):  # noqa: N801
    """
    SQL function that returns a slice (sub-array) of an array.
    """

    name = "slice"
    package = "array"
    inherit_cache = True
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class join(GenericFunction):  # noqa: N801
    """
    SQL function that concatenates the elements of an array into a string.
    """

    name = "join"
    package = "array"
    type = String()
    inherit_cache = True
|
|
70
|
+
|
|
71
|
+
|
|
51
72
|
class get_element(GenericFunction): # noqa: N801
|
|
52
73
|
"""
|
|
53
74
|
Returns the element at the given index in the array.
|
|
@@ -88,6 +88,8 @@ def setup():
|
|
|
88
88
|
compiles(sql_path.file_ext, "sqlite")(compile_path_file_ext)
|
|
89
89
|
compiles(array.length, "sqlite")(compile_array_length)
|
|
90
90
|
compiles(array.contains, "sqlite")(compile_array_contains)
|
|
91
|
+
compiles(array.slice, "sqlite")(compile_array_slice)
|
|
92
|
+
compiles(array.join, "sqlite")(compile_array_join)
|
|
91
93
|
compiles(array.get_element, "sqlite")(compile_array_get_element)
|
|
92
94
|
compiles(string.length, "sqlite")(compile_string_length)
|
|
93
95
|
compiles(string.split, "sqlite")(compile_string_split)
|
|
@@ -275,6 +277,15 @@ def register_user_defined_sql_functions() -> None:
|
|
|
275
277
|
conn.create_function(
|
|
276
278
|
"json_array_get_element", 2, py_json_array_get_element, deterministic=True
|
|
277
279
|
)
|
|
280
|
+
conn.create_function(
|
|
281
|
+
"json_array_slice", 2, py_json_array_slice, deterministic=True
|
|
282
|
+
)
|
|
283
|
+
conn.create_function(
|
|
284
|
+
"json_array_slice", 3, py_json_array_slice, deterministic=True
|
|
285
|
+
)
|
|
286
|
+
conn.create_function(
|
|
287
|
+
"json_array_join", 2, py_json_array_join, deterministic=True
|
|
288
|
+
)
|
|
278
289
|
|
|
279
290
|
_registered_function_creators["array_functions"] = create_array_functions
|
|
280
291
|
|
|
@@ -454,6 +465,20 @@ def py_json_array_get_element(val, idx):
|
|
|
454
465
|
return None
|
|
455
466
|
|
|
456
467
|
|
|
468
|
+
def py_json_array_slice(val, offset: int, length: Optional[int] = None):
    """
    Return a slice of a JSON-encoded array as a JSON-encoded array.

    Args:
        val: JSON text of the array; ``None`` (SQL NULL) yields ``None``.
        offset: Zero-based start index; a negative value counts from the end.
        length: Number of elements to take. When omitted, the slice runs to
            the end of the array.

    Returns:
        JSON text of the sliced array, or ``None`` when ``val`` is ``None``.
    """
    if val is None:
        return None

    arr = orjson.loads(val)
    if length is None:
        end = None
    else:
        end = offset + length
        # With a negative offset, a non-negative end would wrap around
        # (e.g. arr[-2:0] is empty); the slice should run to the end instead.
        if offset < 0 <= end:
            end = None
    return orjson.dumps(arr[offset:end]).decode("utf-8")
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def py_json_array_join(val, sep: str):
    """
    Join the elements of a JSON-encoded array of strings with `sep`.

    Returns ``None`` when `val` is ``None`` (SQL NULL) instead of raising
    inside the JSON decoder.
    """
    if val is None:
        return None
    return sep.join(orjson.loads(val))
|
|
480
|
+
|
|
481
|
+
|
|
457
482
|
def compile_array_get_element(element, compiler, **kwargs):
|
|
458
483
|
return compiler.process(
|
|
459
484
|
func.json_array_get_element(*element.clauses.clauses), **kwargs
|
|
@@ -470,6 +495,14 @@ def compile_array_contains(element, compiler, **kwargs):
|
|
|
470
495
|
)
|
|
471
496
|
|
|
472
497
|
|
|
498
|
+
def compile_array_slice(element, compiler, **kwargs):
    """Compile the array slice element as a call to `json_array_slice`."""
    clauses = element.clauses.clauses
    sliced = func.json_array_slice(*clauses)
    return compiler.process(sliced, **kwargs)
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def compile_array_join(element, compiler, **kwargs):
    """Compile the array join element as a call to `json_array_join`."""
    clauses = element.clauses.clauses
    joined = func.json_array_join(*clauses)
    return compiler.process(joined, **kwargs)
|
|
504
|
+
|
|
505
|
+
|
|
473
506
|
def compile_string_length(element, compiler, **kwargs):
|
|
474
507
|
return compiler.process(func.length(*element.clauses.clauses), **kwargs)
|
|
475
508
|
|
|
@@ -66,6 +66,8 @@ examples/get_started/torch-loader.py
|
|
|
66
66
|
examples/get_started/udfs/parallel.py
|
|
67
67
|
examples/get_started/udfs/simple.py
|
|
68
68
|
examples/get_started/udfs/stateful.py
|
|
69
|
+
examples/incremental_processing/delta.py
|
|
70
|
+
examples/incremental_processing/utils.py
|
|
69
71
|
examples/llm_and_nlp/claude-query.py
|
|
70
72
|
examples/llm_and_nlp/hf-dataset-llm-eval.py
|
|
71
73
|
examples/multimodal/clip_inference.py
|
|
@@ -12,6 +12,10 @@ llm_and_nlp_examples = sorted(glob.glob("examples/llm_and_nlp/**/*.py", recursiv
|
|
|
12
12
|
|
|
13
13
|
multimodal_examples = sorted(glob.glob("examples/multimodal/**/*.py", recursive=True))
|
|
14
14
|
|
|
15
|
+
incremental_processing_examples = sorted(
|
|
16
|
+
glob.glob("examples/incremental_processing/delta.py", recursive=True)
|
|
17
|
+
)
|
|
18
|
+
|
|
15
19
|
computer_vision_examples = sorted(
|
|
16
20
|
[
|
|
17
21
|
filename
|
|
@@ -86,6 +90,13 @@ def test_multimodal(example):
|
|
|
86
90
|
)
|
|
87
91
|
|
|
88
92
|
|
|
93
|
+
@pytest.mark.examples
@pytest.mark.incremental_processing
@pytest.mark.parametrize("example", incremental_processing_examples)
def test_incremental_processing_examples(example):
    # Run each collected incremental-processing example through the smoke test.
    smoke_test(example)
|
|
98
|
+
|
|
99
|
+
|
|
89
100
|
@pytest.mark.examples
|
|
90
101
|
@pytest.mark.computer_vision
|
|
91
102
|
@pytest.mark.parametrize("example", computer_vision_examples)
|
|
@@ -239,6 +239,25 @@ def test_read_storage_dependencies(cloud_test_catalog, cloud_type):
|
|
|
239
239
|
assert dependencies[0].name == dep_name
|
|
240
240
|
|
|
241
241
|
|
|
242
|
+
def test_persist_not_affects_dependencies(tmp_dir, test_session):
    """Saving after repeated persist() calls keeps only the storage dependency."""
    for idx in range(4):
        (tmp_dir / f"file{idx}.txt").write_text(f"file{idx}")

    uri = tmp_dir.as_uri()
    dep_name, _, _ = parse_listing_uri(uri, test_session.catalog.client_config)

    chain = dc.read_storage(uri, session=test_session)
    # calling multiple persists to create temp datasets
    for _ in range(3):
        chain = chain.persist()
    chain.save("test-data")

    dependencies = test_session.catalog.get_dataset_dependencies("test-data", "1.0.0")

    assert len(dependencies) == 1
    assert dependencies[0].name == dep_name
    assert dependencies[0].type == DatasetDependencyType.STORAGE
|
|
259
|
+
|
|
260
|
+
|
|
242
261
|
@pytest.mark.parametrize("use_cache", [True, False])
|
|
243
262
|
@pytest.mark.parametrize("prefetch", [0, 2])
|
|
244
263
|
def test_map_file(cloud_test_catalog, use_cache, prefetch, monkeypatch):
|