datachain 0.30.5__tar.gz → 0.30.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (420)
  1. {datachain-0.30.5 → datachain-0.30.6}/PKG-INFO +1 -1
  2. {datachain-0.30.5 → datachain-0.30.6}/docs/guide/delta.md +20 -0
  3. {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/json-csv-reader.py +8 -6
  4. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/datasets.py +32 -17
  5. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/delta.py +36 -20
  6. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/datachain.py +8 -0
  7. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/datasets.py +4 -0
  8. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/storage.py +5 -0
  9. {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/PKG-INFO +1 -1
  10. {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/SOURCES.txt +1 -0
  11. {datachain-0.30.5 → datachain-0.30.6}/tests/conftest.py +3 -5
  12. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_delta.py +88 -33
  13. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_retry.py +40 -0
  14. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_datachain.py +7 -0
  15. datachain-0.30.6/tests/unit/test_cli_datasets.py +64 -0
  16. {datachain-0.30.5 → datachain-0.30.6}/.cruft.json +0 -0
  17. {datachain-0.30.5 → datachain-0.30.6}/.gitattributes +0 -0
  18. {datachain-0.30.5 → datachain-0.30.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  19. {datachain-0.30.5 → datachain-0.30.6}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  20. {datachain-0.30.5 → datachain-0.30.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  21. {datachain-0.30.5 → datachain-0.30.6}/.github/codecov.yaml +0 -0
  22. {datachain-0.30.5 → datachain-0.30.6}/.github/dependabot.yml +0 -0
  23. {datachain-0.30.5 → datachain-0.30.6}/.github/workflows/benchmarks.yml +0 -0
  24. {datachain-0.30.5 → datachain-0.30.6}/.github/workflows/release.yml +0 -0
  25. {datachain-0.30.5 → datachain-0.30.6}/.github/workflows/tests-studio.yml +0 -0
  26. {datachain-0.30.5 → datachain-0.30.6}/.github/workflows/tests.yml +0 -0
  27. {datachain-0.30.5 → datachain-0.30.6}/.github/workflows/update-template.yaml +0 -0
  28. {datachain-0.30.5 → datachain-0.30.6}/.gitignore +0 -0
  29. {datachain-0.30.5 → datachain-0.30.6}/.pre-commit-config.yaml +0 -0
  30. {datachain-0.30.5 → datachain-0.30.6}/CODE_OF_CONDUCT.rst +0 -0
  31. {datachain-0.30.5 → datachain-0.30.6}/LICENSE +0 -0
  32. {datachain-0.30.5 → datachain-0.30.6}/README.rst +0 -0
  33. {datachain-0.30.5 → datachain-0.30.6}/docs/assets/captioned_cartoons.png +0 -0
  34. {datachain-0.30.5 → datachain-0.30.6}/docs/assets/datachain-white.svg +0 -0
  35. {datachain-0.30.5 → datachain-0.30.6}/docs/assets/datachain.svg +0 -0
  36. {datachain-0.30.5 → datachain-0.30.6}/docs/commands/auth/login.md +0 -0
  37. {datachain-0.30.5 → datachain-0.30.6}/docs/commands/auth/logout.md +0 -0
  38. {datachain-0.30.5 → datachain-0.30.6}/docs/commands/auth/team.md +0 -0
  39. {datachain-0.30.5 → datachain-0.30.6}/docs/commands/auth/token.md +0 -0
  40. {datachain-0.30.5 → datachain-0.30.6}/docs/commands/index.md +0 -0
  41. {datachain-0.30.5 → datachain-0.30.6}/docs/commands/job/cancel.md +0 -0
  42. {datachain-0.30.5 → datachain-0.30.6}/docs/commands/job/clusters.md +0 -0
  43. {datachain-0.30.5 → datachain-0.30.6}/docs/commands/job/logs.md +0 -0
  44. {datachain-0.30.5 → datachain-0.30.6}/docs/commands/job/ls.md +0 -0
  45. {datachain-0.30.5 → datachain-0.30.6}/docs/commands/job/run.md +0 -0
  46. {datachain-0.30.5 → datachain-0.30.6}/docs/contributing.md +0 -0
  47. {datachain-0.30.5 → datachain-0.30.6}/docs/css/github-permalink-style.css +0 -0
  48. {datachain-0.30.5 → datachain-0.30.6}/docs/examples.md +0 -0
  49. {datachain-0.30.5 → datachain-0.30.6}/docs/guide/db_migrations.md +0 -0
  50. {datachain-0.30.5 → datachain-0.30.6}/docs/guide/env.md +0 -0
  51. {datachain-0.30.5 → datachain-0.30.6}/docs/guide/index.md +0 -0
  52. {datachain-0.30.5 → datachain-0.30.6}/docs/guide/namespaces.md +0 -0
  53. {datachain-0.30.5 → datachain-0.30.6}/docs/guide/processing.md +0 -0
  54. {datachain-0.30.5 → datachain-0.30.6}/docs/guide/remotes.md +0 -0
  55. {datachain-0.30.5 → datachain-0.30.6}/docs/guide/retry.md +0 -0
  56. {datachain-0.30.5 → datachain-0.30.6}/docs/index.md +0 -0
  57. {datachain-0.30.5 → datachain-0.30.6}/docs/overrides/main.html +0 -0
  58. {datachain-0.30.5 → datachain-0.30.6}/docs/quick-start.md +0 -0
  59. {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/arrowrow.md +0 -0
  60. {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/bbox.md +0 -0
  61. {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/file.md +0 -0
  62. {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/imagefile.md +0 -0
  63. {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/index.md +0 -0
  64. {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/pose.md +0 -0
  65. {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/segment.md +0 -0
  66. {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/tarvfile.md +0 -0
  67. {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/textfile.md +0 -0
  68. {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/videofile.md +0 -0
  69. {datachain-0.30.5 → datachain-0.30.6}/docs/references/datachain.md +0 -0
  70. {datachain-0.30.5 → datachain-0.30.6}/docs/references/func.md +0 -0
  71. {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/aggregate.md +0 -0
  72. {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/array.md +0 -0
  73. {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/conditional.md +0 -0
  74. {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/numeric.md +0 -0
  75. {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/path.md +0 -0
  76. {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/random.md +0 -0
  77. {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/string.md +0 -0
  78. {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/window.md +0 -0
  79. {datachain-0.30.5 → datachain-0.30.6}/docs/references/index.md +0 -0
  80. {datachain-0.30.5 → datachain-0.30.6}/docs/references/toolkit.md +0 -0
  81. {datachain-0.30.5 → datachain-0.30.6}/docs/references/torch.md +0 -0
  82. {datachain-0.30.5 → datachain-0.30.6}/docs/references/udf.md +0 -0
  83. {datachain-0.30.5 → datachain-0.30.6}/docs/tutorials.md +0 -0
  84. {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  85. {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  86. {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/openimage-detect.py +0 -0
  87. {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/ultralytics-bbox.py +0 -0
  88. {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/ultralytics-pose.py +0 -0
  89. {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/ultralytics-segment.py +0 -0
  90. {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/common_sql_functions.py +0 -0
  91. {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/nested_datamodel.py +0 -0
  92. {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/torch-loader.py +0 -0
  93. {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/udfs/parallel.py +0 -0
  94. {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/udfs/simple.py +0 -0
  95. {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/udfs/stateful.py +0 -0
  96. {datachain-0.30.5 → datachain-0.30.6}/examples/incremental_processing/delta.py +0 -0
  97. {datachain-0.30.5 → datachain-0.30.6}/examples/incremental_processing/retry.py +0 -0
  98. {datachain-0.30.5 → datachain-0.30.6}/examples/incremental_processing/utils.py +0 -0
  99. {datachain-0.30.5 → datachain-0.30.6}/examples/llm_and_nlp/claude-query.py +0 -0
  100. {datachain-0.30.5 → datachain-0.30.6}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  101. {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/audio-to-text.py +0 -0
  102. {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/clip_inference.py +0 -0
  103. {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/hf_pipeline.py +0 -0
  104. {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/openai_image_desc_lib.py +0 -0
  105. {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/wds.py +0 -0
  106. {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/wds_filtered.py +0 -0
  107. {datachain-0.30.5 → datachain-0.30.6}/mkdocs.yml +0 -0
  108. {datachain-0.30.5 → datachain-0.30.6}/noxfile.py +0 -0
  109. {datachain-0.30.5 → datachain-0.30.6}/pyproject.toml +0 -0
  110. {datachain-0.30.5 → datachain-0.30.6}/setup.cfg +0 -0
  111. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/__init__.py +0 -0
  112. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/__main__.py +0 -0
  113. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/asyn.py +0 -0
  114. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cache.py +0 -0
  115. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/catalog/__init__.py +0 -0
  116. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/catalog/catalog.py +0 -0
  117. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/catalog/datasource.py +0 -0
  118. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/catalog/loader.py +0 -0
  119. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/__init__.py +0 -0
  120. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/__init__.py +0 -0
  121. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/du.py +0 -0
  122. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/index.py +0 -0
  123. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/ls.py +0 -0
  124. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/misc.py +0 -0
  125. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/query.py +0 -0
  126. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/show.py +0 -0
  127. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/parser/__init__.py +0 -0
  128. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/parser/job.py +0 -0
  129. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/parser/studio.py +0 -0
  130. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/parser/utils.py +0 -0
  131. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/utils.py +0 -0
  132. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/__init__.py +0 -0
  133. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/azure.py +0 -0
  134. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/fileslice.py +0 -0
  135. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/fsspec.py +0 -0
  136. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/gcs.py +0 -0
  137. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/hf.py +0 -0
  138. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/local.py +0 -0
  139. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/s3.py +0 -0
  140. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/config.py +0 -0
  141. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/__init__.py +0 -0
  142. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/db_engine.py +0 -0
  143. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/job.py +0 -0
  144. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/metastore.py +0 -0
  145. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/schema.py +0 -0
  146. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/serializer.py +0 -0
  147. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/sqlite.py +0 -0
  148. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/warehouse.py +0 -0
  149. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/dataset.py +0 -0
  150. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/diff/__init__.py +0 -0
  151. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/error.py +0 -0
  152. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/fs/__init__.py +0 -0
  153. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/fs/reference.py +0 -0
  154. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/fs/utils.py +0 -0
  155. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/__init__.py +0 -0
  156. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/aggregate.py +0 -0
  157. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/array.py +0 -0
  158. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/base.py +0 -0
  159. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/conditional.py +0 -0
  160. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/func.py +0 -0
  161. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/numeric.py +0 -0
  162. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/path.py +0 -0
  163. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/random.py +0 -0
  164. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/string.py +0 -0
  165. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/window.py +0 -0
  166. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/job.py +0 -0
  167. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/__init__.py +0 -0
  168. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/arrow.py +0 -0
  169. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/audio.py +0 -0
  170. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/clip.py +0 -0
  171. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/__init__.py +0 -0
  172. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/flatten.py +0 -0
  173. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/python_to_sql.py +0 -0
  174. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/sql_to_python.py +0 -0
  175. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/unflatten.py +0 -0
  176. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  177. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/data_model.py +0 -0
  178. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dataset_info.py +0 -0
  179. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/__init__.py +0 -0
  180. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/csv.py +0 -0
  181. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/database.py +0 -0
  182. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/hf.py +0 -0
  183. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/json.py +0 -0
  184. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/listings.py +0 -0
  185. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/pandas.py +0 -0
  186. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/parquet.py +0 -0
  187. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/records.py +0 -0
  188. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/utils.py +0 -0
  189. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/values.py +0 -0
  190. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/file.py +0 -0
  191. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/hf.py +0 -0
  192. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/image.py +0 -0
  193. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/listing.py +0 -0
  194. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/listing_info.py +0 -0
  195. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/meta_formats.py +0 -0
  196. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/model_store.py +0 -0
  197. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/namespaces.py +0 -0
  198. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/projects.py +0 -0
  199. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/pytorch.py +0 -0
  200. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/settings.py +0 -0
  201. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/signal_schema.py +0 -0
  202. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/tar.py +0 -0
  203. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/text.py +0 -0
  204. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/udf.py +0 -0
  205. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/udf_signature.py +0 -0
  206. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/utils.py +0 -0
  207. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/video.py +0 -0
  208. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/webdataset.py +0 -0
  209. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/webdataset_laion.py +0 -0
  210. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/listing.py +0 -0
  211. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/__init__.py +0 -0
  212. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/bbox.py +0 -0
  213. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/pose.py +0 -0
  214. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/segment.py +0 -0
  215. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/ultralytics/__init__.py +0 -0
  216. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/ultralytics/bbox.py +0 -0
  217. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/ultralytics/pose.py +0 -0
  218. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/ultralytics/segment.py +0 -0
  219. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/utils.py +0 -0
  220. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/namespace.py +0 -0
  221. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/node.py +0 -0
  222. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/nodes_fetcher.py +0 -0
  223. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/nodes_thread_pool.py +0 -0
  224. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/progress.py +0 -0
  225. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/project.py +0 -0
  226. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/py.typed +0 -0
  227. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/__init__.py +0 -0
  228. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/batch.py +0 -0
  229. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/dataset.py +0 -0
  230. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/dispatch.py +0 -0
  231. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/metrics.py +0 -0
  232. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/params.py +0 -0
  233. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/queue.py +0 -0
  234. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/schema.py +0 -0
  235. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/session.py +0 -0
  236. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/udf.py +0 -0
  237. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/utils.py +0 -0
  238. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/remote/__init__.py +0 -0
  239. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/remote/studio.py +0 -0
  240. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/script_meta.py +0 -0
  241. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/semver.py +0 -0
  242. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/__init__.py +0 -0
  243. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/default/__init__.py +0 -0
  244. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/default/base.py +0 -0
  245. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/__init__.py +0 -0
  246. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/aggregate.py +0 -0
  247. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/array.py +0 -0
  248. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/conditional.py +0 -0
  249. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/numeric.py +0 -0
  250. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/path.py +0 -0
  251. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/random.py +0 -0
  252. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/string.py +0 -0
  253. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/postgresql_dialect.py +0 -0
  254. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/postgresql_types.py +0 -0
  255. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/selectable.py +0 -0
  256. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/sqlite/__init__.py +0 -0
  257. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/sqlite/base.py +0 -0
  258. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/sqlite/types.py +0 -0
  259. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/sqlite/vector.py +0 -0
  260. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/types.py +0 -0
  261. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/utils.py +0 -0
  262. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/studio.py +0 -0
  263. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/telemetry.py +0 -0
  264. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/toolkit/__init__.py +0 -0
  265. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/toolkit/split.py +0 -0
  266. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/torch/__init__.py +0 -0
  267. {datachain-0.30.5 → datachain-0.30.6}/src/datachain/utils.py +0 -0
  268. {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/dependency_links.txt +0 -0
  269. {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/entry_points.txt +0 -0
  270. {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/requires.txt +0 -0
  271. {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/top_level.txt +0 -0
  272. {datachain-0.30.5 → datachain-0.30.6}/tests/__init__.py +0 -0
  273. {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/__init__.py +0 -0
  274. {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/conftest.py +0 -0
  275. {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  276. {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/datasets/.dvc/config +0 -0
  277. {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/datasets/.gitignore +0 -0
  278. {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  279. {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/test_datachain.py +0 -0
  280. {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/test_ls.py +0 -0
  281. {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/test_version.py +0 -0
  282. {datachain-0.30.5 → datachain-0.30.6}/tests/data.py +0 -0
  283. {datachain-0.30.5 → datachain-0.30.6}/tests/examples/__init__.py +0 -0
  284. {datachain-0.30.5 → datachain-0.30.6}/tests/examples/test_examples.py +0 -0
  285. {datachain-0.30.5 → datachain-0.30.6}/tests/examples/test_wds_e2e.py +0 -0
  286. {datachain-0.30.5 → datachain-0.30.6}/tests/examples/wds_data.py +0 -0
  287. {datachain-0.30.5 → datachain-0.30.6}/tests/func/__init__.py +0 -0
  288. {datachain-0.30.5 → datachain-0.30.6}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  289. {datachain-0.30.5 → datachain-0.30.6}/tests/func/data/lena.jpg +0 -0
  290. {datachain-0.30.5 → datachain-0.30.6}/tests/func/fake-service-account-credentials.json +0 -0
  291. {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/__init__.py +0 -0
  292. {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_aggregate.py +0 -0
  293. {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_array.py +0 -0
  294. {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_conditional.py +0 -0
  295. {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_numeric.py +0 -0
  296. {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_path.py +0 -0
  297. {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_random.py +0 -0
  298. {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_string.py +0 -0
  299. {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/__init__.py +0 -0
  300. {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/data/running-mask0.png +0 -0
  301. {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/data/running-mask1.png +0 -0
  302. {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/data/running.jpg +0 -0
  303. {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/data/ships.jpg +0 -0
  304. {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/test_yolo.py +0 -0
  305. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_audio.py +0 -0
  306. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_batching.py +0 -0
  307. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_catalog.py +0 -0
  308. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_client.py +0 -0
  309. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_cloud_transfer.py +0 -0
  310. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_data_storage.py +0 -0
  311. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_datachain.py +0 -0
  312. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_datachain_merge.py +0 -0
  313. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_dataset_query.py +0 -0
  314. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_datasets.py +0 -0
  315. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_feature_pickling.py +0 -0
  316. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_file.py +0 -0
  317. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_hf.py +0 -0
  318. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_hidden_field.py +0 -0
  319. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_image.py +0 -0
  320. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_listing.py +0 -0
  321. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_ls.py +0 -0
  322. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_meta_formats.py +0 -0
  323. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_metastore.py +0 -0
  324. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_metrics.py +0 -0
  325. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_mutate.py +0 -0
  326. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_pull.py +0 -0
  327. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_pytorch.py +0 -0
  328. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_query.py +0 -0
  329. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_read_database.py +0 -0
  330. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_read_dataset_remote.py +0 -0
  331. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  332. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_session.py +0 -0
  333. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_studio_datetime_parsing.py +0 -0
  334. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_to_database.py +0 -0
  335. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_toolkit.py +0 -0
  336. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_video.py +0 -0
  337. {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_warehouse.py +0 -0
  338. {datachain-0.30.5 → datachain-0.30.6}/tests/scripts/feature_class.py +0 -0
  339. {datachain-0.30.5 → datachain-0.30.6}/tests/scripts/feature_class_exception.py +0 -0
  340. {datachain-0.30.5 → datachain-0.30.6}/tests/scripts/feature_class_parallel.py +0 -0
  341. {datachain-0.30.5 → datachain-0.30.6}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  342. {datachain-0.30.5 → datachain-0.30.6}/tests/scripts/name_len_slow.py +0 -0
  343. {datachain-0.30.5 → datachain-0.30.6}/tests/test_atomicity.py +0 -0
  344. {datachain-0.30.5 → datachain-0.30.6}/tests/test_cli_e2e.py +0 -0
  345. {datachain-0.30.5 → datachain-0.30.6}/tests/test_cli_studio.py +0 -0
  346. {datachain-0.30.5 → datachain-0.30.6}/tests/test_import_time.py +0 -0
  347. {datachain-0.30.5 → datachain-0.30.6}/tests/test_query_e2e.py +0 -0
  348. {datachain-0.30.5 → datachain-0.30.6}/tests/test_telemetry.py +0 -0
  349. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/__init__.py +0 -0
  350. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/__init__.py +0 -0
  351. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/conftest.py +0 -0
  352. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_arrow.py +0 -0
  353. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_audio.py +0 -0
  354. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_clip.py +0 -0
  355. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  356. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_datachain_merge.py +0 -0
  357. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_diff.py +0 -0
  358. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_feature.py +0 -0
  359. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_feature_utils.py +0 -0
  360. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_file.py +0 -0
  361. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_hf.py +0 -0
  362. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_image.py +0 -0
  363. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_listing_info.py +0 -0
  364. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_namespace.py +0 -0
  365. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_partition_by.py +0 -0
  366. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_project.py +0 -0
  367. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_python_to_sql.py +0 -0
  368. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_schema.py +0 -0
  369. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_settings.py +0 -0
  370. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_signal_schema.py +0 -0
  371. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_sql_to_python.py +0 -0
  372. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_text.py +0 -0
  373. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_udf.py +0 -0
  374. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_udf_signature.py +0 -0
  375. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_utils.py +0 -0
  376. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_webdataset.py +0 -0
  377. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/model/__init__.py +0 -0
  378. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/model/test_bbox.py +0 -0
  379. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/model/test_pose.py +0 -0
  380. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/model/test_segment.py +0 -0
  381. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/model/test_utils.py +0 -0
  382. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/__init__.py +0 -0
  383. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/sqlite/__init__.py +0 -0
  384. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/sqlite/test_types.py +0 -0
  385. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/sqlite/test_utils.py +0 -0
  386. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_array.py +0 -0
  387. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_conditional.py +0 -0
  388. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_path.py +0 -0
  389. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_random.py +0 -0
  390. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_selectable.py +0 -0
  391. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_string.py +0 -0
  392. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_asyn.py +0 -0
  393. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_cache.py +0 -0
  394. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_catalog.py +0 -0
  395. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_catalog_loader.py +0 -0
  396. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_cli_parsing.py +0 -0
  397. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_client.py +0 -0
  398. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_client_gcs.py +0 -0
  399. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_client_s3.py +0 -0
  400. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_config.py +0 -0
  401. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_data_storage.py +0 -0
  402. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_database_engine.py +0 -0
  403. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_dataset.py +0 -0
  404. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_dispatch.py +0 -0
  405. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_fileslice.py +0 -0
  406. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_func.py +0 -0
  407. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_listing.py +0 -0
  408. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_metastore.py +0 -0
  409. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_module_exports.py +0 -0
  410. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_pytorch.py +0 -0
  411. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_query.py +0 -0
  412. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_query_metrics.py +0 -0
  413. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_query_params.py +0 -0
  414. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_script_meta.py +0 -0
  415. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_semver.py +0 -0
  416. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_serializer.py +0 -0
  417. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_session.py +0 -0
  418. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_utils.py +0 -0
  419. {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_warehouse.py +0 -0
  420. {datachain-0.30.5 → datachain-0.30.6}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.30.5
3
+ Version: 0.30.6
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -80,3 +80,23 @@ Delta processing can be combined with [retry processing](./retry.md) to create a
80
80
 
81
81
  1. Processes only new or changed records (delta)
82
82
  2. Reprocesses records with errors or that are missing (retry)
83
+
84
+ ## Using Delta with Restricted Methods
85
+
86
+ By default, delta updates cannot be combined with the following methods:
87
+
88
+ 1. `merge`
89
+ 2. `union`
90
+ 3. `distinct`
91
+ 4. `agg`
92
+ 5. `group_by`
93
+
94
+ These methods are restricted because they may produce **unexpected results** when used with delta processing. Delta runs the chain only on a subset of rows (new and changed records), while methods like `distinct`, `agg`, or `group_by` are designed to operate on the entire dataset.
95
+
96
+ Similarly, combining delta with methods like `merge` or `union` may result in duplicated rows when merging with a static dataset.
97
+
98
+ If you still need to use these methods together with delta, you can override this restriction by passing the following additional flag to `read_storage()` or `read_dataset()`:
99
+
100
+ ```python
101
+ delta_unsafe=True
102
+ ```
@@ -1,3 +1,4 @@
1
+ import os
1
2
  from typing import Optional
2
3
 
3
4
  import datachain as dc
@@ -39,7 +40,7 @@ def main():
39
40
  uri = "gs://datachain-demo/coco2017/annotations_captions/"
40
41
 
41
42
  # Print JSON schema in Pydantic format from main COCO annotation
42
- chain = dc.read_storage(uri, anon="True").filter(dc.C("file.path").glob("*.json"))
43
+ chain = dc.read_storage(uri, anon=True).filter(dc.C("file.path").glob("*.json"))
43
44
  file = chain.limit(1).to_values("file")[0]
44
45
  print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))
45
46
 
@@ -65,11 +66,12 @@ def main():
65
66
  dynamic_csv_ds.print_schema()
66
67
  dynamic_csv_ds.show()
67
68
 
68
- print(
69
- "Note: script might hang at the end due to https://github.com/apache/arrow/issues/43497"
70
- )
71
- print("Just press Ctrl+C to exit.")
72
-
73
69
 
74
70
  if __name__ == "__main__":
75
71
  main()
72
+
73
+ # Force exit without cleanup to avoid hanging due to arrow issue
74
+ print(
75
+ "Note: script might warn about leaked semaphore at the end due to https://github.com/apache/arrow/issues/43497"
76
+ )
77
+ os._exit(0)
@@ -1,30 +1,41 @@
1
1
  import sys
2
- from typing import TYPE_CHECKING, Optional
2
+ from collections.abc import Iterable, Iterator
3
+ from typing import TYPE_CHECKING, Optional, Union
3
4
 
4
5
  from tabulate import tabulate
5
6
 
6
- if TYPE_CHECKING:
7
- from datachain.catalog import Catalog
8
-
7
+ from datachain import semver
9
8
  from datachain.catalog import is_namespace_local
10
9
  from datachain.cli.utils import determine_flavors
11
10
  from datachain.config import Config
12
11
  from datachain.error import DataChainError, DatasetNotFoundError
13
12
  from datachain.studio import list_datasets as list_datasets_studio
14
13
 
14
+ if TYPE_CHECKING:
15
+ from datachain.catalog import Catalog
16
+
17
+
18
+ def group_dataset_versions(
19
+ datasets: Iterable[tuple[str, str]], latest_only=True
20
+ ) -> dict[str, Union[str, list[str]]]:
21
+ grouped: dict[str, list[tuple[int, int, int]]] = {}
15
22
 
16
- def group_dataset_versions(datasets, latest_only=True):
17
- grouped = {}
18
23
  # Sort so names are added to `grouped` in a deterministic, sorted order
19
24
  # (grouping uses setdefault, so consecutive keys are not required)
20
25
  for name, version in sorted(datasets):
21
- grouped.setdefault(name, []).append(version)
26
+ grouped.setdefault(name, []).append(semver.parse(version))
22
27
 
23
28
  if latest_only:
24
29
  # For each dataset name, pick the highest version.
25
- return {name: max(versions) for name, versions in grouped.items()}
30
+ return {
31
+ name: semver.create(*(max(versions))) for name, versions in grouped.items()
32
+ }
33
+
26
34
  # For each dataset name, return a sorted list of unique versions.
27
- return {name: sorted(set(versions)) for name, versions in grouped.items()}
35
+ return {
36
+ name: [semver.create(*v) for v in sorted(set(versions))]
37
+ for name, versions in grouped.items()
38
+ }
28
39
 
29
40
 
30
41
  def list_datasets(
@@ -35,7 +46,7 @@ def list_datasets(
35
46
  team: Optional[str] = None,
36
47
  latest_only: bool = True,
37
48
  name: Optional[str] = None,
38
- ):
49
+ ) -> None:
39
50
  token = Config().read().get("studio", {}).get("token")
40
51
  all, local, studio = determine_flavors(studio, local, all, token)
41
52
  if name:
@@ -95,27 +106,31 @@ def list_datasets(
95
106
  print(tabulate(rows, headers="keys"))
96
107
 
97
108
 
98
- def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
109
+ def list_datasets_local(
110
+ catalog: "Catalog", name: Optional[str] = None
111
+ ) -> Iterator[tuple[str, str]]:
99
112
  if name:
100
113
  yield from list_datasets_local_versions(catalog, name)
101
114
  return
102
115
 
103
116
  for d in catalog.ls_datasets():
104
117
  for v in d.versions:
105
- yield (d.full_name, v.version)
118
+ yield d.full_name, v.version
106
119
 
107
120
 
108
- def list_datasets_local_versions(catalog: "Catalog", name: str):
121
+ def list_datasets_local_versions(
122
+ catalog: "Catalog", name: str
123
+ ) -> Iterator[tuple[str, str]]:
109
124
  namespace_name, project_name, name = catalog.get_full_dataset_name(name)
110
125
 
111
126
  ds = catalog.get_dataset(
112
127
  name, namespace_name=namespace_name, project_name=project_name
113
128
  )
114
129
  for v in ds.versions:
115
- yield (name, v.version)
130
+ yield name, v.version
116
131
 
117
132
 
118
- def _datasets_tabulate_row(name, both, local_version, studio_version):
133
+ def _datasets_tabulate_row(name, both, local_version, studio_version) -> dict[str, str]:
119
134
  row = {
120
135
  "Name": name,
121
136
  }
@@ -136,7 +151,7 @@ def rm_dataset(
136
151
  force: Optional[bool] = False,
137
152
  studio: Optional[bool] = False,
138
153
  team: Optional[str] = None,
139
- ):
154
+ ) -> None:
140
155
  namespace_name, project_name, name = catalog.get_full_dataset_name(name)
141
156
 
142
157
  if studio:
@@ -166,7 +181,7 @@ def edit_dataset(
166
181
  description: Optional[str] = None,
167
182
  attrs: Optional[list[str]] = None,
168
183
  team: Optional[str] = None,
169
- ):
184
+ ) -> None:
170
185
  from datachain.lib.dc.utils import is_studio
171
186
 
172
187
  namespace_name, project_name, name = catalog.get_full_dataset_name(name)
@@ -4,7 +4,7 @@ from functools import wraps
4
4
  from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
5
5
 
6
6
  import datachain
7
- from datachain.dataset import DatasetDependency
7
+ from datachain.dataset import DatasetDependency, DatasetRecord
8
8
  from datachain.error import DatasetNotFoundError
9
9
  from datachain.project import Project
10
10
 
@@ -30,9 +30,10 @@ def delta_disabled(
30
30
 
31
31
  @wraps(method)
32
32
  def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T:
33
- if self.delta:
33
+ if self.delta and not self._delta_unsafe:
34
34
  raise NotImplementedError(
35
- f"Delta update cannot be used with {method.__name__}"
35
+ f"Cannot use {method.__name__} with delta datasets - may cause"
36
+ " inconsistency. Use delta_unsafe flag to allow this operation."
36
37
  )
37
38
  return method(self, *args, **kwargs)
38
39
 
@@ -124,10 +125,19 @@ def _get_retry_chain(
124
125
  # Subtract also diff chain since some items might be picked
125
126
  # up by `delta=True` itself (e.g. records got modified AND are missing in the
126
127
  # result dataset atm)
127
- return retry_chain.subtract(diff_chain, on=on) if retry_chain else None
128
+ on = [on] if isinstance(on, str) else on
129
+
130
+ return (
131
+ retry_chain.diff(
132
+ diff_chain, on=on, added=True, same=True, modified=False, deleted=False
133
+ ).distinct(*on)
134
+ if retry_chain
135
+ else None
136
+ )
128
137
 
129
138
 
130
139
  def _get_source_info(
140
+ source_ds: DatasetRecord,
131
141
  name: str,
132
142
  namespace_name: str,
133
143
  project_name: str,
@@ -154,25 +164,23 @@ def _get_source_info(
154
164
  indirect=False,
155
165
  )
156
166
 
157
- dep = dependencies[0]
158
- if not dep:
167
+ source_ds_dep = next((d for d in dependencies if d.name == source_ds.name), None)
168
+ if not source_ds_dep:
159
169
  # Starting dataset was removed, back off to normal dataset creation
160
170
  return None, None, None, None, None
161
171
 
162
- source_ds_project = catalog.metastore.get_project(dep.project, dep.namespace)
163
- source_ds_name = dep.name
164
- source_ds_version = dep.version
165
- source_ds_latest_version = catalog.get_dataset(
166
- source_ds_name,
167
- namespace_name=source_ds_project.namespace.name,
168
- project_name=source_ds_project.name,
169
- ).latest_version
172
+ # Refresh starting dataset to have new versions if they are created
173
+ source_ds = catalog.get_dataset(
174
+ source_ds.name,
175
+ namespace_name=source_ds.project.namespace.name,
176
+ project_name=source_ds.project.name,
177
+ )
170
178
 
171
179
  return (
172
- source_ds_name,
173
- source_ds_project,
174
- source_ds_version,
175
- source_ds_latest_version,
180
+ source_ds.name,
181
+ source_ds.project,
182
+ source_ds_dep.version,
183
+ source_ds.latest_version,
176
184
  dependencies,
177
185
  )
178
186
 
@@ -244,7 +252,14 @@ def delta_retry_update(
244
252
  source_ds_version,
245
253
  source_ds_latest_version,
246
254
  dependencies,
247
- ) = _get_source_info(name, namespace_name, project_name, latest_version, catalog)
255
+ ) = _get_source_info(
256
+ dc._query.starting_step.dataset, # type: ignore[union-attr]
257
+ name,
258
+ namespace_name,
259
+ project_name,
260
+ latest_version,
261
+ catalog,
262
+ )
248
263
 
249
264
  # If source_ds_name is None, starting dataset was removed
250
265
  if source_ds_name is None:
@@ -267,8 +282,9 @@ def delta_retry_update(
267
282
  if dependencies:
268
283
  dependencies = copy(dependencies)
269
284
  dependencies = [d for d in dependencies if d is not None]
285
+ source_ds_dep = next(d for d in dependencies if d.name == source_ds_name)
270
286
  # Update to latest version
271
- dependencies[0].version = source_ds_latest_version # type: ignore[union-attr]
287
+ source_ds_dep.version = source_ds_latest_version # type: ignore[union-attr]
272
288
 
273
289
  # Handle retry functionality if enabled
274
290
  if delta_retry:
@@ -193,6 +193,7 @@ class DataChain:
193
193
  self._setup: dict = setup or {}
194
194
  self._sys = _sys
195
195
  self._delta = False
196
+ self._delta_unsafe = False
196
197
  self._delta_on: Optional[Union[str, Sequence[str]]] = None
197
198
  self._delta_result_on: Optional[Union[str, Sequence[str]]] = None
198
199
  self._delta_compare: Optional[Union[str, Sequence[str]]] = None
@@ -216,6 +217,7 @@ class DataChain:
216
217
  right_on: Optional[Union[str, Sequence[str]]] = None,
217
218
  compare: Optional[Union[str, Sequence[str]]] = None,
218
219
  delta_retry: Optional[Union[bool, str]] = None,
220
+ delta_unsafe: bool = False,
219
221
  ) -> "Self":
220
222
  """Marks this chain as delta, which means special delta process will be
221
223
  called on saving dataset for optimization"""
@@ -226,6 +228,7 @@ class DataChain:
226
228
  self._delta_result_on = right_on
227
229
  self._delta_compare = compare
228
230
  self._delta_retry = delta_retry
231
+ self._delta_unsafe = delta_unsafe
229
232
  return self
230
233
 
231
234
  @property
@@ -238,6 +241,10 @@ class DataChain:
238
241
  """Returns True if this chain is run in "delta" update mode"""
239
242
  return self._delta
240
243
 
244
+ @property
245
+ def delta_unsafe(self) -> bool:
246
+ return self._delta_unsafe
247
+
241
248
  @property
242
249
  def schema(self) -> dict[str, DataType]:
243
250
  """Get schema of the chain."""
@@ -328,6 +335,7 @@ class DataChain:
328
335
  right_on=self._delta_result_on,
329
336
  compare=self._delta_compare,
330
337
  delta_retry=self._delta_retry,
338
+ delta_unsafe=self._delta_unsafe,
331
339
  )
332
340
 
333
341
  return chain
@@ -40,6 +40,7 @@ def read_dataset(
40
40
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
41
41
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
42
42
  delta_retry: Optional[Union[bool, str]] = None,
43
+ delta_unsafe: bool = False,
43
44
  update: bool = False,
44
45
  ) -> "DataChain":
45
46
  """Get data from a saved Dataset. It returns the chain itself.
@@ -80,6 +81,8 @@ def read_dataset(
80
81
  update: If True always checks for newer versions available on Studio, even if
81
82
  some version of the dataset exists locally already. If False (default), it
82
83
  will only fetch the dataset from Studio if it is not found locally.
84
+ delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
85
+ distinct.
83
86
 
84
87
 
85
88
  Example:
@@ -205,6 +208,7 @@ def read_dataset(
205
208
  right_on=delta_result_on,
206
209
  compare=delta_compare,
207
210
  delta_retry=delta_retry,
211
+ delta_unsafe=delta_unsafe,
208
212
  )
209
213
 
210
214
  return chain
@@ -43,6 +43,7 @@ def read_storage(
43
43
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
44
44
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
45
45
  delta_retry: Optional[Union[bool, str]] = None,
46
+ delta_unsafe: bool = False,
46
47
  client_config: Optional[dict] = None,
47
48
  ) -> "DataChain":
48
49
  """Get data from storage(s) as a list of files with all file attributes.
@@ -77,6 +78,9 @@ def read_storage(
77
78
  (error mode)
78
79
  - True: Reprocess records missing from the result dataset (missing mode)
79
80
  - None: No retry processing (default)
81
+ delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
82
+ distinct. Caller must ensure datasets are consistent and not partially
83
+ updated.
80
84
 
81
85
  Returns:
82
86
  DataChain: A DataChain object containing the file information.
@@ -218,6 +222,7 @@ def read_storage(
218
222
  right_on=delta_result_on,
219
223
  compare=delta_compare,
220
224
  delta_retry=delta_retry,
225
+ delta_unsafe=delta_unsafe,
221
226
  )
222
227
 
223
228
  return storage_chain
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.30.5
3
+ Version: 0.30.6
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -348,6 +348,7 @@ tests/unit/test_asyn.py
348
348
  tests/unit/test_cache.py
349
349
  tests/unit/test_catalog.py
350
350
  tests/unit/test_catalog_loader.py
351
+ tests/unit/test_cli_datasets.py
351
352
  tests/unit/test_cli_parsing.py
352
353
  tests/unit/test_client.py
353
354
  tests/unit/test_client_gcs.py
@@ -547,11 +547,9 @@ def is_studio():
547
547
 
548
548
  @pytest.fixture(autouse=True)
549
549
  def mock_is_studio(monkeypatch, is_studio):
550
- if not is_studio:
551
- yield
552
- else:
553
- monkeypatch.setenv("DATACHAIN_IS_STUDIO", True)
554
- yield
550
+ if is_studio:
551
+ monkeypatch.setenv("DATACHAIN_IS_STUDIO", "True")
552
+ yield
555
553
 
556
554
 
557
555
  @pytest.fixture
@@ -14,26 +14,16 @@ from datachain.lib.file import File, ImageFile
14
14
  def _get_dependencies(catalog, name, version) -> list[tuple[str, str]]:
15
15
  return sorted(
16
16
  [
17
- (f"{d.namespace}.{d.project}.{d.name}", d.version)
17
+ (d.name, d.version)
18
18
  for d in catalog.get_dataset_dependencies(name, version, indirect=False)
19
19
  ]
20
20
  )
21
21
 
22
22
 
23
- @pytest.mark.parametrize("project", ("global.dev", ""))
24
- def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path, project):
23
+ def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path):
25
24
  catalog = test_session.catalog
26
- default_namespace_name = catalog.metastore.default_namespace_name
27
- default_project_name = catalog.metastore.default_project_name
28
-
29
- if project:
30
- starting_ds_name = f"{project}.starting_ds"
31
- dependency_ds_name = starting_ds_name
32
- else:
33
- starting_ds_name = "starting_ds"
34
- dependency_ds_name = (
35
- f"{default_namespace_name}.{default_project_name}.{starting_ds_name}"
36
- )
25
+
26
+ starting_ds_name = "starting_ds"
37
27
  ds_name = "delta_ds"
38
28
 
39
29
  images = [
@@ -66,16 +56,12 @@ def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path, project):
66
56
  create_image_dataset(starting_ds_name, images[:2])
67
57
  # first version of delta dataset
68
58
  create_delta_dataset(ds_name)
69
- assert _get_dependencies(catalog, ds_name, "1.0.0") == [
70
- (dependency_ds_name, "1.0.0")
71
- ]
59
+ assert _get_dependencies(catalog, ds_name, "1.0.0") == [(starting_ds_name, "1.0.0")]
72
60
  # second version of starting dataset
73
61
  create_image_dataset(starting_ds_name, images[2:])
74
62
  # second version of delta dataset
75
63
  create_delta_dataset(ds_name)
76
- assert _get_dependencies(catalog, ds_name, "1.0.1") == [
77
- (dependency_ds_name, "1.0.1")
78
- ]
64
+ assert _get_dependencies(catalog, ds_name, "1.0.1") == [(starting_ds_name, "1.0.1")]
79
65
 
80
66
  assert (dc.read_dataset(ds_name, version="1.0.0").order_by("file.path")).to_values(
81
67
  "file.path"
@@ -96,6 +82,66 @@ def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path, project):
96
82
  create_delta_dataset(ds_name)
97
83
 
98
84
 
85
+ def test_delta_update_unsafe(test_session):
86
+ catalog = test_session.catalog
87
+
88
+ starting_ds_name = "starting_ds"
89
+ merge_ds_name = "merge_ds"
90
+ ds_name = "delta_ds"
91
+
92
+ # create dataset which will be merged to delta one
93
+ merge_ds = dc.read_values(
94
+ id=[1, 2, 3, 4, 5, 6], value=[1, 2, 3, 4, 5, 6], session=test_session
95
+ ).save(merge_ds_name)
96
+
97
+ # first version of starting dataset
98
+ dc.read_values(id=[1, 2, 3], session=test_session).save(starting_ds_name)
99
+ # first version of delta dataset
100
+ dc.read_dataset(
101
+ starting_ds_name,
102
+ session=test_session,
103
+ delta_on="id",
104
+ delta=True,
105
+ delta_unsafe=True,
106
+ ).merge(merge_ds, on="id", inner=True).save(ds_name)
107
+
108
+ assert set(_get_dependencies(catalog, ds_name, "1.0.0")) == {
109
+ (starting_ds_name, "1.0.0"),
110
+ (merge_ds_name, "1.0.0"),
111
+ }
112
+
113
+ # second version of starting dataset
114
+ dc.read_values(id=[1, 2, 3, 4, 5, 6], session=test_session).save(starting_ds_name)
115
+ # second version of delta dataset
116
+ dc.read_dataset(
117
+ starting_ds_name,
118
+ session=test_session,
119
+ delta_on="id",
120
+ delta=True,
121
+ delta_unsafe=True,
122
+ ).merge(merge_ds, on="id", inner=True).save(ds_name)
123
+
124
+ assert set(_get_dependencies(catalog, ds_name, "1.0.1")) == {
125
+ (starting_ds_name, "1.0.1"),
126
+ (merge_ds_name, "1.0.0"),
127
+ }
128
+
129
+ assert set((dc.read_dataset(ds_name, version="1.0.0")).to_list("id", "value")) == {
130
+ (1, 1),
131
+ (2, 2),
132
+ (3, 3),
133
+ }
134
+
135
+ assert set((dc.read_dataset(ds_name, version="1.0.1")).to_list("id", "value")) == {
136
+ (1, 1),
137
+ (2, 2),
138
+ (3, 3),
139
+ (4, 4),
140
+ (5, 5),
141
+ (6, 6),
142
+ }
143
+
144
+
99
145
  def test_delta_update_from_storage(test_session, tmp_dir, tmp_path):
100
146
  ds_name = "delta_ds"
101
147
  path = tmp_dir.as_uri()
@@ -249,8 +295,6 @@ def test_delta_update_check_num_calls(test_session, tmp_dir, tmp_path, capsys):
249
295
 
250
296
  def test_delta_update_no_diff(test_session, tmp_dir, tmp_path):
251
297
  catalog = test_session.catalog
252
- default_namespace_name = catalog.metastore.default_namespace_name
253
- default_project_name = catalog.metastore.default_project_name
254
298
  ds_name = "delta_ds"
255
299
  path = tmp_dir.as_uri()
256
300
  tmp_dir = tmp_dir / "images"
@@ -301,7 +345,8 @@ def test_delta_update_no_diff(test_session, tmp_dir, tmp_path):
301
345
 
302
346
  assert str(exc_info.value) == (
303
347
  f"Dataset {ds_name} version 1.0.1 not found in namespace "
304
- f"{default_namespace_name} and project {default_project_name}"
348
+ f"{catalog.metastore.default_namespace_name}"
349
+ f" and project {catalog.metastore.default_project_name}"
305
350
  )
306
351
 
307
352
 
@@ -325,11 +370,13 @@ def test_delta_update_union(test_session, file_dataset):
325
370
  file_dataset.name,
326
371
  session=test_session,
327
372
  delta=True,
328
- delta_on=["file.source", "file.path"],
329
373
  ).union(dc.read_dataset("numbers"), session=test_session)
330
374
  )
331
375
 
332
- assert str(excinfo.value) == "Delta update cannot be used with union"
376
+ assert str(excinfo.value) == (
377
+ "Cannot use union with delta datasets - may cause inconsistency."
378
+ " Use delta_unsafe flag to allow this operation."
379
+ )
333
380
 
334
381
 
335
382
  def test_delta_update_merge(test_session, file_dataset):
@@ -341,11 +388,13 @@ def test_delta_update_merge(test_session, file_dataset):
341
388
  file_dataset.name,
342
389
  session=test_session,
343
390
  delta=True,
344
- delta_on=["file.source", "file.path"],
345
391
  ).merge(dc.read_dataset("numbers"), on="id", session=test_session)
346
392
  )
347
393
 
348
- assert str(excinfo.value) == "Delta update cannot be used with merge"
394
+ assert str(excinfo.value) == (
395
+ "Cannot use merge with delta datasets - may cause inconsistency."
396
+ " Use delta_unsafe flag to allow this operation."
397
+ )
349
398
 
350
399
 
351
400
  def test_delta_update_distinct(test_session, file_dataset):
@@ -355,11 +404,13 @@ def test_delta_update_distinct(test_session, file_dataset):
355
404
  file_dataset.name,
356
405
  session=test_session,
357
406
  delta=True,
358
- delta_on=["file.source", "file.path"],
359
407
  ).distinct("file.path")
360
408
  )
361
409
 
362
- assert str(excinfo.value) == "Delta update cannot be used with distinct"
410
+ assert str(excinfo.value) == (
411
+ "Cannot use distinct with delta datasets - may cause inconsistency."
412
+ " Use delta_unsafe flag to allow this operation."
413
+ )
363
414
 
364
415
 
365
416
  def test_delta_update_group_by(test_session, file_dataset):
@@ -369,11 +420,13 @@ def test_delta_update_group_by(test_session, file_dataset):
369
420
  file_dataset.name,
370
421
  session=test_session,
371
422
  delta=True,
372
- delta_on=["file.source", "file.path"],
373
423
  ).group_by(cnt=func.count(), partition_by="file.path")
374
424
  )
375
425
 
376
- assert str(excinfo.value) == "Delta update cannot be used with group_by"
426
+ assert str(excinfo.value) == (
427
+ "Cannot use group_by with delta datasets - may cause inconsistency."
428
+ " Use delta_unsafe flag to allow this operation."
429
+ )
377
430
 
378
431
 
379
432
  def test_delta_update_agg(test_session, file_dataset):
@@ -383,8 +436,10 @@ def test_delta_update_agg(test_session, file_dataset):
383
436
  file_dataset.name,
384
437
  session=test_session,
385
438
  delta=True,
386
- delta_on=["file.source", "file.path"],
387
439
  ).agg(cnt=func.count(), partition_by="file.path")
388
440
  )
389
441
 
390
- assert str(excinfo.value) == "Delta update cannot be used with agg"
442
+ assert str(excinfo.value) == (
443
+ "Cannot use agg with delta datasets - may cause inconsistency."
444
+ " Use delta_unsafe flag to allow this operation."
445
+ )
@@ -1,3 +1,4 @@
1
+ from collections.abc import Iterator
1
2
  from datetime import datetime, timezone
2
3
  from typing import TYPE_CHECKING
3
4
 
@@ -425,3 +426,42 @@ def test_delta_and_delta_retry_no_duplicates(test_session):
425
426
  assert len(ids_in_result) == 4
426
427
  assert len(set(ids_in_result)) == 4 # No duplicate IDs
427
428
  assert set(ids_in_result) == {1, 2, 3, 4}
429
+
430
+
431
+ def test_repeating_errors(test_session):
432
+ def run_delta():
433
+ def func(id) -> Iterator[tuple[int, str, str]]:
434
+ yield id, "name1", "error"
435
+ yield id, "name2", "error"
436
+
437
+ return (
438
+ dc.read_dataset(
439
+ "sample_data",
440
+ delta=True,
441
+ delta_on="id",
442
+ delta_result_on="id",
443
+ delta_retry="error",
444
+ session=test_session,
445
+ )
446
+ .gen(func, output={"id": int, "name": str, "error": str})
447
+ .save("processed_data")
448
+ )
449
+ return dc.read_dataset("processed_data")
450
+
451
+ _create_sample_data(
452
+ test_session, ids=list(range(1)), contents=[str(i) for i in range(1)]
453
+ )
454
+ ch1 = run_delta()
455
+ assert sorted(ch1.collect("id")) == [0, 0]
456
+
457
+ _create_sample_data(
458
+ test_session, ids=list(range(2)), contents=[str(i) for i in range(2)]
459
+ )
460
+ ch2 = run_delta()
461
+ assert sorted(ch2.collect("id")) == [0, 0, 1, 1]
462
+
463
+ _create_sample_data(
464
+ test_session, ids=list(range(3)), contents=[str(i) for i in range(3)]
465
+ )
466
+ ch3 = run_delta()
467
+ assert sorted(ch3.collect("id")) == [0, 0, 1, 1, 2, 2]