datachain 0.24.0__tar.gz → 0.24.1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (399)
  1. {datachain-0.24.0 → datachain-0.24.1}/PKG-INFO +1 -1
  2. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/delta.py +82 -25
  3. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/datachain.py +2 -0
  4. {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/PKG-INFO +1 -1
  5. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_delta.py +20 -5
  6. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_retry.py +164 -50
  7. {datachain-0.24.0 → datachain-0.24.1}/.cruft.json +0 -0
  8. {datachain-0.24.0 → datachain-0.24.1}/.gitattributes +0 -0
  9. {datachain-0.24.0 → datachain-0.24.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  10. {datachain-0.24.0 → datachain-0.24.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  11. {datachain-0.24.0 → datachain-0.24.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  12. {datachain-0.24.0 → datachain-0.24.1}/.github/codecov.yaml +0 -0
  13. {datachain-0.24.0 → datachain-0.24.1}/.github/dependabot.yml +0 -0
  14. {datachain-0.24.0 → datachain-0.24.1}/.github/workflows/benchmarks.yml +0 -0
  15. {datachain-0.24.0 → datachain-0.24.1}/.github/workflows/release.yml +0 -0
  16. {datachain-0.24.0 → datachain-0.24.1}/.github/workflows/tests-studio.yml +0 -0
  17. {datachain-0.24.0 → datachain-0.24.1}/.github/workflows/tests.yml +0 -0
  18. {datachain-0.24.0 → datachain-0.24.1}/.github/workflows/update-template.yaml +0 -0
  19. {datachain-0.24.0 → datachain-0.24.1}/.gitignore +0 -0
  20. {datachain-0.24.0 → datachain-0.24.1}/.pre-commit-config.yaml +0 -0
  21. {datachain-0.24.0 → datachain-0.24.1}/CODE_OF_CONDUCT.rst +0 -0
  22. {datachain-0.24.0 → datachain-0.24.1}/LICENSE +0 -0
  23. {datachain-0.24.0 → datachain-0.24.1}/README.rst +0 -0
  24. {datachain-0.24.0 → datachain-0.24.1}/docs/assets/captioned_cartoons.png +0 -0
  25. {datachain-0.24.0 → datachain-0.24.1}/docs/assets/datachain-white.svg +0 -0
  26. {datachain-0.24.0 → datachain-0.24.1}/docs/assets/datachain.svg +0 -0
  27. {datachain-0.24.0 → datachain-0.24.1}/docs/commands/auth/login.md +0 -0
  28. {datachain-0.24.0 → datachain-0.24.1}/docs/commands/auth/logout.md +0 -0
  29. {datachain-0.24.0 → datachain-0.24.1}/docs/commands/auth/team.md +0 -0
  30. {datachain-0.24.0 → datachain-0.24.1}/docs/commands/auth/token.md +0 -0
  31. {datachain-0.24.0 → datachain-0.24.1}/docs/commands/index.md +0 -0
  32. {datachain-0.24.0 → datachain-0.24.1}/docs/commands/job/cancel.md +0 -0
  33. {datachain-0.24.0 → datachain-0.24.1}/docs/commands/job/clusters.md +0 -0
  34. {datachain-0.24.0 → datachain-0.24.1}/docs/commands/job/logs.md +0 -0
  35. {datachain-0.24.0 → datachain-0.24.1}/docs/commands/job/ls.md +0 -0
  36. {datachain-0.24.0 → datachain-0.24.1}/docs/commands/job/run.md +0 -0
  37. {datachain-0.24.0 → datachain-0.24.1}/docs/contributing.md +0 -0
  38. {datachain-0.24.0 → datachain-0.24.1}/docs/css/github-permalink-style.css +0 -0
  39. {datachain-0.24.0 → datachain-0.24.1}/docs/examples.md +0 -0
  40. {datachain-0.24.0 → datachain-0.24.1}/docs/guide/db_migrations.md +0 -0
  41. {datachain-0.24.0 → datachain-0.24.1}/docs/guide/delta.md +0 -0
  42. {datachain-0.24.0 → datachain-0.24.1}/docs/guide/env.md +0 -0
  43. {datachain-0.24.0 → datachain-0.24.1}/docs/guide/index.md +0 -0
  44. {datachain-0.24.0 → datachain-0.24.1}/docs/guide/namespaces.md +0 -0
  45. {datachain-0.24.0 → datachain-0.24.1}/docs/guide/processing.md +0 -0
  46. {datachain-0.24.0 → datachain-0.24.1}/docs/guide/remotes.md +0 -0
  47. {datachain-0.24.0 → datachain-0.24.1}/docs/guide/retry.md +0 -0
  48. {datachain-0.24.0 → datachain-0.24.1}/docs/index.md +0 -0
  49. {datachain-0.24.0 → datachain-0.24.1}/docs/overrides/main.html +0 -0
  50. {datachain-0.24.0 → datachain-0.24.1}/docs/quick-start.md +0 -0
  51. {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/arrowrow.md +0 -0
  52. {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/bbox.md +0 -0
  53. {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/file.md +0 -0
  54. {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/imagefile.md +0 -0
  55. {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/index.md +0 -0
  56. {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/pose.md +0 -0
  57. {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/segment.md +0 -0
  58. {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/tarvfile.md +0 -0
  59. {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/textfile.md +0 -0
  60. {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/videofile.md +0 -0
  61. {datachain-0.24.0 → datachain-0.24.1}/docs/references/datachain.md +0 -0
  62. {datachain-0.24.0 → datachain-0.24.1}/docs/references/func.md +0 -0
  63. {datachain-0.24.0 → datachain-0.24.1}/docs/references/index.md +0 -0
  64. {datachain-0.24.0 → datachain-0.24.1}/docs/references/toolkit.md +0 -0
  65. {datachain-0.24.0 → datachain-0.24.1}/docs/references/torch.md +0 -0
  66. {datachain-0.24.0 → datachain-0.24.1}/docs/references/udf.md +0 -0
  67. {datachain-0.24.0 → datachain-0.24.1}/docs/tutorials.md +0 -0
  68. {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  69. {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  70. {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/openimage-detect.py +0 -0
  71. {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
  72. {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/ultralytics-pose.py +0 -0
  73. {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/ultralytics-segment.py +0 -0
  74. {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/common_sql_functions.py +0 -0
  75. {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/json-csv-reader.py +0 -0
  76. {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/torch-loader.py +0 -0
  77. {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/udfs/parallel.py +0 -0
  78. {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/udfs/simple.py +0 -0
  79. {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/udfs/stateful.py +0 -0
  80. {datachain-0.24.0 → datachain-0.24.1}/examples/incremental_processing/delta.py +0 -0
  81. {datachain-0.24.0 → datachain-0.24.1}/examples/incremental_processing/retry.py +0 -0
  82. {datachain-0.24.0 → datachain-0.24.1}/examples/incremental_processing/utils.py +0 -0
  83. {datachain-0.24.0 → datachain-0.24.1}/examples/llm_and_nlp/claude-query.py +0 -0
  84. {datachain-0.24.0 → datachain-0.24.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  85. {datachain-0.24.0 → datachain-0.24.1}/examples/multimodal/clip_inference.py +0 -0
  86. {datachain-0.24.0 → datachain-0.24.1}/examples/multimodal/hf_pipeline.py +0 -0
  87. {datachain-0.24.0 → datachain-0.24.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  88. {datachain-0.24.0 → datachain-0.24.1}/examples/multimodal/wds.py +0 -0
  89. {datachain-0.24.0 → datachain-0.24.1}/examples/multimodal/wds_filtered.py +0 -0
  90. {datachain-0.24.0 → datachain-0.24.1}/mkdocs.yml +0 -0
  91. {datachain-0.24.0 → datachain-0.24.1}/noxfile.py +0 -0
  92. {datachain-0.24.0 → datachain-0.24.1}/pyproject.toml +0 -0
  93. {datachain-0.24.0 → datachain-0.24.1}/setup.cfg +0 -0
  94. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/__init__.py +0 -0
  95. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/__main__.py +0 -0
  96. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/asyn.py +0 -0
  97. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cache.py +0 -0
  98. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/catalog/__init__.py +0 -0
  99. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/catalog/catalog.py +0 -0
  100. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/catalog/datasource.py +0 -0
  101. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/catalog/loader.py +0 -0
  102. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/__init__.py +0 -0
  103. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/__init__.py +0 -0
  104. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/datasets.py +0 -0
  105. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/du.py +0 -0
  106. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/index.py +0 -0
  107. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/ls.py +0 -0
  108. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/misc.py +0 -0
  109. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/query.py +0 -0
  110. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/show.py +0 -0
  111. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/parser/__init__.py +0 -0
  112. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/parser/job.py +0 -0
  113. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/parser/studio.py +0 -0
  114. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/parser/utils.py +0 -0
  115. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/utils.py +0 -0
  116. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/__init__.py +0 -0
  117. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/azure.py +0 -0
  118. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/fileslice.py +0 -0
  119. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/fsspec.py +0 -0
  120. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/gcs.py +0 -0
  121. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/hf.py +0 -0
  122. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/local.py +0 -0
  123. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/s3.py +0 -0
  124. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/config.py +0 -0
  125. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/__init__.py +0 -0
  126. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/db_engine.py +0 -0
  127. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/job.py +0 -0
  128. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/metastore.py +0 -0
  129. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/schema.py +0 -0
  130. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/serializer.py +0 -0
  131. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/sqlite.py +0 -0
  132. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/warehouse.py +0 -0
  133. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/dataset.py +0 -0
  134. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/diff/__init__.py +0 -0
  135. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/error.py +0 -0
  136. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/fs/__init__.py +0 -0
  137. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/fs/reference.py +0 -0
  138. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/fs/utils.py +0 -0
  139. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/__init__.py +0 -0
  140. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/aggregate.py +0 -0
  141. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/array.py +0 -0
  142. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/base.py +0 -0
  143. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/conditional.py +0 -0
  144. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/func.py +0 -0
  145. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/numeric.py +0 -0
  146. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/path.py +0 -0
  147. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/random.py +0 -0
  148. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/string.py +0 -0
  149. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/window.py +0 -0
  150. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/job.py +0 -0
  151. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/__init__.py +0 -0
  152. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/arrow.py +0 -0
  153. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/clip.py +0 -0
  154. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/__init__.py +0 -0
  155. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/flatten.py +0 -0
  156. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  157. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
  158. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/unflatten.py +0 -0
  159. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  160. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/data_model.py +0 -0
  161. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dataset_info.py +0 -0
  162. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/__init__.py +0 -0
  163. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/csv.py +0 -0
  164. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/database.py +0 -0
  165. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/datasets.py +0 -0
  166. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/hf.py +0 -0
  167. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/json.py +0 -0
  168. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/listings.py +0 -0
  169. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/pandas.py +0 -0
  170. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/parquet.py +0 -0
  171. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/records.py +0 -0
  172. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/storage.py +0 -0
  173. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/utils.py +0 -0
  174. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/values.py +0 -0
  175. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/file.py +0 -0
  176. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/hf.py +0 -0
  177. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/image.py +0 -0
  178. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/listing.py +0 -0
  179. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/listing_info.py +0 -0
  180. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/meta_formats.py +0 -0
  181. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/model_store.py +0 -0
  182. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/namespaces.py +0 -0
  183. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/projects.py +0 -0
  184. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/pytorch.py +0 -0
  185. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/settings.py +0 -0
  186. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/signal_schema.py +0 -0
  187. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/tar.py +0 -0
  188. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/text.py +0 -0
  189. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/udf.py +0 -0
  190. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/udf_signature.py +0 -0
  191. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/utils.py +0 -0
  192. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/video.py +0 -0
  193. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/webdataset.py +0 -0
  194. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/webdataset_laion.py +0 -0
  195. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/listing.py +0 -0
  196. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/__init__.py +0 -0
  197. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/bbox.py +0 -0
  198. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/pose.py +0 -0
  199. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/segment.py +0 -0
  200. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/ultralytics/__init__.py +0 -0
  201. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/ultralytics/bbox.py +0 -0
  202. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/ultralytics/pose.py +0 -0
  203. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/ultralytics/segment.py +0 -0
  204. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/utils.py +0 -0
  205. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/namespace.py +0 -0
  206. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/node.py +0 -0
  207. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/nodes_fetcher.py +0 -0
  208. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/nodes_thread_pool.py +0 -0
  209. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/progress.py +0 -0
  210. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/project.py +0 -0
  211. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/py.typed +0 -0
  212. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/__init__.py +0 -0
  213. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/batch.py +0 -0
  214. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/dataset.py +0 -0
  215. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/dispatch.py +0 -0
  216. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/metrics.py +0 -0
  217. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/params.py +0 -0
  218. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/queue.py +0 -0
  219. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/schema.py +0 -0
  220. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/session.py +0 -0
  221. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/udf.py +0 -0
  222. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/utils.py +0 -0
  223. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/remote/__init__.py +0 -0
  224. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/remote/studio.py +0 -0
  225. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/script_meta.py +0 -0
  226. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/semver.py +0 -0
  227. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/__init__.py +0 -0
  228. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/default/__init__.py +0 -0
  229. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/default/base.py +0 -0
  230. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/__init__.py +0 -0
  231. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/aggregate.py +0 -0
  232. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/array.py +0 -0
  233. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/conditional.py +0 -0
  234. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/numeric.py +0 -0
  235. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/path.py +0 -0
  236. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/random.py +0 -0
  237. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/string.py +0 -0
  238. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/selectable.py +0 -0
  239. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  240. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/sqlite/base.py +0 -0
  241. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/sqlite/types.py +0 -0
  242. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/sqlite/vector.py +0 -0
  243. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/types.py +0 -0
  244. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/utils.py +0 -0
  245. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/studio.py +0 -0
  246. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/telemetry.py +0 -0
  247. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/toolkit/__init__.py +0 -0
  248. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/toolkit/split.py +0 -0
  249. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/torch/__init__.py +0 -0
  250. {datachain-0.24.0 → datachain-0.24.1}/src/datachain/utils.py +0 -0
  251. {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/SOURCES.txt +0 -0
  252. {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  253. {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/entry_points.txt +0 -0
  254. {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/requires.txt +0 -0
  255. {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/top_level.txt +0 -0
  256. {datachain-0.24.0 → datachain-0.24.1}/tests/__init__.py +0 -0
  257. {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/__init__.py +0 -0
  258. {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/conftest.py +0 -0
  259. {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  260. {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  261. {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/datasets/.gitignore +0 -0
  262. {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  263. {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/test_datachain.py +0 -0
  264. {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/test_ls.py +0 -0
  265. {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/test_version.py +0 -0
  266. {datachain-0.24.0 → datachain-0.24.1}/tests/conftest.py +0 -0
  267. {datachain-0.24.0 → datachain-0.24.1}/tests/data.py +0 -0
  268. {datachain-0.24.0 → datachain-0.24.1}/tests/examples/__init__.py +0 -0
  269. {datachain-0.24.0 → datachain-0.24.1}/tests/examples/test_examples.py +0 -0
  270. {datachain-0.24.0 → datachain-0.24.1}/tests/examples/test_wds_e2e.py +0 -0
  271. {datachain-0.24.0 → datachain-0.24.1}/tests/examples/wds_data.py +0 -0
  272. {datachain-0.24.0 → datachain-0.24.1}/tests/func/__init__.py +0 -0
  273. {datachain-0.24.0 → datachain-0.24.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  274. {datachain-0.24.0 → datachain-0.24.1}/tests/func/data/lena.jpg +0 -0
  275. {datachain-0.24.0 → datachain-0.24.1}/tests/func/fake-service-account-credentials.json +0 -0
  276. {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/__init__.py +0 -0
  277. {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_aggregate.py +0 -0
  278. {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_array.py +0 -0
  279. {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_conditional.py +0 -0
  280. {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_numeric.py +0 -0
  281. {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_path.py +0 -0
  282. {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_random.py +0 -0
  283. {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_string.py +0 -0
  284. {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/__init__.py +0 -0
  285. {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/data/running-mask0.png +0 -0
  286. {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/data/running-mask1.png +0 -0
  287. {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/data/running.jpg +0 -0
  288. {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/data/ships.jpg +0 -0
  289. {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/test_yolo.py +0 -0
  290. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_batching.py +0 -0
  291. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_catalog.py +0 -0
  292. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_client.py +0 -0
  293. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_cloud_transfer.py +0 -0
  294. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_data_storage.py +0 -0
  295. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_datachain.py +0 -0
  296. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_datachain_merge.py +0 -0
  297. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_dataset_query.py +0 -0
  298. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_datasets.py +0 -0
  299. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_feature_pickling.py +0 -0
  300. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_file.py +0 -0
  301. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_hf.py +0 -0
  302. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_hidden_field.py +0 -0
  303. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_image.py +0 -0
  304. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_listing.py +0 -0
  305. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_ls.py +0 -0
  306. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_meta_formats.py +0 -0
  307. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_metastore.py +0 -0
  308. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_metrics.py +0 -0
  309. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_pull.py +0 -0
  310. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_pytorch.py +0 -0
  311. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_query.py +0 -0
  312. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_read_database.py +0 -0
  313. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_read_dataset_remote.py +0 -0
  314. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  315. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_session.py +0 -0
  316. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_toolkit.py +0 -0
  317. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_video.py +0 -0
  318. {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_warehouse.py +0 -0
  319. {datachain-0.24.0 → datachain-0.24.1}/tests/scripts/feature_class.py +0 -0
  320. {datachain-0.24.0 → datachain-0.24.1}/tests/scripts/feature_class_exception.py +0 -0
  321. {datachain-0.24.0 → datachain-0.24.1}/tests/scripts/feature_class_parallel.py +0 -0
  322. {datachain-0.24.0 → datachain-0.24.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  323. {datachain-0.24.0 → datachain-0.24.1}/tests/scripts/name_len_slow.py +0 -0
  324. {datachain-0.24.0 → datachain-0.24.1}/tests/test_atomicity.py +0 -0
  325. {datachain-0.24.0 → datachain-0.24.1}/tests/test_cli_e2e.py +0 -0
  326. {datachain-0.24.0 → datachain-0.24.1}/tests/test_cli_studio.py +0 -0
  327. {datachain-0.24.0 → datachain-0.24.1}/tests/test_import_time.py +0 -0
  328. {datachain-0.24.0 → datachain-0.24.1}/tests/test_query_e2e.py +0 -0
  329. {datachain-0.24.0 → datachain-0.24.1}/tests/test_telemetry.py +0 -0
  330. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/__init__.py +0 -0
  331. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/__init__.py +0 -0
  332. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/conftest.py +0 -0
  333. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_arrow.py +0 -0
  334. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_clip.py +0 -0
  335. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_datachain.py +0 -0
  336. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  337. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  338. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_diff.py +0 -0
  339. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_feature.py +0 -0
  340. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_feature_utils.py +0 -0
  341. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_file.py +0 -0
  342. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_hf.py +0 -0
  343. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_image.py +0 -0
  344. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_listing_info.py +0 -0
  345. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_namespace.py +0 -0
  346. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_project.py +0 -0
  347. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_python_to_sql.py +0 -0
  348. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_schema.py +0 -0
  349. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_signal_schema.py +0 -0
  350. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_sql_to_python.py +0 -0
  351. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_text.py +0 -0
  352. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_udf.py +0 -0
  353. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_udf_signature.py +0 -0
  354. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_utils.py +0 -0
  355. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_webdataset.py +0 -0
  356. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/model/__init__.py +0 -0
  357. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/model/test_bbox.py +0 -0
  358. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/model/test_pose.py +0 -0
  359. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/model/test_segment.py +0 -0
  360. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/model/test_utils.py +0 -0
  361. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/__init__.py +0 -0
  362. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  363. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/sqlite/test_types.py +0 -0
  364. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  365. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_array.py +0 -0
  366. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_conditional.py +0 -0
  367. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_path.py +0 -0
  368. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_random.py +0 -0
  369. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_selectable.py +0 -0
  370. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_string.py +0 -0
  371. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_asyn.py +0 -0
  372. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_cache.py +0 -0
  373. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_catalog.py +0 -0
  374. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_catalog_loader.py +0 -0
  375. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_cli_parsing.py +0 -0
  376. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_client.py +0 -0
  377. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_client_gcs.py +0 -0
  378. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_client_s3.py +0 -0
  379. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_config.py +0 -0
  380. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_data_storage.py +0 -0
  381. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_database_engine.py +0 -0
  382. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_dataset.py +0 -0
  383. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_dispatch.py +0 -0
  384. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_fileslice.py +0 -0
  385. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_func.py +0 -0
  386. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_listing.py +0 -0
  387. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_metastore.py +0 -0
  388. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_module_exports.py +0 -0
  389. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_pytorch.py +0 -0
  390. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_query.py +0 -0
  391. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_query_metrics.py +0 -0
  392. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_query_params.py +0 -0
  393. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_script_meta.py +0 -0
  394. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_semver.py +0 -0
  395. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_serializer.py +0 -0
  396. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_session.py +0 -0
  397. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_utils.py +0 -0
  398. {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_warehouse.py +0 -0
  399. {datachain-0.24.0 → datachain-0.24.1}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.24.0
3
+ Version: 0.24.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
6
6
  import datachain
7
7
  from datachain.dataset import DatasetDependency
8
8
  from datachain.error import DatasetNotFoundError
9
+ from datachain.project import Project
9
10
 
10
11
  if TYPE_CHECKING:
11
12
  from typing_extensions import Concatenate, ParamSpec
@@ -50,15 +51,24 @@ def _append_steps(dc: "DataChain", other: "DataChain"):
50
51
 
51
52
  def _get_delta_chain(
52
53
  source_ds_name: str,
54
+ source_ds_project: Project,
53
55
  source_ds_version: str,
54
56
  source_ds_latest_version: str,
55
57
  on: Union[str, Sequence[str]],
56
58
  compare: Optional[Union[str, Sequence[str]]] = None,
57
59
  ) -> "DataChain":
58
60
  """Get delta chain for processing changes between versions."""
59
- source_dc = datachain.read_dataset(source_ds_name, version=source_ds_version)
61
+ source_dc = datachain.read_dataset(
62
+ source_ds_name,
63
+ namespace=source_ds_project.namespace.name,
64
+ project=source_ds_project.name,
65
+ version=source_ds_version,
66
+ )
60
67
  source_dc_latest = datachain.read_dataset(
61
- source_ds_name, version=source_ds_latest_version
68
+ source_ds_name,
69
+ namespace=source_ds_project.namespace.name,
70
+ project=source_ds_project.name,
71
+ version=source_ds_latest_version,
62
72
  )
63
73
 
64
74
  # Calculate diff between source versions
@@ -67,12 +77,15 @@ def _get_delta_chain(
67
77
 
68
78
  def _get_retry_chain(
69
79
  name: str,
80
+ project: Project,
70
81
  latest_version: str,
71
82
  source_ds_name: str,
72
- source_ds_latest_version: str,
83
+ source_ds_project: Project,
84
+ source_ds_version: str,
73
85
  on: Union[str, Sequence[str]],
74
86
  right_on: Optional[Union[str, Sequence[str]]],
75
87
  delta_retry: Optional[Union[bool, str]],
88
+ diff_chain: "DataChain",
76
89
  ) -> Optional["DataChain"]:
77
90
  """Get retry chain for processing error records and missing records."""
78
91
  # Import here to avoid circular import
@@ -81,35 +94,49 @@ def _get_retry_chain(
81
94
  retry_chain = None
82
95
 
83
96
  # Read the latest version of the result dataset for retry logic
84
- result_dataset = datachain.read_dataset(name, version=latest_version)
85
- source_dc_latest = datachain.read_dataset(
86
- source_ds_name, version=source_ds_latest_version
97
+ result_dataset = datachain.read_dataset(
98
+ name,
99
+ namespace=project.namespace.name,
100
+ project=project.name,
101
+ version=latest_version,
102
+ )
103
+ source_dc = datachain.read_dataset(
104
+ source_ds_name,
105
+ namespace=source_ds_project.namespace.name,
106
+ project=source_ds_project.name,
107
+ version=source_ds_version,
87
108
  )
88
109
 
89
110
  # Handle error records if delta_retry is a string (column name)
90
111
  if isinstance(delta_retry, str):
91
112
  error_records = result_dataset.filter(C(delta_retry) != "")
92
- error_source_records = source_dc_latest.merge(
113
+ error_source_records = source_dc.merge(
93
114
  error_records, on=on, right_on=right_on, inner=True
94
- ).select(*list(source_dc_latest.signals_schema.values))
115
+ ).select(*list(source_dc.signals_schema.values))
95
116
  retry_chain = error_source_records
96
117
 
97
118
  # Handle missing records if delta_retry is True
98
119
  elif delta_retry is True:
99
- missing_records = source_dc_latest.subtract(
100
- result_dataset, on=on, right_on=right_on
101
- )
120
+ missing_records = source_dc.subtract(result_dataset, on=on, right_on=right_on)
102
121
  retry_chain = missing_records
103
122
 
104
- return retry_chain
123
+ # Subtract also diff chain since some items might be picked
124
+ # up by `delta=True` itself (e.g. records got modified AND are missing in the
125
+ # result dataset atm)
126
+ return retry_chain.subtract(diff_chain, on=on) if retry_chain else None
105
127
 
106
128
 
107
129
  def _get_source_info(
108
130
  name: str,
131
+ project: Project,
109
132
  latest_version: str,
110
133
  catalog,
111
134
  ) -> tuple[
112
- Optional[str], Optional[str], Optional[str], Optional[list[DatasetDependency]]
135
+ Optional[str],
136
+ Optional[Project],
137
+ Optional[str],
138
+ Optional[str],
139
+ Optional[list[DatasetDependency]],
113
140
  ]:
114
141
  """Get source dataset information and dependencies.
115
142
 
@@ -118,23 +145,34 @@ def _get_source_info(
118
145
  Returns (None, None, None, None, None) if source dataset was removed.
119
146
  """
120
147
  dependencies = catalog.get_dataset_dependencies(
121
- name, latest_version, indirect=False
148
+ name, latest_version, project=project, indirect=False
122
149
  )
123
150
 
124
151
  dep = dependencies[0]
125
152
  if not dep:
126
153
  # Starting dataset was removed, back off to normal dataset creation
127
- return None, None, None, None
154
+ return None, None, None, None, None
128
155
 
156
+ source_ds_project = catalog.metastore.get_project(dep.project, dep.namespace)
129
157
  source_ds_name = dep.name
130
158
  source_ds_version = dep.version
131
- source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
132
-
133
- return source_ds_name, source_ds_version, source_ds_latest_version, dependencies
159
+ source_ds_latest_version = catalog.get_dataset(
160
+ source_ds_name, project=source_ds_project
161
+ ).latest_version
162
+
163
+ return (
164
+ source_ds_name,
165
+ source_ds_project,
166
+ source_ds_version,
167
+ source_ds_latest_version,
168
+ dependencies,
169
+ )
134
170
 
135
171
 
136
172
  def delta_retry_update(
137
173
  dc: "DataChain",
174
+ namespace_name: str,
175
+ project_name: str,
138
176
  name: str,
139
177
  on: Union[str, Sequence[str]],
140
178
  right_on: Optional[Union[str, Sequence[str]]] = None,
@@ -173,11 +211,12 @@ def delta_retry_update(
173
211
  """
174
212
 
175
213
  catalog = dc.session.catalog
214
+ project = catalog.metastore.get_project(project_name, namespace_name)
176
215
  dc._query.apply_listing_pre_step()
177
216
 
178
217
  # Check if dataset exists
179
218
  try:
180
- dataset = catalog.get_dataset(name)
219
+ dataset = catalog.get_dataset(name, project=project)
181
220
  latest_version = dataset.latest_version
182
221
  except DatasetNotFoundError:
183
222
  # First creation of result dataset
@@ -189,19 +228,29 @@ def delta_retry_update(
189
228
  retry_chain = None
190
229
  processing_chain = None
191
230
 
192
- source_ds_name, source_ds_version, source_ds_latest_version, dependencies = (
193
- _get_source_info(name, latest_version, catalog)
194
- )
231
+ (
232
+ source_ds_name,
233
+ source_ds_project,
234
+ source_ds_version,
235
+ source_ds_latest_version,
236
+ dependencies,
237
+ ) = _get_source_info(name, project, latest_version, catalog)
195
238
 
196
239
  # If source_ds_name is None, starting dataset was removed
197
240
  if source_ds_name is None:
198
241
  return None, None, True
199
242
 
243
+ assert source_ds_project
200
244
  assert source_ds_version
201
245
  assert source_ds_latest_version
202
246
 
203
247
  diff_chain = _get_delta_chain(
204
- source_ds_name, source_ds_version, source_ds_latest_version, on, compare
248
+ source_ds_name,
249
+ source_ds_project,
250
+ source_ds_version,
251
+ source_ds_latest_version,
252
+ on,
253
+ compare,
205
254
  )
206
255
 
207
256
  # Filter out removed dep
@@ -215,12 +264,15 @@ def delta_retry_update(
215
264
  if delta_retry:
216
265
  retry_chain = _get_retry_chain(
217
266
  name,
267
+ project,
218
268
  latest_version,
219
269
  source_ds_name,
220
- source_ds_latest_version,
270
+ source_ds_project,
271
+ source_ds_version,
221
272
  on,
222
273
  right_on,
223
274
  delta_retry,
275
+ diff_chain,
224
276
  )
225
277
 
226
278
  # Combine delta and retry chains
@@ -236,7 +288,12 @@ def delta_retry_update(
236
288
  if processing_chain is None or (processing_chain and processing_chain.empty):
237
289
  return None, None, False
238
290
 
239
- latest_dataset = datachain.read_dataset(name, version=latest_version)
291
+ latest_dataset = datachain.read_dataset(
292
+ name,
293
+ namespace=project.namespace.name,
294
+ project=project.name,
295
+ version=latest_version,
296
+ )
240
297
  compared_chain = latest_dataset.diff(
241
298
  processing_chain,
242
299
  on=right_on or on,
@@ -598,6 +598,8 @@ class DataChain:
598
598
 
599
599
  result_ds, dependencies, has_changes = delta_retry_update(
600
600
  self,
601
+ namespace_name,
602
+ project_name,
601
603
  name,
602
604
  on=self._delta_on,
603
605
  right_on=self._delta_result_on,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.24.0
3
+ Version: 0.24.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -14,15 +14,26 @@ from datachain.lib.file import File, ImageFile
14
14
  def _get_dependencies(catalog, name, version) -> list[tuple[str, str]]:
15
15
  return sorted(
16
16
  [
17
- (d.name, d.version)
17
+ (f"{d.namespace}.{d.project}.{d.name}", d.version)
18
18
  for d in catalog.get_dataset_dependencies(name, version, indirect=False)
19
19
  ]
20
20
  )
21
21
 
22
22
 
23
- def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path):
23
+ @pytest.mark.parametrize("project", ("global.dev", ""))
24
+ def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path, project):
24
25
  catalog = test_session.catalog
25
- starting_ds_name = "starting_ds"
26
+ default_namespace_name = catalog.metastore.default_namespace_name
27
+ default_project_name = catalog.metastore.default_project_name
28
+
29
+ if project:
30
+ starting_ds_name = f"{project}.starting_ds"
31
+ dependency_ds_name = starting_ds_name
32
+ else:
33
+ starting_ds_name = "starting_ds"
34
+ dependency_ds_name = (
35
+ f"{default_namespace_name}.{default_project_name}.{starting_ds_name}"
36
+ )
26
37
  ds_name = "delta_ds"
27
38
 
28
39
  images = [
@@ -55,12 +66,16 @@ def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path):
55
66
  create_image_dataset(starting_ds_name, images[:2])
56
67
  # first version of delta dataset
57
68
  create_delta_dataset(ds_name)
58
- assert _get_dependencies(catalog, ds_name, "1.0.0") == [(starting_ds_name, "1.0.0")]
69
+ assert _get_dependencies(catalog, ds_name, "1.0.0") == [
70
+ (dependency_ds_name, "1.0.0")
71
+ ]
59
72
  # second version of starting dataset
60
73
  create_image_dataset(starting_ds_name, images[2:])
61
74
  # second version of delta dataset
62
75
  create_delta_dataset(ds_name)
63
- assert _get_dependencies(catalog, ds_name, "1.0.1") == [(starting_ds_name, "1.0.1")]
76
+ assert _get_dependencies(catalog, ds_name, "1.0.1") == [
77
+ (dependency_ds_name, "1.0.1")
78
+ ]
64
79
 
65
80
  assert (dc.read_dataset(ds_name, version="1.0.0").order_by("file.path")).to_values(
66
81
  "file.path"
@@ -30,6 +30,23 @@ def _process_with_errors(id: int, content: str, attempt: int) -> ProcessingResul
30
30
  )
31
31
 
32
32
 
33
+ def _create_sample_data(test_session, ids=None, contents=None):
34
+ """Helper function to create sample data for retry tests."""
35
+ ids = ids or [1, 2, 3, 4]
36
+ contents = contents or ["first item", "second item", "third item", "fourth item"]
37
+ dc.read_values(id=ids, content=contents, session=test_session).save("sample_data")
38
+
39
+
40
+ def _simple_process(id: int, content: str, attempt: int = 1) -> ProcessingResult:
41
+ """Helper function for simple processing in retry tests."""
42
+ return ProcessingResult(
43
+ processed_content=content.upper(),
44
+ processed_at=datetime.now(tz=timezone.utc).isoformat(),
45
+ error="",
46
+ attempt=attempt,
47
+ )
48
+
49
+
33
50
  def test_retry_with_error_records(test_session):
34
51
  """Test retry functionality with records that have errors."""
35
52
 
@@ -48,13 +65,7 @@ def test_retry_with_error_records(test_session):
48
65
  )
49
66
 
50
67
  # First processing pass - some records will fail
51
- sample_ids = [1, 2, 3, 4]
52
- sample_contents = ["first item", "second item", "third item", "fourth item"]
53
-
54
- dc.read_values(id=sample_ids, content=sample_contents, session=test_session).save(
55
- "sample_data"
56
- )
57
-
68
+ _create_sample_data(test_session)
58
69
  first_pass = _run_processing(1)
59
70
 
60
71
  # Check that some records failed
@@ -74,72 +85,103 @@ def test_retry_with_error_records(test_session):
74
85
 
75
86
  def test_retry_with_missing_records(test_session):
76
87
  """Test retry functionality with missing records."""
77
- # Create source dataset
78
- source_ids = [1, 2, 3]
79
- source_contents = ["first", "second", "third"]
88
+ _create_sample_data(test_session)
80
89
 
81
- dc.read_values(id=source_ids, content=source_contents, session=test_session).save(
82
- "source_data"
90
+ # Process only first 2 records
91
+ # Create partial result dataset (missing id=3 and id=4)
92
+ partial_result = (
93
+ dc.read_dataset("sample_data", session=test_session)
94
+ .setup(attempt=lambda: 1)
95
+ .filter(C("id") < 3)
96
+ .map(result=_simple_process)
97
+ .save("partial_result")
83
98
  )
84
99
 
85
- def simple_process(id: int, content: str, attempt: int) -> ProcessingResult:
86
- return ProcessingResult(
87
- processed_content=content.upper(),
88
- processed_at=datetime.now(tz=timezone.utc).isoformat(),
89
- error="",
90
- attempt=attempt,
100
+ assert partial_result.count() == 2
101
+
102
+ # Use retry with delta_retry=True to process missing records
103
+ retry_chain = (
104
+ dc.read_dataset(
105
+ "sample_data",
106
+ session=test_session,
107
+ delta=True,
108
+ delta_on="id",
109
+ delta_retry=True,
91
110
  )
111
+ .setup(attempt=lambda: 2)
112
+ .map(result=_simple_process)
113
+ .save("partial_result")
114
+ )
115
+
116
+ # Should now have all 4 records
117
+ assert retry_chain.count() == 4
118
+
119
+ # Verify all records are present
120
+ ids = set(retry_chain.to_values("id"))
121
+ assert ids == {1, 2, 3, 4}
122
+
123
+ final_first_attempts_count = retry_chain.filter(C("result.attempt") == 1).count()
124
+ final_missing_attempts_count = retry_chain.filter(C("result.attempt") == 2).count()
125
+
126
+ # Only missing records should have attempt 2
127
+ assert final_missing_attempts_count == 2
128
+ assert final_first_attempts_count == 2
129
+
130
+
131
+ def test_retry_with_missing_and_new_records(test_session):
132
+ """Test retry functionality with missing records (e.g. ignored
133
+ in first pass since they failed). Also we add new records to the source
134
+ to test that retry and delta don't pick records twice."""
135
+ _create_sample_data(test_session)
92
136
 
93
137
  # Process only first 2 records
94
138
  # Create partial result dataset (missing id=3 and id=4)
95
139
  partial_result = (
96
- dc.read_dataset("source_data", session=test_session)
140
+ dc.read_dataset("sample_data", session=test_session)
97
141
  .setup(attempt=lambda: 1)
98
142
  .filter(C("id") < 3)
99
- .map(result=simple_process)
143
+ .map(result=_simple_process)
100
144
  .save("partial_result")
101
145
  )
102
146
 
103
147
  assert partial_result.count() == 2
104
148
 
149
+ ids = [1, 2, 3, 4, 5]
150
+ contents = ["first item", "second item", "third item", "fourth item", "fifth item"]
151
+ _create_sample_data(test_session, ids, contents)
152
+
105
153
  # Use retry with delta_retry=True to process missing records
106
154
  retry_chain = (
107
155
  dc.read_dataset(
108
- "source_data",
156
+ "sample_data",
109
157
  session=test_session,
110
158
  delta=True,
111
159
  delta_on="id",
112
160
  delta_retry=True,
113
161
  )
114
162
  .setup(attempt=lambda: 2)
115
- .map(result=simple_process)
163
+ .map(result=_simple_process)
116
164
  .save("partial_result")
117
165
  )
118
166
 
119
167
  # Should now have all 5 records
120
- assert retry_chain.count() == 3
168
+ assert retry_chain.count() == 5
121
169
 
122
170
  # Verify all records are present
123
171
  ids = set(retry_chain.to_values("id"))
124
- assert ids == {1, 2, 3}
172
+ assert ids == {1, 2, 3, 4, 5}
125
173
 
126
174
  final_first_attempts_count = retry_chain.filter(C("result.attempt") == 1).count()
127
175
  final_missing_attempts_count = retry_chain.filter(C("result.attempt") == 2).count()
128
176
 
129
177
  # Only missing and newly added records should have attempt 2
130
- assert final_missing_attempts_count == 1
178
+ assert final_missing_attempts_count == 3
131
179
  assert final_first_attempts_count == 2
132
180
 
133
181
 
134
182
  def test_retry_no_records_to_retry(test_session):
135
183
  """Test retry when no records need to be retried."""
136
- # Create dataset with all successful records
137
- source_ids = [1, 2]
138
- source_contents = ["first", "second"]
139
-
140
- dc.read_values(id=source_ids, content=source_contents, session=test_session).save(
141
- "source_data"
142
- )
184
+ _create_sample_data(test_session, ids=[1, 2], contents=["first", "second"])
143
185
 
144
186
  def successful_process(id: int, content: str) -> ProcessingResult:
145
187
  return ProcessingResult(
@@ -151,7 +193,7 @@ def test_retry_no_records_to_retry(test_session):
151
193
 
152
194
  # First pass - all succeed
153
195
  first_pass = (
154
- dc.read_dataset("source_data", session=test_session)
196
+ dc.read_dataset("sample_data", session=test_session)
155
197
  .map(result=successful_process)
156
198
  .save("successful_data")
157
199
  )
@@ -162,7 +204,7 @@ def test_retry_no_records_to_retry(test_session):
162
204
  # Retry - should not create a new version since no records need retry
163
205
  (
164
206
  dc.read_dataset(
165
- "source_data",
207
+ "sample_data",
166
208
  session=test_session,
167
209
  delta=True,
168
210
  delta_on="id",
@@ -179,32 +221,20 @@ def test_retry_no_records_to_retry(test_session):
179
221
 
180
222
  def test_retry_first_dataset_creation(test_session):
181
223
  """Test retry when dataset doesn't exist yet (first creation)."""
182
- source_ids = [1, 2]
183
- source_contents = ["first", "second"]
184
-
185
- dc.read_values(id=source_ids, content=source_contents, session=test_session).save(
186
- "source_data"
187
- )
188
-
189
- def simple_process(id: int, content: str) -> ProcessingResult:
190
- return ProcessingResult(
191
- processed_content=content.upper(),
192
- processed_at=datetime.now(tz=timezone.utc).isoformat(),
193
- error="",
194
- attempt=1,
195
- )
224
+ _create_sample_data(test_session, ids=[1, 2], contents=["first", "second"])
196
225
 
197
226
  # First run with retry enabled on non-existent dataset
198
227
  # Should process all records
199
228
  retry_chain = (
200
229
  dc.read_dataset(
201
- "source_data",
230
+ "sample_data",
202
231
  session=test_session,
203
232
  delta=True,
204
233
  delta_on="id",
205
234
  delta_retry="result.error",
206
235
  )
207
- .map(result=simple_process)
236
+ .setup(attempt=lambda: 1)
237
+ .map(result=_simple_process)
208
238
  .save("new_dataset")
209
239
  )
210
240
 
@@ -311,3 +341,87 @@ def test_retry_with_delta_functionality(test_session):
311
341
  (2, "", 1),
312
342
  (3, "", 2),
313
343
  }
344
+
345
+
346
+ def test_delta_and_delta_retry_no_duplicates(test_session):
347
+ """Test that delta and delta_retry work together without creating duplicates
348
+ when the same records are picked up for different reasons:
349
+ - delta_retry=True picks up unprocessed records missing from result dataset
350
+ - delta=True picks up modified records from source dataset
351
+ """
352
+ _create_sample_data(test_session)
353
+
354
+ # First pass - process only records 1 and 2
355
+ partial_result = (
356
+ dc.read_dataset("sample_data", session=test_session)
357
+ .setup(attempt=lambda: 1)
358
+ .filter(C("id") < 3) # Only process id=1,2, leaving id=3,4 unprocessed
359
+ .map(result=_simple_process)
360
+ .save("delta_retry_combined_result")
361
+ )
362
+
363
+ assert partial_result.count() == 2
364
+ initial_results = set(partial_result.to_iter("id", "result.attempt"))
365
+ assert initial_results == {(1, 1), (2, 1)}
366
+
367
+ # Modify the source data - update content for records 3 and 4
368
+ # This will make delta=True pick them up as "changed"
369
+ # But delta_retry=True will also pick them up as "missing from result"
370
+ modified_ids = [1, 2, 3, 4]
371
+ modified_contents = [
372
+ "first item", # unchanged
373
+ "second item", # unchanged
374
+ "MODIFIED third item", # modified - delta will pick this up
375
+ "MODIFIED fourth item", # modified - delta will pick this up
376
+ ]
377
+ _create_sample_data(test_session, modified_ids, modified_contents)
378
+
379
+ # Second pass with both delta=True and delta_retry=True
380
+ # Records 3,4 should be picked up by BOTH:
381
+ # - delta_retry=True (because they're missing from result dataset)
382
+ # - delta=True (because their content was modified in source)
383
+ # But they should only be processed ONCE (no duplicates)
384
+ combined_result = (
385
+ dc.read_dataset(
386
+ "sample_data",
387
+ session=test_session,
388
+ delta=True,
389
+ delta_on="id",
390
+ delta_retry=True,
391
+ )
392
+ .setup(attempt=lambda: 2)
393
+ .map(result=_simple_process)
394
+ .save("delta_retry_combined_result")
395
+ )
396
+
397
+ # Should have 4 total records: 2 from first pass + 2 newly processed
398
+ assert combined_result.count() == 4
399
+
400
+ # Get all results and verify no duplicates
401
+ all_results = set(
402
+ combined_result.to_iter("id", "result.attempt", "result.processed_content")
403
+ )
404
+
405
+ # Records 1,2 should have attempt=1 (from first pass)
406
+ # Records 3,4 should have attempt=2 (from second pass) and MODIFIED content
407
+ expected_results = {
408
+ (1, 1, "FIRST ITEM"),
409
+ (2, 1, "SECOND ITEM"),
410
+ (3, 2, "MODIFIED THIRD ITEM"),
411
+ (4, 2, "MODIFIED FOURTH ITEM"),
412
+ }
413
+
414
+ assert all_results == expected_results
415
+
416
+ # Verify counts by attempt
417
+ first_attempt_count = combined_result.filter(C("result.attempt") == 1).count()
418
+ second_attempt_count = combined_result.filter(C("result.attempt") == 2).count()
419
+
420
+ assert first_attempt_count == 2 # Records 1,2 from first pass
421
+ assert second_attempt_count == 2 # Records 3,4 from second pass (no duplicates)
422
+
423
+ # Verify that each id appears exactly once
424
+ ids_in_result = list(combined_result.to_values("id"))
425
+ assert len(ids_in_result) == 4
426
+ assert len(set(ids_in_result)) == 4 # No duplicate IDs
427
+ assert set(ids_in_result) == {1, 2, 3, 4}
File without changes
File without changes
File without changes
File without changes
File without changes