datachain 0.17.1__tar.gz → 0.18.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (371)
  1. {datachain-0.17.1 → datachain-0.18.0}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.17.1/src/datachain.egg-info → datachain-0.18.0}/PKG-INFO +2 -2
  3. {datachain-0.17.1 → datachain-0.18.0}/docs/commands/job/run.md +6 -0
  4. {datachain-0.17.1 → datachain-0.18.0}/pyproject.toml +1 -1
  5. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/catalog/catalog.py +6 -0
  6. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/parser/job.py +7 -0
  7. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/warehouse.py +1 -1
  8. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/dataset.py +7 -10
  9. datachain-0.18.0/src/datachain/delta.py +119 -0
  10. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/diff/__init__.py +10 -4
  11. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/datachain.py +89 -2
  12. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/datasets.py +41 -1
  13. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/storage.py +45 -11
  14. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/signal_schema.py +12 -6
  15. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/dataset.py +27 -10
  16. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/remote/studio.py +2 -0
  17. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/studio.py +3 -0
  18. {datachain-0.17.1 → datachain-0.18.0/src/datachain.egg-info}/PKG-INFO +2 -2
  19. {datachain-0.17.1 → datachain-0.18.0}/src/datachain.egg-info/SOURCES.txt +2 -0
  20. {datachain-0.17.1 → datachain-0.18.0}/src/datachain.egg-info/requires.txt +1 -1
  21. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_datachain.py +2 -4
  22. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_dataset_query.py +18 -4
  23. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_datasets.py +2 -1
  24. datachain-0.18.0/tests/func/test_delta.py +383 -0
  25. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_warehouse.py +2 -2
  26. {datachain-0.17.1 → datachain-0.18.0}/tests/test_cli_studio.py +1 -0
  27. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_datachain.py +47 -9
  28. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_signal_schema.py +11 -11
  29. {datachain-0.17.1 → datachain-0.18.0}/.cruft.json +0 -0
  30. {datachain-0.17.1 → datachain-0.18.0}/.gitattributes +0 -0
  31. {datachain-0.17.1 → datachain-0.18.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  32. {datachain-0.17.1 → datachain-0.18.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  33. {datachain-0.17.1 → datachain-0.18.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  34. {datachain-0.17.1 → datachain-0.18.0}/.github/codecov.yaml +0 -0
  35. {datachain-0.17.1 → datachain-0.18.0}/.github/dependabot.yml +0 -0
  36. {datachain-0.17.1 → datachain-0.18.0}/.github/workflows/benchmarks.yml +0 -0
  37. {datachain-0.17.1 → datachain-0.18.0}/.github/workflows/release.yml +0 -0
  38. {datachain-0.17.1 → datachain-0.18.0}/.github/workflows/tests-studio.yml +0 -0
  39. {datachain-0.17.1 → datachain-0.18.0}/.github/workflows/tests.yml +0 -0
  40. {datachain-0.17.1 → datachain-0.18.0}/.github/workflows/update-template.yaml +0 -0
  41. {datachain-0.17.1 → datachain-0.18.0}/.gitignore +0 -0
  42. {datachain-0.17.1 → datachain-0.18.0}/CODE_OF_CONDUCT.rst +0 -0
  43. {datachain-0.17.1 → datachain-0.18.0}/LICENSE +0 -0
  44. {datachain-0.17.1 → datachain-0.18.0}/README.rst +0 -0
  45. {datachain-0.17.1 → datachain-0.18.0}/docs/assets/captioned_cartoons.png +0 -0
  46. {datachain-0.17.1 → datachain-0.18.0}/docs/assets/datachain-white.svg +0 -0
  47. {datachain-0.17.1 → datachain-0.18.0}/docs/assets/datachain.svg +0 -0
  48. {datachain-0.17.1 → datachain-0.18.0}/docs/commands/auth/login.md +0 -0
  49. {datachain-0.17.1 → datachain-0.18.0}/docs/commands/auth/logout.md +0 -0
  50. {datachain-0.17.1 → datachain-0.18.0}/docs/commands/auth/team.md +0 -0
  51. {datachain-0.17.1 → datachain-0.18.0}/docs/commands/auth/token.md +0 -0
  52. {datachain-0.17.1 → datachain-0.18.0}/docs/commands/index.md +0 -0
  53. {datachain-0.17.1 → datachain-0.18.0}/docs/commands/job/cancel.md +0 -0
  54. {datachain-0.17.1 → datachain-0.18.0}/docs/commands/job/logs.md +0 -0
  55. {datachain-0.17.1 → datachain-0.18.0}/docs/commands/job/ls.md +0 -0
  56. {datachain-0.17.1 → datachain-0.18.0}/docs/contributing.md +0 -0
  57. {datachain-0.17.1 → datachain-0.18.0}/docs/css/github-permalink-style.css +0 -0
  58. {datachain-0.17.1 → datachain-0.18.0}/docs/examples.md +0 -0
  59. {datachain-0.17.1 → datachain-0.18.0}/docs/index.md +0 -0
  60. {datachain-0.17.1 → datachain-0.18.0}/docs/overrides/main.html +0 -0
  61. {datachain-0.17.1 → datachain-0.18.0}/docs/quick-start.md +0 -0
  62. {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/arrowrow.md +0 -0
  63. {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/bbox.md +0 -0
  64. {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/file.md +0 -0
  65. {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/imagefile.md +0 -0
  66. {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/index.md +0 -0
  67. {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/pose.md +0 -0
  68. {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/segment.md +0 -0
  69. {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/tarvfile.md +0 -0
  70. {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/textfile.md +0 -0
  71. {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/videofile.md +0 -0
  72. {datachain-0.17.1 → datachain-0.18.0}/docs/references/datachain.md +0 -0
  73. {datachain-0.17.1 → datachain-0.18.0}/docs/references/func.md +0 -0
  74. {datachain-0.17.1 → datachain-0.18.0}/docs/references/index.md +0 -0
  75. {datachain-0.17.1 → datachain-0.18.0}/docs/references/remotes.md +0 -0
  76. {datachain-0.17.1 → datachain-0.18.0}/docs/references/toolkit.md +0 -0
  77. {datachain-0.17.1 → datachain-0.18.0}/docs/references/torch.md +0 -0
  78. {datachain-0.17.1 → datachain-0.18.0}/docs/references/udf.md +0 -0
  79. {datachain-0.17.1 → datachain-0.18.0}/docs/tutorials.md +0 -0
  80. {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  81. {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  82. {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/openimage-detect.py +0 -0
  83. {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  84. {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  85. {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  86. {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/common_sql_functions.py +0 -0
  87. {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/json-csv-reader.py +0 -0
  88. {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/torch-loader.py +0 -0
  89. {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/udfs/parallel.py +0 -0
  90. {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/udfs/simple.py +0 -0
  91. {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/udfs/stateful.py +0 -0
  92. {datachain-0.17.1 → datachain-0.18.0}/examples/llm_and_nlp/claude-query.py +0 -0
  93. {datachain-0.17.1 → datachain-0.18.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  94. {datachain-0.17.1 → datachain-0.18.0}/examples/multimodal/clip_inference.py +0 -0
  95. {datachain-0.17.1 → datachain-0.18.0}/examples/multimodal/hf_pipeline.py +0 -0
  96. {datachain-0.17.1 → datachain-0.18.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  97. {datachain-0.17.1 → datachain-0.18.0}/examples/multimodal/wds.py +0 -0
  98. {datachain-0.17.1 → datachain-0.18.0}/examples/multimodal/wds_filtered.py +0 -0
  99. {datachain-0.17.1 → datachain-0.18.0}/mkdocs.yml +0 -0
  100. {datachain-0.17.1 → datachain-0.18.0}/noxfile.py +0 -0
  101. {datachain-0.17.1 → datachain-0.18.0}/setup.cfg +0 -0
  102. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/__init__.py +0 -0
  103. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/__main__.py +0 -0
  104. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/asyn.py +0 -0
  105. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cache.py +0 -0
  106. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/catalog/__init__.py +0 -0
  107. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/catalog/datasource.py +0 -0
  108. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/catalog/loader.py +0 -0
  109. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/__init__.py +0 -0
  110. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/__init__.py +0 -0
  111. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/datasets.py +0 -0
  112. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/du.py +0 -0
  113. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/index.py +0 -0
  114. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/ls.py +0 -0
  115. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/misc.py +0 -0
  116. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/query.py +0 -0
  117. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/show.py +0 -0
  118. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/parser/__init__.py +0 -0
  119. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/parser/studio.py +0 -0
  120. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/parser/utils.py +0 -0
  121. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/utils.py +0 -0
  122. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/__init__.py +0 -0
  123. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/azure.py +0 -0
  124. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/fileslice.py +0 -0
  125. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/fsspec.py +0 -0
  126. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/gcs.py +0 -0
  127. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/hf.py +0 -0
  128. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/local.py +0 -0
  129. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/s3.py +0 -0
  130. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/config.py +0 -0
  131. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/__init__.py +0 -0
  132. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/db_engine.py +0 -0
  133. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/job.py +0 -0
  134. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/metastore.py +0 -0
  135. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/schema.py +0 -0
  136. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/serializer.py +0 -0
  137. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/sqlite.py +0 -0
  138. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/error.py +0 -0
  139. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/fs/__init__.py +0 -0
  140. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/fs/reference.py +0 -0
  141. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/fs/utils.py +0 -0
  142. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/__init__.py +0 -0
  143. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/aggregate.py +0 -0
  144. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/array.py +0 -0
  145. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/base.py +0 -0
  146. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/conditional.py +0 -0
  147. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/func.py +0 -0
  148. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/numeric.py +0 -0
  149. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/path.py +0 -0
  150. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/random.py +0 -0
  151. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/string.py +0 -0
  152. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/window.py +0 -0
  153. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/job.py +0 -0
  154. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/__init__.py +0 -0
  155. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/arrow.py +0 -0
  156. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/clip.py +0 -0
  157. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/__init__.py +0 -0
  158. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/flatten.py +0 -0
  159. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  160. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  161. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/unflatten.py +0 -0
  162. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  163. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/data_model.py +0 -0
  164. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dataset_info.py +0 -0
  165. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/__init__.py +0 -0
  166. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/csv.py +0 -0
  167. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/database.py +0 -0
  168. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/hf.py +0 -0
  169. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/json.py +0 -0
  170. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/listings.py +0 -0
  171. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/pandas.py +0 -0
  172. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/parquet.py +0 -0
  173. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/records.py +0 -0
  174. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/utils.py +0 -0
  175. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/values.py +0 -0
  176. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/file.py +0 -0
  177. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/hf.py +0 -0
  178. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/image.py +0 -0
  179. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/listing.py +0 -0
  180. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/listing_info.py +0 -0
  181. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/meta_formats.py +0 -0
  182. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/model_store.py +0 -0
  183. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/pytorch.py +0 -0
  184. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/settings.py +0 -0
  185. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/tar.py +0 -0
  186. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/text.py +0 -0
  187. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/udf.py +0 -0
  188. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/udf_signature.py +0 -0
  189. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/utils.py +0 -0
  190. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/video.py +0 -0
  191. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/webdataset.py +0 -0
  192. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/webdataset_laion.py +0 -0
  193. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/listing.py +0 -0
  194. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/__init__.py +0 -0
  195. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/bbox.py +0 -0
  196. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/pose.py +0 -0
  197. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/segment.py +0 -0
  198. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  199. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  200. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/ultralytics/pose.py +0 -0
  201. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/ultralytics/segment.py +0 -0
  202. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/utils.py +0 -0
  203. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/node.py +0 -0
  204. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/nodes_fetcher.py +0 -0
  205. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/nodes_thread_pool.py +0 -0
  206. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/progress.py +0 -0
  207. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/py.typed +0 -0
  208. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/__init__.py +0 -0
  209. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/batch.py +0 -0
  210. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/dispatch.py +0 -0
  211. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/metrics.py +0 -0
  212. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/params.py +0 -0
  213. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/queue.py +0 -0
  214. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/schema.py +0 -0
  215. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/session.py +0 -0
  216. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/udf.py +0 -0
  217. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/utils.py +0 -0
  218. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/remote/__init__.py +0 -0
  219. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/script_meta.py +0 -0
  220. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/semver.py +0 -0
  221. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/__init__.py +0 -0
  222. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/default/__init__.py +0 -0
  223. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/default/base.py +0 -0
  224. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/__init__.py +0 -0
  225. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/aggregate.py +0 -0
  226. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/array.py +0 -0
  227. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/conditional.py +0 -0
  228. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/numeric.py +0 -0
  229. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/path.py +0 -0
  230. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/random.py +0 -0
  231. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/string.py +0 -0
  232. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/selectable.py +0 -0
  233. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  234. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/sqlite/base.py +0 -0
  235. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/sqlite/types.py +0 -0
  236. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/sqlite/vector.py +0 -0
  237. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/types.py +0 -0
  238. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/utils.py +0 -0
  239. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/telemetry.py +0 -0
  240. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/toolkit/__init__.py +0 -0
  241. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/toolkit/split.py +0 -0
  242. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/torch/__init__.py +0 -0
  243. {datachain-0.17.1 → datachain-0.18.0}/src/datachain/utils.py +0 -0
  244. {datachain-0.17.1 → datachain-0.18.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  245. {datachain-0.17.1 → datachain-0.18.0}/src/datachain.egg-info/entry_points.txt +0 -0
  246. {datachain-0.17.1 → datachain-0.18.0}/src/datachain.egg-info/top_level.txt +0 -0
  247. {datachain-0.17.1 → datachain-0.18.0}/tests/__init__.py +0 -0
  248. {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/__init__.py +0 -0
  249. {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/conftest.py +0 -0
  250. {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  251. {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  252. {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/datasets/.gitignore +0 -0
  253. {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  254. {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/test_datachain.py +0 -0
  255. {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/test_ls.py +0 -0
  256. {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/test_version.py +0 -0
  257. {datachain-0.17.1 → datachain-0.18.0}/tests/conftest.py +0 -0
  258. {datachain-0.17.1 → datachain-0.18.0}/tests/data.py +0 -0
  259. {datachain-0.17.1 → datachain-0.18.0}/tests/examples/__init__.py +0 -0
  260. {datachain-0.17.1 → datachain-0.18.0}/tests/examples/test_examples.py +0 -0
  261. {datachain-0.17.1 → datachain-0.18.0}/tests/examples/test_wds_e2e.py +0 -0
  262. {datachain-0.17.1 → datachain-0.18.0}/tests/examples/wds_data.py +0 -0
  263. {datachain-0.17.1 → datachain-0.18.0}/tests/func/__init__.py +0 -0
  264. {datachain-0.17.1 → datachain-0.18.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  265. {datachain-0.17.1 → datachain-0.18.0}/tests/func/data/lena.jpg +0 -0
  266. {datachain-0.17.1 → datachain-0.18.0}/tests/func/fake-service-account-credentials.json +0 -0
  267. {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/__init__.py +0 -0
  268. {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/data/running-mask0.png +0 -0
  269. {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/data/running-mask1.png +0 -0
  270. {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/data/running.jpg +0 -0
  271. {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/data/ships.jpg +0 -0
  272. {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/test_yolo.py +0 -0
  273. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_batching.py +0 -0
  274. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_catalog.py +0 -0
  275. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_client.py +0 -0
  276. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_cloud_transfer.py +0 -0
  277. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_data_storage.py +0 -0
  278. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_datachain_merge.py +0 -0
  279. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_feature_pickling.py +0 -0
  280. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_file.py +0 -0
  281. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_func.py +0 -0
  282. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_hf.py +0 -0
  283. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_hidden_field.py +0 -0
  284. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_image.py +0 -0
  285. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_listing.py +0 -0
  286. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_ls.py +0 -0
  287. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_meta_formats.py +0 -0
  288. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_metrics.py +0 -0
  289. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_pull.py +0 -0
  290. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_pytorch.py +0 -0
  291. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_query.py +0 -0
  292. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_read_database.py +0 -0
  293. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_session.py +0 -0
  294. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_toolkit.py +0 -0
  295. {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_video.py +0 -0
  296. {datachain-0.17.1 → datachain-0.18.0}/tests/scripts/feature_class.py +0 -0
  297. {datachain-0.17.1 → datachain-0.18.0}/tests/scripts/feature_class_exception.py +0 -0
  298. {datachain-0.17.1 → datachain-0.18.0}/tests/scripts/feature_class_parallel.py +0 -0
  299. {datachain-0.17.1 → datachain-0.18.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  300. {datachain-0.17.1 → datachain-0.18.0}/tests/scripts/name_len_slow.py +0 -0
  301. {datachain-0.17.1 → datachain-0.18.0}/tests/test_atomicity.py +0 -0
  302. {datachain-0.17.1 → datachain-0.18.0}/tests/test_cli_e2e.py +0 -0
  303. {datachain-0.17.1 → datachain-0.18.0}/tests/test_import_time.py +0 -0
  304. {datachain-0.17.1 → datachain-0.18.0}/tests/test_query_e2e.py +0 -0
  305. {datachain-0.17.1 → datachain-0.18.0}/tests/test_telemetry.py +0 -0
  306. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/__init__.py +0 -0
  307. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/__init__.py +0 -0
  308. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/conftest.py +0 -0
  309. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_arrow.py +0 -0
  310. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_clip.py +0 -0
  311. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  312. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  313. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_diff.py +0 -0
  314. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_feature.py +0 -0
  315. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_feature_utils.py +0 -0
  316. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_file.py +0 -0
  317. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_hf.py +0 -0
  318. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_image.py +0 -0
  319. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_listing_info.py +0 -0
  320. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  321. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_schema.py +0 -0
  322. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  323. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_text.py +0 -0
  324. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_udf.py +0 -0
  325. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_udf_signature.py +0 -0
  326. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_utils.py +0 -0
  327. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_webdataset.py +0 -0
  328. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/model/__init__.py +0 -0
  329. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/model/test_bbox.py +0 -0
  330. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/model/test_pose.py +0 -0
  331. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/model/test_segment.py +0 -0
  332. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/model/test_utils.py +0 -0
  333. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/__init__.py +0 -0
  334. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  335. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  336. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  337. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_array.py +0 -0
  338. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_conditional.py +0 -0
  339. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_path.py +0 -0
  340. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_random.py +0 -0
  341. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_selectable.py +0 -0
  342. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_string.py +0 -0
  343. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_asyn.py +0 -0
  344. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_cache.py +0 -0
  345. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_catalog.py +0 -0
  346. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_catalog_loader.py +0 -0
  347. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_cli_parsing.py +0 -0
  348. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_client.py +0 -0
  349. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_client_gcs.py +0 -0
  350. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_client_s3.py +0 -0
  351. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_config.py +0 -0
  352. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_data_storage.py +0 -0
  353. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_database_engine.py +0 -0
  354. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_dataset.py +0 -0
  355. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_dispatch.py +0 -0
  356. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_fileslice.py +0 -0
  357. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_func.py +0 -0
  358. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_listing.py +0 -0
  359. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_metastore.py +0 -0
  360. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_module_exports.py +0 -0
  361. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_pytorch.py +0 -0
  362. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_query.py +0 -0
  363. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_query_metrics.py +0 -0
  364. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_query_params.py +0 -0
  365. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_script_meta.py +0 -0
  366. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_semver.py +0 -0
  367. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_serializer.py +0 -0
  368. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_session.py +0 -0
  369. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_utils.py +0 -0
  370. {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_warehouse.py +0 -0
  371. {datachain-0.17.1 → datachain-0.18.0}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.11.8'
27
+ rev: 'v0.11.9'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.17.1
3
+ Version: 0.18.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -44,7 +44,7 @@ Requires-Dist: datamodel-code-generator>=0.25
44
44
  Requires-Dist: Pillow<12,>=10.0.0
45
45
  Requires-Dist: msgpack<2,>=1.0.4
46
46
  Requires-Dist: psutil
47
- Requires-Dist: huggingface_hub<0.31
47
+ Requires-Dist: huggingface_hub
48
48
  Requires-Dist: iterative-telemetry>=0.0.10
49
49
  Requires-Dist: platformdirs
50
50
  Requires-Dist: dvc-studio-client<1,>=0.21
@@ -29,6 +29,7 @@ This command runs a job in Studio using the specified query file. You can config
29
29
  * `--python-version PYTHON_VERSION` - Python version for the job (e.g., 3.9, 3.10, 3.11)
30
30
  * `--req-file REQ_FILE` - Python requirements file
31
31
  * `--req REQ` - Python package requirements
32
+ * `--priority PRIORITY` - Priority for the job in range 0-5. Lower value is higher priority (default: 5)
32
33
  * `-h`, `--help` - Show the help message and exit.
33
34
  * `-v`, `--verbose` - Be verbose.
34
35
  * `-q`, `--quiet` - Be quiet.
@@ -65,6 +66,11 @@ datachain job run --env API_KEY=123 --req pandas numpy query.py
65
66
  datachain job run --repository https://github.com/iterative/datachain query.py
66
67
  ```
67
68
 
69
+ 7. Run a job with higher priority
70
+ ```bash
71
+ datachain job run --priority 2 query.py
72
+ ```
73
+
68
74
  ## Notes
69
75
 
70
76
  * Closing the logs command (e.g., with Ctrl+C) will only stop displaying the logs but will not cancel the job execution
@@ -48,7 +48,7 @@ dependencies = [
48
48
  "Pillow>=10.0.0,<12",
49
49
  "msgpack>=1.0.4,<2",
50
50
  "psutil",
51
- "huggingface_hub<0.31", # fix for "Provider 'featherless-ai' not supported" error
51
+ "huggingface_hub",
52
52
  "iterative-telemetry>=0.0.10",
53
53
  "platformdirs",
54
54
  "dvc-studio-client>=0.21,<1",
@@ -779,6 +779,7 @@ class Catalog:
779
779
  uuid: Optional[str] = None,
780
780
  description: Optional[str] = None,
781
781
  attrs: Optional[list[str]] = None,
782
+ update_version: Optional[str] = "patch",
782
783
  ) -> "DatasetRecord":
783
784
  """
784
785
  Creates new dataset of a specific version.
@@ -795,6 +796,11 @@ class Catalog:
795
796
  try:
796
797
  dataset = self.get_dataset(name)
797
798
  default_version = dataset.next_version_patch
799
+ if update_version == "major":
800
+ default_version = dataset.next_version_major
801
+ if update_version == "minor":
802
+ default_version = dataset.next_version_minor
803
+
798
804
  if (description or attrs) and (
799
805
  dataset.description != description or dataset.attrs != attrs
800
806
  ):
@@ -82,6 +82,13 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
82
82
  nargs="+",
83
83
  help="Python package requirements",
84
84
  )
85
+ studio_run_parser.add_argument(
86
+ "--priority",
87
+ type=int,
88
+ default=5,
89
+ help="Priority for the job in range 0-5. "
90
+ "Lower value is higher priority (default: 5)",
91
+ )
85
92
 
86
93
  studio_ls_help = "List jobs in Studio"
87
94
  studio_ls_description = "List jobs in Studio."
@@ -258,7 +258,7 @@ class AbstractWarehouse(ABC, Serializable):
258
258
  if Client.is_data_source_uri(dataset_name):
259
259
  # for datasets that are created for bucket listing we use different prefix
260
260
  prefix = self.DATASET_SOURCE_TABLE_PREFIX
261
- return f"{prefix}{dataset_name}_{version}"
261
+ return f"{prefix}{dataset_name}_{version.replace('.', '_')}"
262
262
 
263
263
  def temp_table_name(self) -> str:
264
264
  return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
@@ -107,24 +107,21 @@ class DatasetDependency:
107
107
  dataset_version: Optional[str],
108
108
  dataset_version_created_at: Optional[datetime],
109
109
  ) -> Optional["DatasetDependency"]:
110
- from datachain.client import Client
111
- from datachain.lib.listing import is_listing_dataset, listing_uri_from_name
110
+ from datachain.lib.listing import is_listing_dataset
112
111
 
113
112
  if not dataset_id:
114
113
  return None
115
114
 
116
115
  assert dataset_name is not None
117
- dependency_type = DatasetDependencyType.DATASET
118
- dependency_name = dataset_name
119
-
120
- if is_listing_dataset(dataset_name):
121
- dependency_type = DatasetDependencyType.STORAGE # type: ignore[arg-type]
122
- dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))
123
116
 
124
117
  return cls(
125
118
  id,
126
- dependency_type,
127
- dependency_name,
119
+ (
120
+ DatasetDependencyType.STORAGE
121
+ if is_listing_dataset(dataset_name)
122
+ else DatasetDependencyType.DATASET
123
+ ),
124
+ dataset_name,
128
125
  (
129
126
  dataset_version # type: ignore[arg-type]
130
127
  if dataset_version
@@ -0,0 +1,119 @@
1
+ from collections.abc import Sequence
2
+ from copy import copy
3
+ from functools import wraps
4
+ from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
5
+
6
+ import datachain
7
+ from datachain.dataset import DatasetDependency
8
+ from datachain.error import DatasetNotFoundError
9
+
10
+ if TYPE_CHECKING:
11
+ from typing_extensions import Concatenate, ParamSpec
12
+
13
+ from datachain.lib.dc import DataChain
14
+
15
+ P = ParamSpec("P")
16
+
17
+
18
+ T = TypeVar("T", bound="DataChain")
19
+
20
+
21
def delta_disabled(
    method: "Callable[Concatenate[T, P], T]",
) -> "Callable[Concatenate[T, P], T]":
    """
    Mark a DataChain method (e.g `.agg()` or `.union()`) as unsupported for
    delta updates.

    The returned wrapper checks the `delta` attribute of the chain it is called
    on: when it is truthy, `NotImplementedError` naming the wrapped method is
    raised; otherwise the call is forwarded to the wrapped method unchanged.
    """

    @wraps(method)
    def _guarded(self: "T", *args: "P.args", **kwargs: "P.kwargs") -> "T":
        # Regular (non-delta) chains go straight through to the real method.
        if not self.delta:
            return method(self, *args, **kwargs)
        raise NotImplementedError(
            f"Delta update cannot be used with {method.__name__}"
        )

    return _guarded
39
+
40
+
41
+ def _append_steps(dc: "DataChain", other: "DataChain"):
42
+ """Returns cloned chain with appended steps from other chain.
43
+ Steps are all those modification methods applied like filters, mappers etc.
44
+ """
45
+ dc = dc.clone()
46
+ dc._query.steps += other._query.steps.copy()
47
+ dc.signals_schema = other.signals_schema
48
+ return dc
49
+
50
+
51
def delta_update(
    dc: "DataChain",
    name: str,
    on: Union[str, Sequence[str]],
    right_on: Optional[Union[str, Sequence[str]]] = None,
    compare: Optional[Union[str, Sequence[str]]] = None,
) -> "tuple[Optional[DataChain], Optional[list[DatasetDependency]], bool]":
    """
    Creates new chain that consists of the last version of current delta dataset
    plus diff from the source with all needed modifications.
    This way we don't need to re-calculate the whole chain from the source again
    (apply all the DataChain methods like filters, mappers, generators etc.)
    but just the diff part, which is very important for performance.

    Note that currently delta update works only if there is only one direct
    dependency.

    Parameters:
        dc : chain being saved; its steps are re-applied to the source diff.
        name : name of the resulting (delta) dataset.
        on : field(s) used to match rows between the two source versions.
        right_on : field(s) in the resulting dataset that correspond to `on`;
            falls back to `on` when not set.
        compare : field(s) used to decide whether a matched row was modified.

    Returns:
        A `(chain, dependencies, has_changes)` tuple:

        - `(None, None, True)` when the delta cannot be computed (the dataset
          does not exist yet, it has no direct dependency recorded, or its
          source dependency was removed), so the caller should create the
          dataset the regular way.
        - `(None, None, False)` when the source diff is empty, i.e. there is
          nothing new to save.
        - `(delta_chain, dependencies, True)` otherwise, where `dependencies`
          has its first entry pointed at the latest source version.
    """
    catalog = dc.session.catalog
    dc._query.apply_listing_pre_step()

    try:
        latest_version = catalog.get_dataset(name).latest_version
    except DatasetNotFoundError:
        # first creation of delta update dataset
        return None, None, True

    dependencies = catalog.get_dataset_dependencies(
        name, latest_version, indirect=False
    )

    # Guard against an empty dependency list before indexing into it: with no
    # direct dependency there is no source to diff against. The same fallback
    # applies when the starting dataset (e.g listing) was removed, in which case
    # we back off to normal dataset creation, as if it was created first time.
    if not dependencies or not dependencies[0]:
        return None, None, True

    dep = dependencies[0]
    source_ds_name = dep.name
    source_ds_version = dep.version
    source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
    # Build a new list without removed (None) dependencies; the first entry is
    # `dep`, which from now on points to the latest version of the source.
    dependencies = [d for d in dependencies if d is not None]
    dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]

    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)

    diff = source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
    # We append all the steps from the original chain to diff, e.g filters, mappers.
    diff = _append_steps(diff, dc)

    # to avoid re-calculating diff multiple times
    diff = diff.persist()

    if diff.empty:
        return None, None, False

    # merging diff and the latest version of dataset
    delta_chain = (
        datachain.read_dataset(name, latest_version)
        .compare(
            diff,
            on=right_on or on,
            added=True,
            modified=False,
            deleted=False,
        )
        .union(diff)
    )

    return delta_chain, dependencies, True  # type: ignore[return-value]
@@ -30,7 +30,7 @@ class CompareStatus(str, Enum):
30
30
  SAME = "S"
31
31
 
32
32
 
33
- def _compare( # noqa: C901
33
+ def _compare( # noqa: C901, PLR0912
34
34
  left: "DataChain",
35
35
  right: "DataChain",
36
36
  on: Union[str, Sequence[str]],
@@ -77,14 +77,16 @@ def _compare( # noqa: C901
77
77
  cols_select = list(left.signals_schema.clone_without_sys_signals().values.keys())
78
78
 
79
79
  # getting correct on and right_on column names
80
+ on_ = on
80
81
  on = left.signals_schema.resolve(*on).db_signals() # type: ignore[assignment]
81
- right_on = right.signals_schema.resolve(*(right_on or on)).db_signals() # type: ignore[assignment]
82
+ right_on = right.signals_schema.resolve(*(right_on or on_)).db_signals() # type: ignore[assignment]
82
83
 
83
84
  # getting correct compare and right_compare column names if they are defined
84
85
  if compare:
86
+ compare_ = compare
85
87
  compare = left.signals_schema.resolve(*compare).db_signals() # type: ignore[assignment]
86
88
  right_compare = right.signals_schema.resolve(
87
- *(right_compare or compare)
89
+ *(right_compare or compare_)
88
90
  ).db_signals() # type: ignore[assignment]
89
91
  elif not compare and len(cols) != len(right_cols):
90
92
  # here we will mark all rows that are not added or deleted as modified since
@@ -155,7 +157,11 @@ def _compare( # noqa: C901
155
157
  if status_col:
156
158
  cols_select.append(diff_col)
157
159
 
158
- dc_diff = dc_diff.select(*cols_select)
160
+ if not dc_diff._sys:
161
+ # TODO workaround when sys signal is not available in diff
162
+ dc_diff = dc_diff.settings(sys=True).select(*cols_select).settings(sys=False)
163
+ else:
164
+ dc_diff = dc_diff.select(*cols_select)
159
165
 
160
166
  # final schema is schema from the left chain with status column added if needed
161
167
  dc_diff.signals_schema = (
@@ -25,6 +25,7 @@ from tqdm import tqdm
25
25
 
26
26
  from datachain import semver
27
27
  from datachain.dataset import DatasetRecord
28
+ from datachain.delta import delta_disabled, delta_update
28
29
  from datachain.func import literal
29
30
  from datachain.func.base import Function
30
31
  from datachain.func.func import Func
@@ -72,6 +73,9 @@ if TYPE_CHECKING:
72
73
  P = ParamSpec("P")
73
74
 
74
75
 
76
+ T = TypeVar("T", bound="DataChain")
77
+
78
+
75
79
  class DataChain:
76
80
  """DataChain - a data structure for batch data processing and evaluation.
77
81
 
@@ -164,6 +168,7 @@ class DataChain:
164
168
  self.signals_schema = signal_schema
165
169
  self._setup: dict = setup or {}
166
170
  self._sys = _sys
171
+ self._delta = False
167
172
 
168
173
  def __repr__(self) -> str:
169
174
  """Return a string representation of the chain."""
@@ -177,6 +182,32 @@ class DataChain:
177
182
  self.print_schema(file=file)
178
183
  return file.getvalue()
179
184
 
185
+ def _as_delta(
186
+ self,
187
+ on: Optional[Union[str, Sequence[str]]] = None,
188
+ right_on: Optional[Union[str, Sequence[str]]] = None,
189
+ compare: Optional[Union[str, Sequence[str]]] = None,
190
+ ) -> "Self":
191
+ """Marks this chain as delta, which means special delta process will be
192
+ called on saving dataset for optimization"""
193
+ if on is None:
194
+ raise ValueError("'delta on' fields must be defined")
195
+ self._delta = True
196
+ self._delta_on = on
197
+ self._delta_result_on = right_on
198
+ self._delta_compare = compare
199
+ return self
200
+
201
@property
def empty(self) -> bool:
    """Tell whether the chain has no rows, i.e. `count()` is zero."""
    row_count = self.count()
    return not row_count
205
+
206
@property
def delta(self) -> bool:
    """Whether this chain is run in "delta" update mode."""
    is_delta = self._delta
    return is_delta
210
+
180
211
  @property
181
212
  def schema(self) -> dict[str, DataType]:
182
213
  """Get schema of the chain."""
@@ -254,9 +285,17 @@ class DataChain:
254
285
  signal_schema = copy.deepcopy(self.signals_schema)
255
286
  if _sys is None:
256
287
  _sys = self._sys
257
- return type(self)(
288
+ chain = type(self)(
258
289
  query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys
259
290
  )
291
+ if self.delta:
292
+ chain = chain._as_delta(
293
+ on=self._delta_on,
294
+ right_on=self._delta_result_on,
295
+ compare=self._delta_compare,
296
+ )
297
+
298
+ return chain
260
299
 
261
300
  def settings(
262
301
  self,
@@ -461,8 +500,9 @@ class DataChain:
461
500
  version: Optional[str] = None,
462
501
  description: Optional[str] = None,
463
502
  attrs: Optional[list[str]] = None,
503
+ update_version: Optional[str] = "patch",
464
504
  **kwargs,
465
- ) -> "Self":
505
+ ) -> "DataChain":
466
506
  """Save to a Dataset. It returns the chain itself.
467
507
 
468
508
  Parameters:
@@ -472,11 +512,52 @@ class DataChain:
472
512
  description : description of a dataset.
473
513
  attrs : attributes of a dataset. They can be without value, e.g "NLP",
474
514
  or with a value, e.g "location=US".
515
+ update_version: which part of the dataset version to automatically increase.
516
+ Available values: `major`, `minor` or `patch`. Default is `patch`.
475
517
  """
476
518
  if version is not None:
477
519
  semver.validate(version)
478
520
 
521
+ if update_version is not None and update_version not in [
522
+ "patch",
523
+ "major",
524
+ "minor",
525
+ ]:
526
+ raise ValueError(
527
+ "update_version can have one of the following values: major, minor or"
528
+ " patch"
529
+ )
530
+
479
531
  schema = self.signals_schema.clone_without_sys_signals().serialize()
532
+ if self.delta and name:
533
+ delta_ds, dependencies, has_changes = delta_update(
534
+ self,
535
+ name,
536
+ on=self._delta_on,
537
+ right_on=self._delta_result_on,
538
+ compare=self._delta_compare,
539
+ )
540
+
541
+ if delta_ds:
542
+ return self._evolve(
543
+ query=delta_ds._query.save(
544
+ name=name,
545
+ version=version,
546
+ feature_schema=schema,
547
+ dependencies=dependencies,
548
+ **kwargs,
549
+ )
550
+ )
551
+
552
+ if not has_changes:
553
+ # sources have not been changed so new version of resulting dataset
554
+ # would be the same as previous one. To avoid duplicating exact
555
+ # datasets, we won't create new version of it and we will return
556
+ # current latest version instead.
557
+ from .datasets import read_dataset
558
+
559
+ return read_dataset(name, **kwargs)
560
+
480
561
  return self._evolve(
481
562
  query=self._query.save(
482
563
  name=name,
@@ -484,6 +565,7 @@ class DataChain:
484
565
  description=description,
485
566
  attrs=attrs,
486
567
  feature_schema=schema,
568
+ update_version=update_version,
487
569
  **kwargs,
488
570
  )
489
571
  )
@@ -601,6 +683,7 @@ class DataChain:
601
683
  signal_schema=udf_obj.output,
602
684
  )
603
685
 
686
+ @delta_disabled
604
687
  def agg(
605
688
  self,
606
689
  func: Optional[Callable] = None,
@@ -754,6 +837,7 @@ class DataChain:
754
837
 
755
838
  return self._evolve(query=self._query.order_by(*args))
756
839
 
840
+ @delta_disabled
757
841
  def distinct(self, arg: str, *args: str) -> "Self": # type: ignore[override]
758
842
  """Removes duplicate rows based on uniqueness of some input column(s)
759
843
  i.e if rows are found with the same value of input column(s), only one
@@ -788,6 +872,7 @@ class DataChain:
788
872
  query=self._query.select(*columns), signal_schema=new_schema
789
873
  )
790
874
 
875
+ @delta_disabled # type: ignore[arg-type]
791
876
  def group_by(
792
877
  self,
793
878
  *,
@@ -1146,6 +1231,7 @@ class DataChain:
1146
1231
  schema = self.signals_schema.clone_without_file_signals()
1147
1232
  return self.select(*schema.values.keys())
1148
1233
 
1234
+ @delta_disabled
1149
1235
  def merge(
1150
1236
  self,
1151
1237
  right_ds: "DataChain",
@@ -1254,6 +1340,7 @@ class DataChain:
1254
1340
 
1255
1341
  return ds
1256
1342
 
1343
+ @delta_disabled
1257
1344
  def union(self, other: "Self") -> "Self":
1258
1345
  """Return the set union of the two datasets.
1259
1346
 
@@ -1,3 +1,4 @@
1
+ from collections.abc import Sequence
1
2
  from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
2
3
 
3
4
  from datachain.error import DatasetVersionNotFoundError
@@ -27,6 +28,10 @@ def read_dataset(
27
28
  session: Optional[Session] = None,
28
29
  settings: Optional[dict] = None,
29
30
  fallback_to_studio: bool = True,
31
+ delta: Optional[bool] = False,
32
+ delta_on: Optional[Union[str, Sequence[str]]] = None,
33
+ delta_result_on: Optional[Union[str, Sequence[str]]] = None,
34
+ delta_compare: Optional[Union[str, Sequence[str]]] = None,
30
35
  ) -> "DataChain":
31
36
  """Get data from a saved Dataset. It returns the chain itself.
32
37
  If dataset or version is not found locally, it will try to pull it from Studio.
@@ -38,6 +43,36 @@ def read_dataset(
38
43
  settings : Settings to use for the chain.
39
44
  fallback_to_studio : Try to pull dataset from Studio if not found locally.
40
45
  Default is True.
46
+ delta: If set to True, we optimize the creation of new dataset versions by
47
+ calculating the diff between the latest version of this storage and the
48
+ version used to create the most recent version of the resulting chain
49
+ dataset (the one specified in `.save()`). We then run the "diff" chain
50
+ using only the diff data, rather than the entire storage data, and merge
51
+ that diff chain with the latest version of the resulting dataset to create
52
+ a new version. This approach avoids applying modifications to all records
53
+ from storage every time, which can be an expensive operation.
54
+ The diff is calculated using the `DataChain.compare()` method, which
55
+ compares the `delta_on` fields to find matches and checks the compare
56
+ fields to determine if a record has changed. Note that this process only
57
+ considers added and modified records in storage; deleted records are not
58
+ removed from the new dataset version.
59
+ This calculation is based on the difference between the current version
60
+ of the source and the version used to create the dataset.
61
+ delta_on: A list of fields that uniquely identify rows in the source.
62
+ If two rows have the same values, they are considered the same (e.g., they
63
+ could be different versions of the same row in a versioned source).
64
+ This is used in the delta update to calculate the diff.
65
+ delta_result_on: A list of fields in the resulting dataset that correspond
66
+ to the `delta_on` fields from the source.
67
+ This is needed to identify rows that have changed in the source but are
68
+ already present in the current version of the resulting dataset, in order
69
+ to avoid including outdated versions of those rows in the new dataset.
70
+ We retain only the latest versions of rows to prevent duplication.
71
+ There is no need to define this if the `delta_on` fields are present in
72
+ the final dataset and have not been renamed.
73
+ delta_compare: A list of fields used to check if the same row has been modified
74
+ in the new version of the source.
75
+ If not defined, all fields except those defined in delta_on will be used.
41
76
 
42
77
  Example:
43
78
  ```py
@@ -113,7 +148,12 @@ def read_dataset(
113
148
  signals_schema |= SignalSchema.deserialize(query.feature_schema)
114
149
  else:
115
150
  signals_schema |= SignalSchema.from_column_types(query.column_types or {})
116
- return DataChain(query, _settings, signals_schema)
151
+ chain = DataChain(query, _settings, signals_schema)
152
+ if delta:
153
+ chain = chain._as_delta(
154
+ on=delta_on, right_on=delta_result_on, compare=delta_compare
155
+ )
156
+ return chain
117
157
 
118
158
 
119
159
  def datasets(
@@ -1,11 +1,12 @@
1
1
  import os.path
2
+ from collections.abc import Sequence
3
+ from functools import reduce
2
4
  from typing import (
3
5
  TYPE_CHECKING,
4
6
  Optional,
5
7
  Union,
6
8
  )
7
9
 
8
- from datachain.error import DatasetNotFoundError
9
10
  from datachain.lib.file import (
10
11
  FileType,
11
12
  get_file_type,
@@ -33,6 +34,10 @@ def read_storage(
33
34
  column: str = "file",
34
35
  update: bool = False,
35
36
  anon: bool = False,
37
+ delta: Optional[bool] = False,
38
+ delta_on: Optional[Union[str, Sequence[str]]] = None,
39
+ delta_result_on: Optional[Union[str, Sequence[str]]] = None,
40
+ delta_compare: Optional[Union[str, Sequence[str]]] = None,
36
41
  client_config: Optional[dict] = None,
37
42
  ) -> "DataChain":
38
43
  """Get data from storage(s) as a list of file with all file attributes.
@@ -48,6 +53,36 @@ def read_storage(
48
53
  update : force storage reindexing. Default is False.
49
54
  anon : If True, we will treat cloud bucket as public one
50
55
  client_config : Optional client configuration for the storage client.
56
+ delta: If set to True, we optimize the creation of new dataset versions by
57
+ calculating the diff between the latest version of this storage and the
58
+ version used to create the most recent version of the resulting chain
59
+ dataset (the one specified in `.save()`). We then run the "diff" chain
60
+ using only the diff data, rather than the entire storage data, and merge
61
+ that diff chain with the latest version of the resulting dataset to create
62
+ a new version. This approach avoids applying modifications to all records
63
+ from storage every time, which can be an expensive operation.
64
+ The diff is calculated using the `DataChain.compare()` method, which
65
+ compares the `delta_on` fields to find matches and checks the compare
66
+ fields to determine if a record has changed. Note that this process only
67
+ considers added and modified records in storage; deleted records are not
68
+ removed from the new dataset version.
69
+ This calculation is based on the difference between the current version
70
+ of the source and the version used to create the dataset.
71
+ delta_on: A list of fields that uniquely identify rows in the source.
72
+ If two rows have the same values, they are considered the same (e.g., they
73
+ could be different versions of the same row in a versioned source).
74
+ This is used in the delta update to calculate the diff.
75
+ delta_result_on: A list of fields in the resulting dataset that correspond
76
+ to the `delta_on` fields from the source.
77
+ This is needed to identify rows that have changed in the source but are
78
+ already present in the current version of the resulting dataset, in order
79
+ to avoid including outdated versions of those rows in the new dataset.
80
+ We retain only the latest versions of rows to prevent duplication.
81
+ There is no need to define this if the `delta_on` fields are present in
82
+ the final dataset and have not been renamed.
83
+ delta_compare: A list of fields used to check if the same row has been modified
84
+ in the new version of the source.
85
+ If not defined, all fields except those defined in `delta_on` will be used.
51
86
 
52
87
  Returns:
53
88
  DataChain: A DataChain object containing the file information.
@@ -107,7 +142,7 @@ def read_storage(
107
142
  if not uris:
108
143
  raise ValueError("No URIs provided")
109
144
 
110
- storage_chain = None
145
+ chains = []
111
146
  listed_ds_name = set()
112
147
  file_values = []
113
148
 
@@ -132,11 +167,6 @@ def read_storage(
132
167
 
133
168
  def lst_fn(ds_name, lst_uri):
134
169
  # disable prefetch for listing, as it pre-downloads all files
135
- try:
136
- version = catalog.get_dataset(ds_name).next_version_major
137
- except DatasetNotFoundError:
138
- version = None
139
-
140
170
  (
141
171
  read_records(
142
172
  DataChain.DEFAULT_FILE_RECORD,
@@ -150,18 +180,18 @@ def read_storage(
150
180
  output={f"{column}": file_type},
151
181
  )
152
182
  # for internal listing datasets, we always bump major version
153
- .save(ds_name, listing=True, version=version)
183
+ .save(ds_name, listing=True, update_version="major")
154
184
  )
155
185
 
156
186
  dc._query.set_listing_fn(
157
187
  lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
158
188
  )
159
189
 
160
- chain = ls(dc, list_path, recursive=recursive, column=column)
161
-
162
- storage_chain = storage_chain.union(chain) if storage_chain else chain
190
+ chains.append(ls(dc, list_path, recursive=recursive, column=column))
163
191
  listed_ds_name.add(list_ds_name)
164
192
 
193
+ storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)
194
+
165
195
  if file_values:
166
196
  file_chain = read_values(
167
197
  session=session,
@@ -176,4 +206,8 @@ def read_storage(
176
206
 
177
207
  assert storage_chain is not None
178
208
 
209
+ if delta:
210
+ storage_chain = storage_chain._as_delta(
211
+ on=delta_on, right_on=delta_result_on, compare=delta_compare
212
+ )
179
213
  return storage_chain