datachain 0.23.0__tar.gz → 0.24.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (399)
  1. {datachain-0.23.0 → datachain-0.24.1}/PKG-INFO +1 -1
  2. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/catalog/catalog.py +25 -13
  3. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/dataset.py +34 -5
  4. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/delta.py +82 -25
  5. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/datachain.py +2 -0
  6. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/datasets.py +52 -35
  7. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/listings.py +2 -6
  8. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/projects.py +1 -1
  9. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/query/dataset.py +2 -8
  10. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/remote/studio.py +4 -3
  11. {datachain-0.23.0 → datachain-0.24.1}/src/datachain.egg-info/PKG-INFO +1 -1
  12. {datachain-0.23.0 → datachain-0.24.1}/src/datachain.egg-info/SOURCES.txt +2 -0
  13. {datachain-0.23.0 → datachain-0.24.1}/tests/conftest.py +169 -0
  14. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_dataset_query.py +2 -4
  15. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_delta.py +23 -8
  16. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_ls.py +4 -2
  17. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_pull.py +61 -199
  18. datachain-0.24.1/tests/func/test_read_dataset_remote.py +555 -0
  19. datachain-0.24.1/tests/func/test_read_dataset_version_specifiers.py +88 -0
  20. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_retry.py +166 -52
  21. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_datachain.py +10 -1
  22. {datachain-0.23.0 → datachain-0.24.1}/.cruft.json +0 -0
  23. {datachain-0.23.0 → datachain-0.24.1}/.gitattributes +0 -0
  24. {datachain-0.23.0 → datachain-0.24.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  25. {datachain-0.23.0 → datachain-0.24.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  26. {datachain-0.23.0 → datachain-0.24.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  27. {datachain-0.23.0 → datachain-0.24.1}/.github/codecov.yaml +0 -0
  28. {datachain-0.23.0 → datachain-0.24.1}/.github/dependabot.yml +0 -0
  29. {datachain-0.23.0 → datachain-0.24.1}/.github/workflows/benchmarks.yml +0 -0
  30. {datachain-0.23.0 → datachain-0.24.1}/.github/workflows/release.yml +0 -0
  31. {datachain-0.23.0 → datachain-0.24.1}/.github/workflows/tests-studio.yml +0 -0
  32. {datachain-0.23.0 → datachain-0.24.1}/.github/workflows/tests.yml +0 -0
  33. {datachain-0.23.0 → datachain-0.24.1}/.github/workflows/update-template.yaml +0 -0
  34. {datachain-0.23.0 → datachain-0.24.1}/.gitignore +0 -0
  35. {datachain-0.23.0 → datachain-0.24.1}/.pre-commit-config.yaml +0 -0
  36. {datachain-0.23.0 → datachain-0.24.1}/CODE_OF_CONDUCT.rst +0 -0
  37. {datachain-0.23.0 → datachain-0.24.1}/LICENSE +0 -0
  38. {datachain-0.23.0 → datachain-0.24.1}/README.rst +0 -0
  39. {datachain-0.23.0 → datachain-0.24.1}/docs/assets/captioned_cartoons.png +0 -0
  40. {datachain-0.23.0 → datachain-0.24.1}/docs/assets/datachain-white.svg +0 -0
  41. {datachain-0.23.0 → datachain-0.24.1}/docs/assets/datachain.svg +0 -0
  42. {datachain-0.23.0 → datachain-0.24.1}/docs/commands/auth/login.md +0 -0
  43. {datachain-0.23.0 → datachain-0.24.1}/docs/commands/auth/logout.md +0 -0
  44. {datachain-0.23.0 → datachain-0.24.1}/docs/commands/auth/team.md +0 -0
  45. {datachain-0.23.0 → datachain-0.24.1}/docs/commands/auth/token.md +0 -0
  46. {datachain-0.23.0 → datachain-0.24.1}/docs/commands/index.md +0 -0
  47. {datachain-0.23.0 → datachain-0.24.1}/docs/commands/job/cancel.md +0 -0
  48. {datachain-0.23.0 → datachain-0.24.1}/docs/commands/job/clusters.md +0 -0
  49. {datachain-0.23.0 → datachain-0.24.1}/docs/commands/job/logs.md +0 -0
  50. {datachain-0.23.0 → datachain-0.24.1}/docs/commands/job/ls.md +0 -0
  51. {datachain-0.23.0 → datachain-0.24.1}/docs/commands/job/run.md +0 -0
  52. {datachain-0.23.0 → datachain-0.24.1}/docs/contributing.md +0 -0
  53. {datachain-0.23.0 → datachain-0.24.1}/docs/css/github-permalink-style.css +0 -0
  54. {datachain-0.23.0 → datachain-0.24.1}/docs/examples.md +0 -0
  55. {datachain-0.23.0 → datachain-0.24.1}/docs/guide/db_migrations.md +0 -0
  56. {datachain-0.23.0 → datachain-0.24.1}/docs/guide/delta.md +0 -0
  57. {datachain-0.23.0 → datachain-0.24.1}/docs/guide/env.md +0 -0
  58. {datachain-0.23.0 → datachain-0.24.1}/docs/guide/index.md +0 -0
  59. {datachain-0.23.0 → datachain-0.24.1}/docs/guide/namespaces.md +0 -0
  60. {datachain-0.23.0 → datachain-0.24.1}/docs/guide/processing.md +0 -0
  61. {datachain-0.23.0 → datachain-0.24.1}/docs/guide/remotes.md +0 -0
  62. {datachain-0.23.0 → datachain-0.24.1}/docs/guide/retry.md +0 -0
  63. {datachain-0.23.0 → datachain-0.24.1}/docs/index.md +0 -0
  64. {datachain-0.23.0 → datachain-0.24.1}/docs/overrides/main.html +0 -0
  65. {datachain-0.23.0 → datachain-0.24.1}/docs/quick-start.md +0 -0
  66. {datachain-0.23.0 → datachain-0.24.1}/docs/references/data-types/arrowrow.md +0 -0
  67. {datachain-0.23.0 → datachain-0.24.1}/docs/references/data-types/bbox.md +0 -0
  68. {datachain-0.23.0 → datachain-0.24.1}/docs/references/data-types/file.md +0 -0
  69. {datachain-0.23.0 → datachain-0.24.1}/docs/references/data-types/imagefile.md +0 -0
  70. {datachain-0.23.0 → datachain-0.24.1}/docs/references/data-types/index.md +0 -0
  71. {datachain-0.23.0 → datachain-0.24.1}/docs/references/data-types/pose.md +0 -0
  72. {datachain-0.23.0 → datachain-0.24.1}/docs/references/data-types/segment.md +0 -0
  73. {datachain-0.23.0 → datachain-0.24.1}/docs/references/data-types/tarvfile.md +0 -0
  74. {datachain-0.23.0 → datachain-0.24.1}/docs/references/data-types/textfile.md +0 -0
  75. {datachain-0.23.0 → datachain-0.24.1}/docs/references/data-types/videofile.md +0 -0
  76. {datachain-0.23.0 → datachain-0.24.1}/docs/references/datachain.md +0 -0
  77. {datachain-0.23.0 → datachain-0.24.1}/docs/references/func.md +0 -0
  78. {datachain-0.23.0 → datachain-0.24.1}/docs/references/index.md +0 -0
  79. {datachain-0.23.0 → datachain-0.24.1}/docs/references/toolkit.md +0 -0
  80. {datachain-0.23.0 → datachain-0.24.1}/docs/references/torch.md +0 -0
  81. {datachain-0.23.0 → datachain-0.24.1}/docs/references/udf.md +0 -0
  82. {datachain-0.23.0 → datachain-0.24.1}/docs/tutorials.md +0 -0
  83. {datachain-0.23.0 → datachain-0.24.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  84. {datachain-0.23.0 → datachain-0.24.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  85. {datachain-0.23.0 → datachain-0.24.1}/examples/computer_vision/openimage-detect.py +0 -0
  86. {datachain-0.23.0 → datachain-0.24.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
  87. {datachain-0.23.0 → datachain-0.24.1}/examples/computer_vision/ultralytics-pose.py +0 -0
  88. {datachain-0.23.0 → datachain-0.24.1}/examples/computer_vision/ultralytics-segment.py +0 -0
  89. {datachain-0.23.0 → datachain-0.24.1}/examples/get_started/common_sql_functions.py +0 -0
  90. {datachain-0.23.0 → datachain-0.24.1}/examples/get_started/json-csv-reader.py +0 -0
  91. {datachain-0.23.0 → datachain-0.24.1}/examples/get_started/torch-loader.py +0 -0
  92. {datachain-0.23.0 → datachain-0.24.1}/examples/get_started/udfs/parallel.py +0 -0
  93. {datachain-0.23.0 → datachain-0.24.1}/examples/get_started/udfs/simple.py +0 -0
  94. {datachain-0.23.0 → datachain-0.24.1}/examples/get_started/udfs/stateful.py +0 -0
  95. {datachain-0.23.0 → datachain-0.24.1}/examples/incremental_processing/delta.py +0 -0
  96. {datachain-0.23.0 → datachain-0.24.1}/examples/incremental_processing/retry.py +0 -0
  97. {datachain-0.23.0 → datachain-0.24.1}/examples/incremental_processing/utils.py +0 -0
  98. {datachain-0.23.0 → datachain-0.24.1}/examples/llm_and_nlp/claude-query.py +0 -0
  99. {datachain-0.23.0 → datachain-0.24.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  100. {datachain-0.23.0 → datachain-0.24.1}/examples/multimodal/clip_inference.py +0 -0
  101. {datachain-0.23.0 → datachain-0.24.1}/examples/multimodal/hf_pipeline.py +0 -0
  102. {datachain-0.23.0 → datachain-0.24.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  103. {datachain-0.23.0 → datachain-0.24.1}/examples/multimodal/wds.py +0 -0
  104. {datachain-0.23.0 → datachain-0.24.1}/examples/multimodal/wds_filtered.py +0 -0
  105. {datachain-0.23.0 → datachain-0.24.1}/mkdocs.yml +0 -0
  106. {datachain-0.23.0 → datachain-0.24.1}/noxfile.py +0 -0
  107. {datachain-0.23.0 → datachain-0.24.1}/pyproject.toml +0 -0
  108. {datachain-0.23.0 → datachain-0.24.1}/setup.cfg +0 -0
  109. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/__init__.py +0 -0
  110. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/__main__.py +0 -0
  111. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/asyn.py +0 -0
  112. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cache.py +0 -0
  113. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/catalog/__init__.py +0 -0
  114. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/catalog/datasource.py +0 -0
  115. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/catalog/loader.py +0 -0
  116. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/__init__.py +0 -0
  117. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/commands/__init__.py +0 -0
  118. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/commands/datasets.py +0 -0
  119. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/commands/du.py +0 -0
  120. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/commands/index.py +0 -0
  121. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/commands/ls.py +0 -0
  122. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/commands/misc.py +0 -0
  123. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/commands/query.py +0 -0
  124. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/commands/show.py +0 -0
  125. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/parser/__init__.py +0 -0
  126. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/parser/job.py +0 -0
  127. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/parser/studio.py +0 -0
  128. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/parser/utils.py +0 -0
  129. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/cli/utils.py +0 -0
  130. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/client/__init__.py +0 -0
  131. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/client/azure.py +0 -0
  132. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/client/fileslice.py +0 -0
  133. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/client/fsspec.py +0 -0
  134. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/client/gcs.py +0 -0
  135. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/client/hf.py +0 -0
  136. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/client/local.py +0 -0
  137. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/client/s3.py +0 -0
  138. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/config.py +0 -0
  139. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/data_storage/__init__.py +0 -0
  140. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/data_storage/db_engine.py +0 -0
  141. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/data_storage/job.py +0 -0
  142. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/data_storage/metastore.py +0 -0
  143. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/data_storage/schema.py +0 -0
  144. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/data_storage/serializer.py +0 -0
  145. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/data_storage/sqlite.py +0 -0
  146. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/data_storage/warehouse.py +0 -0
  147. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/diff/__init__.py +0 -0
  148. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/error.py +0 -0
  149. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/fs/__init__.py +0 -0
  150. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/fs/reference.py +0 -0
  151. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/fs/utils.py +0 -0
  152. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/func/__init__.py +0 -0
  153. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/func/aggregate.py +0 -0
  154. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/func/array.py +0 -0
  155. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/func/base.py +0 -0
  156. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/func/conditional.py +0 -0
  157. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/func/func.py +0 -0
  158. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/func/numeric.py +0 -0
  159. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/func/path.py +0 -0
  160. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/func/random.py +0 -0
  161. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/func/string.py +0 -0
  162. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/func/window.py +0 -0
  163. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/job.py +0 -0
  164. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/__init__.py +0 -0
  165. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/arrow.py +0 -0
  166. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/clip.py +0 -0
  167. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/convert/__init__.py +0 -0
  168. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/convert/flatten.py +0 -0
  169. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  170. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
  171. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/convert/unflatten.py +0 -0
  172. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  173. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/data_model.py +0 -0
  174. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dataset_info.py +0 -0
  175. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/__init__.py +0 -0
  176. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/csv.py +0 -0
  177. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/database.py +0 -0
  178. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/hf.py +0 -0
  179. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/json.py +0 -0
  180. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/pandas.py +0 -0
  181. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/parquet.py +0 -0
  182. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/records.py +0 -0
  183. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/storage.py +0 -0
  184. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/utils.py +0 -0
  185. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/dc/values.py +0 -0
  186. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/file.py +0 -0
  187. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/hf.py +0 -0
  188. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/image.py +0 -0
  189. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/listing.py +0 -0
  190. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/listing_info.py +0 -0
  191. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/meta_formats.py +0 -0
  192. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/model_store.py +0 -0
  193. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/namespaces.py +0 -0
  194. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/pytorch.py +0 -0
  195. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/settings.py +0 -0
  196. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/signal_schema.py +0 -0
  197. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/tar.py +0 -0
  198. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/text.py +0 -0
  199. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/udf.py +0 -0
  200. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/udf_signature.py +0 -0
  201. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/utils.py +0 -0
  202. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/video.py +0 -0
  203. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/webdataset.py +0 -0
  204. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/lib/webdataset_laion.py +0 -0
  205. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/listing.py +0 -0
  206. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/model/__init__.py +0 -0
  207. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/model/bbox.py +0 -0
  208. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/model/pose.py +0 -0
  209. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/model/segment.py +0 -0
  210. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/model/ultralytics/__init__.py +0 -0
  211. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/model/ultralytics/bbox.py +0 -0
  212. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/model/ultralytics/pose.py +0 -0
  213. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/model/ultralytics/segment.py +0 -0
  214. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/model/utils.py +0 -0
  215. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/namespace.py +0 -0
  216. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/node.py +0 -0
  217. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/nodes_fetcher.py +0 -0
  218. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/nodes_thread_pool.py +0 -0
  219. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/progress.py +0 -0
  220. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/project.py +0 -0
  221. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/py.typed +0 -0
  222. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/query/__init__.py +0 -0
  223. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/query/batch.py +0 -0
  224. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/query/dispatch.py +0 -0
  225. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/query/metrics.py +0 -0
  226. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/query/params.py +0 -0
  227. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/query/queue.py +0 -0
  228. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/query/schema.py +0 -0
  229. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/query/session.py +0 -0
  230. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/query/udf.py +0 -0
  231. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/query/utils.py +0 -0
  232. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/remote/__init__.py +0 -0
  233. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/script_meta.py +0 -0
  234. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/semver.py +0 -0
  235. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/__init__.py +0 -0
  236. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/default/__init__.py +0 -0
  237. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/default/base.py +0 -0
  238. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/functions/__init__.py +0 -0
  239. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/functions/aggregate.py +0 -0
  240. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/functions/array.py +0 -0
  241. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/functions/conditional.py +0 -0
  242. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/functions/numeric.py +0 -0
  243. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/functions/path.py +0 -0
  244. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/functions/random.py +0 -0
  245. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/functions/string.py +0 -0
  246. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/selectable.py +0 -0
  247. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  248. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/sqlite/base.py +0 -0
  249. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/sqlite/types.py +0 -0
  250. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/sqlite/vector.py +0 -0
  251. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/types.py +0 -0
  252. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/sql/utils.py +0 -0
  253. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/studio.py +0 -0
  254. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/telemetry.py +0 -0
  255. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/toolkit/__init__.py +0 -0
  256. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/toolkit/split.py +0 -0
  257. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/torch/__init__.py +0 -0
  258. {datachain-0.23.0 → datachain-0.24.1}/src/datachain/utils.py +0 -0
  259. {datachain-0.23.0 → datachain-0.24.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  260. {datachain-0.23.0 → datachain-0.24.1}/src/datachain.egg-info/entry_points.txt +0 -0
  261. {datachain-0.23.0 → datachain-0.24.1}/src/datachain.egg-info/requires.txt +0 -0
  262. {datachain-0.23.0 → datachain-0.24.1}/src/datachain.egg-info/top_level.txt +0 -0
  263. {datachain-0.23.0 → datachain-0.24.1}/tests/__init__.py +0 -0
  264. {datachain-0.23.0 → datachain-0.24.1}/tests/benchmarks/__init__.py +0 -0
  265. {datachain-0.23.0 → datachain-0.24.1}/tests/benchmarks/conftest.py +0 -0
  266. {datachain-0.23.0 → datachain-0.24.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  267. {datachain-0.23.0 → datachain-0.24.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  268. {datachain-0.23.0 → datachain-0.24.1}/tests/benchmarks/datasets/.gitignore +0 -0
  269. {datachain-0.23.0 → datachain-0.24.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  270. {datachain-0.23.0 → datachain-0.24.1}/tests/benchmarks/test_datachain.py +0 -0
  271. {datachain-0.23.0 → datachain-0.24.1}/tests/benchmarks/test_ls.py +0 -0
  272. {datachain-0.23.0 → datachain-0.24.1}/tests/benchmarks/test_version.py +0 -0
  273. {datachain-0.23.0 → datachain-0.24.1}/tests/data.py +0 -0
  274. {datachain-0.23.0 → datachain-0.24.1}/tests/examples/__init__.py +0 -0
  275. {datachain-0.23.0 → datachain-0.24.1}/tests/examples/test_examples.py +0 -0
  276. {datachain-0.23.0 → datachain-0.24.1}/tests/examples/test_wds_e2e.py +0 -0
  277. {datachain-0.23.0 → datachain-0.24.1}/tests/examples/wds_data.py +0 -0
  278. {datachain-0.23.0 → datachain-0.24.1}/tests/func/__init__.py +0 -0
  279. {datachain-0.23.0 → datachain-0.24.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  280. {datachain-0.23.0 → datachain-0.24.1}/tests/func/data/lena.jpg +0 -0
  281. {datachain-0.23.0 → datachain-0.24.1}/tests/func/fake-service-account-credentials.json +0 -0
  282. {datachain-0.23.0 → datachain-0.24.1}/tests/func/functions/__init__.py +0 -0
  283. {datachain-0.23.0 → datachain-0.24.1}/tests/func/functions/test_aggregate.py +0 -0
  284. {datachain-0.23.0 → datachain-0.24.1}/tests/func/functions/test_array.py +0 -0
  285. {datachain-0.23.0 → datachain-0.24.1}/tests/func/functions/test_conditional.py +0 -0
  286. {datachain-0.23.0 → datachain-0.24.1}/tests/func/functions/test_numeric.py +0 -0
  287. {datachain-0.23.0 → datachain-0.24.1}/tests/func/functions/test_path.py +0 -0
  288. {datachain-0.23.0 → datachain-0.24.1}/tests/func/functions/test_random.py +0 -0
  289. {datachain-0.23.0 → datachain-0.24.1}/tests/func/functions/test_string.py +0 -0
  290. {datachain-0.23.0 → datachain-0.24.1}/tests/func/model/__init__.py +0 -0
  291. {datachain-0.23.0 → datachain-0.24.1}/tests/func/model/data/running-mask0.png +0 -0
  292. {datachain-0.23.0 → datachain-0.24.1}/tests/func/model/data/running-mask1.png +0 -0
  293. {datachain-0.23.0 → datachain-0.24.1}/tests/func/model/data/running.jpg +0 -0
  294. {datachain-0.23.0 → datachain-0.24.1}/tests/func/model/data/ships.jpg +0 -0
  295. {datachain-0.23.0 → datachain-0.24.1}/tests/func/model/test_yolo.py +0 -0
  296. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_batching.py +0 -0
  297. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_catalog.py +0 -0
  298. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_client.py +0 -0
  299. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_cloud_transfer.py +0 -0
  300. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_data_storage.py +0 -0
  301. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_datachain.py +0 -0
  302. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_datachain_merge.py +0 -0
  303. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_datasets.py +0 -0
  304. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_feature_pickling.py +0 -0
  305. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_file.py +0 -0
  306. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_hf.py +0 -0
  307. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_hidden_field.py +0 -0
  308. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_image.py +0 -0
  309. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_listing.py +0 -0
  310. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_meta_formats.py +0 -0
  311. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_metastore.py +0 -0
  312. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_metrics.py +0 -0
  313. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_pytorch.py +0 -0
  314. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_query.py +0 -0
  315. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_read_database.py +0 -0
  316. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_session.py +0 -0
  317. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_toolkit.py +0 -0
  318. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_video.py +0 -0
  319. {datachain-0.23.0 → datachain-0.24.1}/tests/func/test_warehouse.py +0 -0
  320. {datachain-0.23.0 → datachain-0.24.1}/tests/scripts/feature_class.py +0 -0
  321. {datachain-0.23.0 → datachain-0.24.1}/tests/scripts/feature_class_exception.py +0 -0
  322. {datachain-0.23.0 → datachain-0.24.1}/tests/scripts/feature_class_parallel.py +0 -0
  323. {datachain-0.23.0 → datachain-0.24.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  324. {datachain-0.23.0 → datachain-0.24.1}/tests/scripts/name_len_slow.py +0 -0
  325. {datachain-0.23.0 → datachain-0.24.1}/tests/test_atomicity.py +0 -0
  326. {datachain-0.23.0 → datachain-0.24.1}/tests/test_cli_e2e.py +0 -0
  327. {datachain-0.23.0 → datachain-0.24.1}/tests/test_cli_studio.py +0 -0
  328. {datachain-0.23.0 → datachain-0.24.1}/tests/test_import_time.py +0 -0
  329. {datachain-0.23.0 → datachain-0.24.1}/tests/test_query_e2e.py +0 -0
  330. {datachain-0.23.0 → datachain-0.24.1}/tests/test_telemetry.py +0 -0
  331. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/__init__.py +0 -0
  332. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/__init__.py +0 -0
  333. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/conftest.py +0 -0
  334. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_arrow.py +0 -0
  335. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_clip.py +0 -0
  336. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  337. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  338. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_diff.py +0 -0
  339. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_feature.py +0 -0
  340. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_feature_utils.py +0 -0
  341. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_file.py +0 -0
  342. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_hf.py +0 -0
  343. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_image.py +0 -0
  344. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_listing_info.py +0 -0
  345. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_namespace.py +0 -0
  346. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_project.py +0 -0
  347. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_python_to_sql.py +0 -0
  348. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_schema.py +0 -0
  349. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_signal_schema.py +0 -0
  350. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_sql_to_python.py +0 -0
  351. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_text.py +0 -0
  352. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_udf.py +0 -0
  353. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_udf_signature.py +0 -0
  354. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_utils.py +0 -0
  355. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/lib/test_webdataset.py +0 -0
  356. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/model/__init__.py +0 -0
  357. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/model/test_bbox.py +0 -0
  358. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/model/test_pose.py +0 -0
  359. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/model/test_segment.py +0 -0
  360. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/model/test_utils.py +0 -0
  361. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/sql/__init__.py +0 -0
  362. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  363. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/sql/sqlite/test_types.py +0 -0
  364. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  365. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/sql/test_array.py +0 -0
  366. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/sql/test_conditional.py +0 -0
  367. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/sql/test_path.py +0 -0
  368. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/sql/test_random.py +0 -0
  369. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/sql/test_selectable.py +0 -0
  370. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/sql/test_string.py +0 -0
  371. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_asyn.py +0 -0
  372. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_cache.py +0 -0
  373. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_catalog.py +0 -0
  374. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_catalog_loader.py +0 -0
  375. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_cli_parsing.py +0 -0
  376. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_client.py +0 -0
  377. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_client_gcs.py +0 -0
  378. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_client_s3.py +0 -0
  379. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_config.py +0 -0
  380. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_data_storage.py +0 -0
  381. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_database_engine.py +0 -0
  382. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_dataset.py +0 -0
  383. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_dispatch.py +0 -0
  384. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_fileslice.py +0 -0
  385. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_func.py +0 -0
  386. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_listing.py +0 -0
  387. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_metastore.py +0 -0
  388. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_module_exports.py +0 -0
  389. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_pytorch.py +0 -0
  390. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_query.py +0 -0
  391. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_query_metrics.py +0 -0
  392. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_query_params.py +0 -0
  393. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_script_meta.py +0 -0
  394. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_semver.py +0 -0
  395. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_serializer.py +0 -0
  396. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_session.py +0 -0
  397. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_utils.py +0 -0
  398. {datachain-0.23.0 → datachain-0.24.1}/tests/unit/test_warehouse.py +0 -0
  399. {datachain-0.23.0 → datachain-0.24.1}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.23.0
3
+ Version: 0.24.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -49,6 +49,7 @@ from datachain.error import (
49
49
  DatasetInvalidVersionError,
50
50
  DatasetNotFoundError,
51
51
  DatasetVersionNotFoundError,
52
+ NamespaceNotFoundError,
52
53
  ProjectNotFoundError,
53
54
  QueryScriptCancelError,
54
55
  QueryScriptRunError,
@@ -1107,21 +1108,26 @@ class Catalog:
1107
1108
  namespace_name: str,
1108
1109
  project_name: str,
1109
1110
  version: Optional[str] = None,
1111
+ pull_dataset: bool = False,
1112
+ update: bool = False,
1110
1113
  ) -> DatasetRecord:
1111
- try:
1112
- project = self.metastore.get_project(project_name, namespace_name)
1113
- ds = self.get_dataset(name, project)
1114
- if version and not ds.has_version(version):
1115
- raise DatasetVersionNotFoundError(
1116
- f"Dataset {name} does not have version {version}"
1117
- )
1118
- return ds
1114
+ if self.metastore.is_local_dataset(namespace_name) or not update:
1115
+ try:
1116
+ project = self.metastore.get_project(project_name, namespace_name)
1117
+ ds = self.get_dataset(name, project)
1118
+ if not version or ds.has_version(version):
1119
+ return ds
1120
+ except (NamespaceNotFoundError, ProjectNotFoundError, DatasetNotFoundError):
1121
+ pass
1122
+
1123
+ if self.metastore.is_local_dataset(namespace_name):
1124
+ raise DatasetNotFoundError(
1125
+ f"Dataset {name}"
1126
+ + (f" version {version} " if version else " ")
1127
+ + "not found"
1128
+ )
1119
1129
 
1120
- except (
1121
- ProjectNotFoundError,
1122
- DatasetNotFoundError,
1123
- DatasetVersionNotFoundError,
1124
- ):
1130
+ if pull_dataset:
1125
1131
  print("Dataset not found in local catalog, trying to get from studio")
1126
1132
  remote_ds_uri = create_dataset_uri(
1127
1133
  name, namespace_name, project_name, version
@@ -1136,6 +1142,8 @@ class Catalog:
1136
1142
  name, self.metastore.get_project(project_name, namespace_name)
1137
1143
  )
1138
1144
 
1145
+ return self.get_remote_dataset(namespace_name, project_name, name)
1146
+
1139
1147
  def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
1140
1148
  """Returns dataset that contains version with specific uuid"""
1141
1149
  for dataset in self.ls_datasets():
@@ -1152,6 +1160,10 @@ class Catalog:
1152
1160
 
1153
1161
  info_response = studio_client.dataset_info(namespace, project, name)
1154
1162
  if not info_response.ok:
1163
+ if info_response.status == 404:
1164
+ raise DatasetNotFoundError(
1165
+ f"Dataset {namespace}.{project}.{name} not found"
1166
+ )
1155
1167
  raise DataChainError(info_response.message)
1156
1168
 
1157
1169
  dataset_info = info_response.data
@@ -12,6 +12,9 @@ from typing import (
12
12
  )
13
13
  from urllib.parse import urlparse
14
14
 
15
+ from packaging.specifiers import SpecifierSet
16
+ from packaging.version import Version
17
+
15
18
  from datachain import semver
16
19
  from datachain.error import DatasetVersionNotFoundError, InvalidDatasetNameError
17
20
  from datachain.namespace import Namespace
@@ -661,13 +664,39 @@ class DatasetRecord:
661
664
  return None
662
665
  return max(versions).version
663
666
 
664
- @property
665
- def prev_version(self) -> Optional[str]:
666
- """Returns previous version of a dataset"""
667
- if len(self.versions) == 1:
667
+ def latest_compatible_version(self, version_spec: str) -> Optional[str]:
668
+ """
669
+ Returns the latest version that matches the given version specifier.
670
+
671
+ Supports Python version specifiers like:
672
+ - ">=1.0.0,<2.0.0" (compatible release range)
673
+ - "~=1.4.2" (compatible release clause)
674
+ - "==1.2.*" (prefix matching)
675
+ - ">1.0.0" (exclusive ordered comparison)
676
+ - ">=1.0.0" (inclusive ordered comparison)
677
+ - "!=1.3.0" (version exclusion)
678
+
679
+ Args:
680
+ version_spec: Version specifier string following PEP 440
681
+
682
+ Returns:
683
+ Latest compatible version string, or None if no compatible version found
684
+ """
685
+ spec_set = SpecifierSet(version_spec)
686
+
687
+ # Convert dataset versions to packaging.Version objects
688
+ # and filter compatible ones
689
+ compatible_versions = []
690
+ for v in self.versions:
691
+ pkg_version = Version(v.version)
692
+ if spec_set.contains(pkg_version):
693
+ compatible_versions.append(v)
694
+
695
+ if not compatible_versions:
668
696
  return None
669
697
 
670
- return sorted(self.versions)[-2].version
698
+ # Return the latest compatible version
699
+ return max(compatible_versions).version
671
700
 
672
701
  @classmethod
673
702
  def from_dict(cls, d: dict[str, Any]) -> "DatasetRecord":
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
6
6
  import datachain
7
7
  from datachain.dataset import DatasetDependency
8
8
  from datachain.error import DatasetNotFoundError
9
+ from datachain.project import Project
9
10
 
10
11
  if TYPE_CHECKING:
11
12
  from typing_extensions import Concatenate, ParamSpec
@@ -50,15 +51,24 @@ def _append_steps(dc: "DataChain", other: "DataChain"):
50
51
 
51
52
  def _get_delta_chain(
52
53
  source_ds_name: str,
54
+ source_ds_project: Project,
53
55
  source_ds_version: str,
54
56
  source_ds_latest_version: str,
55
57
  on: Union[str, Sequence[str]],
56
58
  compare: Optional[Union[str, Sequence[str]]] = None,
57
59
  ) -> "DataChain":
58
60
  """Get delta chain for processing changes between versions."""
59
- source_dc = datachain.read_dataset(source_ds_name, version=source_ds_version)
61
+ source_dc = datachain.read_dataset(
62
+ source_ds_name,
63
+ namespace=source_ds_project.namespace.name,
64
+ project=source_ds_project.name,
65
+ version=source_ds_version,
66
+ )
60
67
  source_dc_latest = datachain.read_dataset(
61
- source_ds_name, version=source_ds_latest_version
68
+ source_ds_name,
69
+ namespace=source_ds_project.namespace.name,
70
+ project=source_ds_project.name,
71
+ version=source_ds_latest_version,
62
72
  )
63
73
 
64
74
  # Calculate diff between source versions
@@ -67,12 +77,15 @@ def _get_delta_chain(
67
77
 
68
78
  def _get_retry_chain(
69
79
  name: str,
80
+ project: Project,
70
81
  latest_version: str,
71
82
  source_ds_name: str,
72
- source_ds_latest_version: str,
83
+ source_ds_project: Project,
84
+ source_ds_version: str,
73
85
  on: Union[str, Sequence[str]],
74
86
  right_on: Optional[Union[str, Sequence[str]]],
75
87
  delta_retry: Optional[Union[bool, str]],
88
+ diff_chain: "DataChain",
76
89
  ) -> Optional["DataChain"]:
77
90
  """Get retry chain for processing error records and missing records."""
78
91
  # Import here to avoid circular import
@@ -81,35 +94,49 @@ def _get_retry_chain(
81
94
  retry_chain = None
82
95
 
83
96
  # Read the latest version of the result dataset for retry logic
84
- result_dataset = datachain.read_dataset(name, version=latest_version)
85
- source_dc_latest = datachain.read_dataset(
86
- source_ds_name, version=source_ds_latest_version
97
+ result_dataset = datachain.read_dataset(
98
+ name,
99
+ namespace=project.namespace.name,
100
+ project=project.name,
101
+ version=latest_version,
102
+ )
103
+ source_dc = datachain.read_dataset(
104
+ source_ds_name,
105
+ namespace=source_ds_project.namespace.name,
106
+ project=source_ds_project.name,
107
+ version=source_ds_version,
87
108
  )
88
109
 
89
110
  # Handle error records if delta_retry is a string (column name)
90
111
  if isinstance(delta_retry, str):
91
112
  error_records = result_dataset.filter(C(delta_retry) != "")
92
- error_source_records = source_dc_latest.merge(
113
+ error_source_records = source_dc.merge(
93
114
  error_records, on=on, right_on=right_on, inner=True
94
- ).select(*list(source_dc_latest.signals_schema.values))
115
+ ).select(*list(source_dc.signals_schema.values))
95
116
  retry_chain = error_source_records
96
117
 
97
118
  # Handle missing records if delta_retry is True
98
119
  elif delta_retry is True:
99
- missing_records = source_dc_latest.subtract(
100
- result_dataset, on=on, right_on=right_on
101
- )
120
+ missing_records = source_dc.subtract(result_dataset, on=on, right_on=right_on)
102
121
  retry_chain = missing_records
103
122
 
104
- return retry_chain
123
+ # Subtract also diff chain since some items might be picked
124
+ # up by `delta=True` itself (e.g. records got modified AND are missing in the
125
+ # result dataset atm)
126
+ return retry_chain.subtract(diff_chain, on=on) if retry_chain else None
105
127
 
106
128
 
107
129
  def _get_source_info(
108
130
  name: str,
131
+ project: Project,
109
132
  latest_version: str,
110
133
  catalog,
111
134
  ) -> tuple[
112
- Optional[str], Optional[str], Optional[str], Optional[list[DatasetDependency]]
135
+ Optional[str],
136
+ Optional[Project],
137
+ Optional[str],
138
+ Optional[str],
139
+ Optional[list[DatasetDependency]],
113
140
  ]:
114
141
  """Get source dataset information and dependencies.
115
142
 
@@ -118,23 +145,34 @@ def _get_source_info(
118
145
  Returns (None, None, None, None) if source dataset was removed.
119
146
  """
120
147
  dependencies = catalog.get_dataset_dependencies(
121
- name, latest_version, indirect=False
148
+ name, latest_version, project=project, indirect=False
122
149
  )
123
150
 
124
151
  dep = dependencies[0]
125
152
  if not dep:
126
153
  # Starting dataset was removed, back off to normal dataset creation
127
- return None, None, None, None
154
+ return None, None, None, None, None
128
155
 
156
+ source_ds_project = catalog.metastore.get_project(dep.project, dep.namespace)
129
157
  source_ds_name = dep.name
130
158
  source_ds_version = dep.version
131
- source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
132
-
133
- return source_ds_name, source_ds_version, source_ds_latest_version, dependencies
159
+ source_ds_latest_version = catalog.get_dataset(
160
+ source_ds_name, project=source_ds_project
161
+ ).latest_version
162
+
163
+ return (
164
+ source_ds_name,
165
+ source_ds_project,
166
+ source_ds_version,
167
+ source_ds_latest_version,
168
+ dependencies,
169
+ )
134
170
 
135
171
 
136
172
  def delta_retry_update(
137
173
  dc: "DataChain",
174
+ namespace_name: str,
175
+ project_name: str,
138
176
  name: str,
139
177
  on: Union[str, Sequence[str]],
140
178
  right_on: Optional[Union[str, Sequence[str]]] = None,
@@ -173,11 +211,12 @@ def delta_retry_update(
173
211
  """
174
212
 
175
213
  catalog = dc.session.catalog
214
+ project = catalog.metastore.get_project(project_name, namespace_name)
176
215
  dc._query.apply_listing_pre_step()
177
216
 
178
217
  # Check if dataset exists
179
218
  try:
180
- dataset = catalog.get_dataset(name)
219
+ dataset = catalog.get_dataset(name, project=project)
181
220
  latest_version = dataset.latest_version
182
221
  except DatasetNotFoundError:
183
222
  # First creation of result dataset
@@ -189,19 +228,29 @@ def delta_retry_update(
189
228
  retry_chain = None
190
229
  processing_chain = None
191
230
 
192
- source_ds_name, source_ds_version, source_ds_latest_version, dependencies = (
193
- _get_source_info(name, latest_version, catalog)
194
- )
231
+ (
232
+ source_ds_name,
233
+ source_ds_project,
234
+ source_ds_version,
235
+ source_ds_latest_version,
236
+ dependencies,
237
+ ) = _get_source_info(name, project, latest_version, catalog)
195
238
 
196
239
  # If source_ds_name is None, starting dataset was removed
197
240
  if source_ds_name is None:
198
241
  return None, None, True
199
242
 
243
+ assert source_ds_project
200
244
  assert source_ds_version
201
245
  assert source_ds_latest_version
202
246
 
203
247
  diff_chain = _get_delta_chain(
204
- source_ds_name, source_ds_version, source_ds_latest_version, on, compare
248
+ source_ds_name,
249
+ source_ds_project,
250
+ source_ds_version,
251
+ source_ds_latest_version,
252
+ on,
253
+ compare,
205
254
  )
206
255
 
207
256
  # Filter out removed dep
@@ -215,12 +264,15 @@ def delta_retry_update(
215
264
  if delta_retry:
216
265
  retry_chain = _get_retry_chain(
217
266
  name,
267
+ project,
218
268
  latest_version,
219
269
  source_ds_name,
220
- source_ds_latest_version,
270
+ source_ds_project,
271
+ source_ds_version,
221
272
  on,
222
273
  right_on,
223
274
  delta_retry,
275
+ diff_chain,
224
276
  )
225
277
 
226
278
  # Combine delta and retry chains
@@ -236,7 +288,12 @@ def delta_retry_update(
236
288
  if processing_chain is None or (processing_chain and processing_chain.empty):
237
289
  return None, None, False
238
290
 
239
- latest_dataset = datachain.read_dataset(name, version=latest_version)
291
+ latest_dataset = datachain.read_dataset(
292
+ name,
293
+ namespace=project.namespace.name,
294
+ project=project.name,
295
+ version=latest_version,
296
+ )
240
297
  compared_chain = latest_dataset.diff(
241
298
  processing_chain,
242
299
  on=right_on or on,
@@ -598,6 +598,8 @@ class DataChain:
598
598
 
599
599
  result_ds, dependencies, has_changes = delta_retry_update(
600
600
  self,
601
+ namespace_name,
602
+ project_name,
601
603
  name,
602
604
  on=self._delta_on,
603
605
  right_on=self._delta_result_on,
@@ -7,9 +7,6 @@ from datachain.error import (
7
7
  ProjectNotFoundError,
8
8
  )
9
9
  from datachain.lib.dataset_info import DatasetInfo
10
- from datachain.lib.file import (
11
- File,
12
- )
13
10
  from datachain.lib.projects import get as get_project
14
11
  from datachain.lib.settings import Settings
15
12
  from datachain.lib.signal_schema import SignalSchema
@@ -34,7 +31,6 @@ def read_dataset(
34
31
  version: Optional[Union[str, int]] = None,
35
32
  session: Optional[Session] = None,
36
33
  settings: Optional[dict] = None,
37
- fallback_to_studio: bool = True,
38
34
  delta: Optional[bool] = False,
39
35
  delta_on: Optional[Union[str, Sequence[str]]] = (
40
36
  "file.path",
@@ -44,6 +40,7 @@ def read_dataset(
44
40
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
45
41
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
46
42
  delta_retry: Optional[Union[bool, str]] = None,
43
+ update: bool = False,
47
44
  ) -> "DataChain":
48
45
  """Get data from a saved Dataset. It returns the chain itself.
49
46
  If dataset or version is not found locally, it will try to pull it from Studio.
@@ -55,11 +52,12 @@ def read_dataset(
55
52
  set; otherwise, default values will be applied.
56
53
  namespace : optional name of namespace in which dataset to read is created
57
54
  project : optional name of project in which dataset to read is created
58
- version : dataset version
55
+ version : dataset version. Supports:
56
+ - Exact version strings: "1.2.3"
57
+ - Legacy integer versions: 1, 2, 3 (finds latest major version)
58
+ - Version specifiers (PEP 440): ">=1.0.0,<2.0.0", "~=1.4.2", "==1.2.*", etc.
59
59
  session : Session to use for the chain.
60
60
  settings : Settings to use for the chain.
61
- fallback_to_studio : Try to pull dataset from Studio if not found locally.
62
- Default is True.
63
61
  delta: If True, only process new or changed files instead of reprocessing
64
62
  everything. This saves time by skipping files that were already processed in
65
63
  previous versions. The optimization is working when a new version of the
@@ -79,6 +77,10 @@ def read_dataset(
79
77
  (error mode)
80
78
  - True: Reprocess records missing from the result dataset (missing mode)
81
79
  - None: No retry processing (default)
80
+ update: If True always checks for newer versions available on Studio, even if
81
+ some version of the dataset exists locally already. If False (default), it
82
+ will only fetch the dataset from Studio if it is not found locally.
83
+
82
84
 
83
85
  Example:
84
86
  ```py
@@ -92,11 +94,22 @@ def read_dataset(
92
94
  ```
93
95
 
94
96
  ```py
95
- chain = dc.read_dataset("my_cats", fallback_to_studio=False)
97
+ chain = dc.read_dataset("my_cats", version="1.0.0")
96
98
  ```
97
99
 
98
100
  ```py
99
- chain = dc.read_dataset("my_cats", version="1.0.0")
101
+ # Using version specifiers (PEP 440)
102
+ chain = dc.read_dataset("my_cats", version=">=1.0.0,<2.0.0")
103
+ ```
104
+
105
+ ```py
106
+ # Legacy integer version support (finds latest in major version)
107
+ chain = dc.read_dataset("my_cats", version=1) # Latest 1.x.x version
108
+ ```
109
+
110
+ ```py
111
+ # Always check for newer versions matching a version specifier from Studio
112
+ chain = dc.read_dataset("my_cats", version=">=1.0.0", update=True)
100
113
  ```
101
114
 
102
115
  ```py
@@ -113,7 +126,6 @@ def read_dataset(
113
126
  version="1.0.0",
114
127
  session=session,
115
128
  settings=settings,
116
- fallback_to_studio=True,
117
129
  )
118
130
  ```
119
131
  """
@@ -121,6 +133,8 @@ def read_dataset(
121
133
 
122
134
  from .datachain import DataChain
123
135
 
136
+ telemetry.send_event_once("class", "datachain_init", name=name, version=version)
137
+
124
138
  session = Session.get(session)
125
139
  catalog = session.catalog
126
140
 
@@ -131,31 +145,37 @@ def read_dataset(
131
145
  )
132
146
 
133
147
  if version is not None:
148
+ dataset = session.catalog.get_dataset_with_remote_fallback(
149
+ name, namespace_name, project_name, update=update
150
+ )
151
+
152
+ # Convert legacy integer versions to version specifiers
153
+ # For backward compatibility we still allow users to put version as integer
154
+ # in which case we convert it to a version specifier that finds the latest
155
+ # version where major part is equal to that input version.
156
+ # For example if user sets version=2, we convert it to ">=2.0.0,<3.0.0"
157
+ # which will find something like 2.4.3 (assuming 2.4.3 is the biggest among
158
+ # all 2.* dataset versions)
159
+ if isinstance(version, int):
160
+ version_spec = f">={version}.0.0,<{version + 1}.0.0"
161
+ else:
162
+ version_spec = str(version)
163
+
164
+ from packaging.specifiers import InvalidSpecifier, SpecifierSet
165
+
134
166
  try:
135
- # for backward compatibility we still allow users to put version as integer
136
- # in which case we are trying to find latest version where major part is
137
- # equal to that input version. For example if user sets version=2, we could
138
- # continue with something like 2.4.3 (assuming 2.4.3 is the biggest among
139
- # all 2.* dataset versions). If dataset doesn't have any versions where
140
- # major part is equal to that input, exception is thrown.
141
- major = int(version)
142
- try:
143
- ds_project = get_project(project_name, namespace_name, session=session)
144
- except ProjectNotFoundError:
145
- raise DatasetNotFoundError(
146
- f"Dataset {name} not found in namespace {namespace_name} and",
147
- f" project {project_name}",
148
- ) from None
149
-
150
- dataset = session.catalog.get_dataset(name, ds_project)
151
- latest_major = dataset.latest_major_version(major)
152
- if not latest_major:
167
+ # Try to parse as version specifier
168
+ SpecifierSet(version_spec)
169
+ # If it's a valid specifier set, find the latest compatible version
170
+ latest_compatible = dataset.latest_compatible_version(version_spec)
171
+ if not latest_compatible:
153
172
  raise DatasetVersionNotFoundError(
154
- f"Dataset {name} does not have version {version}"
173
+ f"No dataset {name} version matching specifier {version_spec}"
155
174
  )
156
- version = latest_major
157
- except ValueError:
158
- # version is in new semver string format, continuing as normal
175
+ version = latest_compatible
176
+ except InvalidSpecifier:
177
+ # If not a valid specifier, treat as exact version string
178
+ # This handles cases like "1.2.3" which are exact versions, not specifiers
159
179
  pass
160
180
 
161
181
  if settings:
@@ -169,11 +189,8 @@ def read_dataset(
169
189
  namespace_name=namespace_name,
170
190
  version=version, # type: ignore[arg-type]
171
191
  session=session,
172
- indexing_column_types=File._datachain_column_types,
173
- fallback_to_studio=fallback_to_studio,
174
192
  )
175
193
 
176
- telemetry.send_event_once("class", "datachain_init", name=name, version=version)
177
194
  signals_schema = SignalSchema({"sys": Sys})
178
195
  if query.feature_schema:
179
196
  signals_schema |= SignalSchema.deserialize(query.feature_schema)
@@ -127,12 +127,8 @@ def read_listing_dataset(
127
127
  if version is None:
128
128
  version = dataset.latest_version
129
129
 
130
- query = DatasetQuery(
131
- name=name,
132
- session=session,
133
- indexing_column_types=File._datachain_column_types,
134
- fallback_to_studio=False,
135
- )
130
+ query = DatasetQuery(name=name, session=session)
131
+
136
132
  if settings:
137
133
  cfg = {**settings}
138
134
  if "prefetch" not in cfg:
@@ -54,7 +54,7 @@ def get(name: str, namespace: str, session: Optional[Session]) -> Project:
54
54
  ```py
55
55
  import datachain as dc
56
56
  from datachain.lib.projects import get as get_project
57
- project = get_project("my-project", "local")
57
+ project = get_project("my-project", "local")
58
58
  ```
59
59
  """
60
60
  return Session.get(session).catalog.metastore.get_project(name, namespace)
@@ -1099,13 +1099,9 @@ class DatasetQuery:
1099
1099
  namespace_name: Optional[str] = None,
1100
1100
  catalog: Optional["Catalog"] = None,
1101
1101
  session: Optional[Session] = None,
1102
- indexing_column_types: Optional[dict[str, Any]] = None,
1103
1102
  in_memory: bool = False,
1104
- fallback_to_studio: bool = True,
1105
1103
  update: bool = False,
1106
1104
  ) -> None:
1107
- from datachain.remote.studio import is_token_set
1108
-
1109
1105
  self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
1110
1106
  self.catalog = catalog or self.session.catalog
1111
1107
  self.steps: list[Step] = []
@@ -1137,18 +1133,16 @@ class DatasetQuery:
1137
1133
  # not setting query step yet as listing dataset might not exist at
1138
1134
  # this point
1139
1135
  self.list_ds_name = name
1140
- elif fallback_to_studio and is_token_set():
1136
+ else:
1141
1137
  self._set_starting_step(
1142
1138
  self.catalog.get_dataset_with_remote_fallback(
1143
1139
  name,
1144
1140
  namespace_name=namespace_name,
1145
1141
  project_name=project_name,
1146
1142
  version=version,
1143
+ pull_dataset=True,
1147
1144
  )
1148
1145
  )
1149
- else:
1150
- project = self.catalog.metastore.get_project(project_name, namespace_name)
1151
- self._set_starting_step(self.catalog.get_dataset(name, project=project))
1152
1146
 
1153
1147
  def _set_starting_step(self, ds: "DatasetRecord") -> None:
1154
1148
  if not self.version:
@@ -78,10 +78,11 @@ def _parse_dates(obj: dict, date_fields: list[str]):
78
78
 
79
79
 
80
80
  class Response(Generic[T]):
81
- def __init__(self, data: T, ok: bool, message: str) -> None:
81
+ def __init__(self, data: T, ok: bool, message: str, status: int) -> None:
82
82
  self.data = data
83
83
  self.ok = ok
84
84
  self.message = message
85
+ self.status = status
85
86
 
86
87
  def __repr__(self):
87
88
  return (
@@ -186,7 +187,7 @@ class StudioClient:
186
187
  message = "Indexing in progress"
187
188
  else:
188
189
  message = content.get("message", "")
189
- return Response(response_data, ok, message)
190
+ return Response(response_data, ok, message, response.status_code)
190
191
 
191
192
  @retry_with_backoff(retries=3, errors=(HTTPError, Timeout))
192
193
  def _send_request(
@@ -236,7 +237,7 @@ class StudioClient:
236
237
  else:
237
238
  message = ""
238
239
 
239
- return Response(data, ok, message)
240
+ return Response(data, ok, message, response.status_code)
240
241
 
241
242
  @staticmethod
242
243
  def _unpacker_hook(code, data):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.23.0
3
+ Version: 0.24.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -298,6 +298,8 @@ tests/func/test_pull.py
298
298
  tests/func/test_pytorch.py
299
299
  tests/func/test_query.py
300
300
  tests/func/test_read_database.py
301
+ tests/func/test_read_dataset_remote.py
302
+ tests/func/test_read_dataset_version_specifiers.py
301
303
  tests/func/test_retry.py
302
304
  tests/func/test_session.py
303
305
  tests/func/test_toolkit.py