datachain 0.24.1__tar.gz → 0.24.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (399)
  1. {datachain-0.24.1 → datachain-0.24.3}/.pre-commit-config.yaml +2 -2
  2. {datachain-0.24.1 → datachain-0.24.3}/PKG-INFO +1 -1
  3. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/catalog/catalog.py +11 -2
  4. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/data_storage/metastore.py +3 -1
  5. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/data_storage/sqlite.py +9 -6
  6. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/data_storage/warehouse.py +6 -4
  7. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/listing.py +10 -3
  8. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/query/dataset.py +11 -10
  9. {datachain-0.24.1 → datachain-0.24.3}/src/datachain.egg-info/PKG-INFO +1 -1
  10. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_datachain.py +85 -0
  11. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_delta.py +7 -1
  12. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_datachain.py +24 -0
  13. {datachain-0.24.1 → datachain-0.24.3}/.cruft.json +0 -0
  14. {datachain-0.24.1 → datachain-0.24.3}/.gitattributes +0 -0
  15. {datachain-0.24.1 → datachain-0.24.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  16. {datachain-0.24.1 → datachain-0.24.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  17. {datachain-0.24.1 → datachain-0.24.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  18. {datachain-0.24.1 → datachain-0.24.3}/.github/codecov.yaml +0 -0
  19. {datachain-0.24.1 → datachain-0.24.3}/.github/dependabot.yml +0 -0
  20. {datachain-0.24.1 → datachain-0.24.3}/.github/workflows/benchmarks.yml +0 -0
  21. {datachain-0.24.1 → datachain-0.24.3}/.github/workflows/release.yml +0 -0
  22. {datachain-0.24.1 → datachain-0.24.3}/.github/workflows/tests-studio.yml +0 -0
  23. {datachain-0.24.1 → datachain-0.24.3}/.github/workflows/tests.yml +0 -0
  24. {datachain-0.24.1 → datachain-0.24.3}/.github/workflows/update-template.yaml +0 -0
  25. {datachain-0.24.1 → datachain-0.24.3}/.gitignore +0 -0
  26. {datachain-0.24.1 → datachain-0.24.3}/CODE_OF_CONDUCT.rst +0 -0
  27. {datachain-0.24.1 → datachain-0.24.3}/LICENSE +0 -0
  28. {datachain-0.24.1 → datachain-0.24.3}/README.rst +0 -0
  29. {datachain-0.24.1 → datachain-0.24.3}/docs/assets/captioned_cartoons.png +0 -0
  30. {datachain-0.24.1 → datachain-0.24.3}/docs/assets/datachain-white.svg +0 -0
  31. {datachain-0.24.1 → datachain-0.24.3}/docs/assets/datachain.svg +0 -0
  32. {datachain-0.24.1 → datachain-0.24.3}/docs/commands/auth/login.md +0 -0
  33. {datachain-0.24.1 → datachain-0.24.3}/docs/commands/auth/logout.md +0 -0
  34. {datachain-0.24.1 → datachain-0.24.3}/docs/commands/auth/team.md +0 -0
  35. {datachain-0.24.1 → datachain-0.24.3}/docs/commands/auth/token.md +0 -0
  36. {datachain-0.24.1 → datachain-0.24.3}/docs/commands/index.md +0 -0
  37. {datachain-0.24.1 → datachain-0.24.3}/docs/commands/job/cancel.md +0 -0
  38. {datachain-0.24.1 → datachain-0.24.3}/docs/commands/job/clusters.md +0 -0
  39. {datachain-0.24.1 → datachain-0.24.3}/docs/commands/job/logs.md +0 -0
  40. {datachain-0.24.1 → datachain-0.24.3}/docs/commands/job/ls.md +0 -0
  41. {datachain-0.24.1 → datachain-0.24.3}/docs/commands/job/run.md +0 -0
  42. {datachain-0.24.1 → datachain-0.24.3}/docs/contributing.md +0 -0
  43. {datachain-0.24.1 → datachain-0.24.3}/docs/css/github-permalink-style.css +0 -0
  44. {datachain-0.24.1 → datachain-0.24.3}/docs/examples.md +0 -0
  45. {datachain-0.24.1 → datachain-0.24.3}/docs/guide/db_migrations.md +0 -0
  46. {datachain-0.24.1 → datachain-0.24.3}/docs/guide/delta.md +0 -0
  47. {datachain-0.24.1 → datachain-0.24.3}/docs/guide/env.md +0 -0
  48. {datachain-0.24.1 → datachain-0.24.3}/docs/guide/index.md +0 -0
  49. {datachain-0.24.1 → datachain-0.24.3}/docs/guide/namespaces.md +0 -0
  50. {datachain-0.24.1 → datachain-0.24.3}/docs/guide/processing.md +0 -0
  51. {datachain-0.24.1 → datachain-0.24.3}/docs/guide/remotes.md +0 -0
  52. {datachain-0.24.1 → datachain-0.24.3}/docs/guide/retry.md +0 -0
  53. {datachain-0.24.1 → datachain-0.24.3}/docs/index.md +0 -0
  54. {datachain-0.24.1 → datachain-0.24.3}/docs/overrides/main.html +0 -0
  55. {datachain-0.24.1 → datachain-0.24.3}/docs/quick-start.md +0 -0
  56. {datachain-0.24.1 → datachain-0.24.3}/docs/references/data-types/arrowrow.md +0 -0
  57. {datachain-0.24.1 → datachain-0.24.3}/docs/references/data-types/bbox.md +0 -0
  58. {datachain-0.24.1 → datachain-0.24.3}/docs/references/data-types/file.md +0 -0
  59. {datachain-0.24.1 → datachain-0.24.3}/docs/references/data-types/imagefile.md +0 -0
  60. {datachain-0.24.1 → datachain-0.24.3}/docs/references/data-types/index.md +0 -0
  61. {datachain-0.24.1 → datachain-0.24.3}/docs/references/data-types/pose.md +0 -0
  62. {datachain-0.24.1 → datachain-0.24.3}/docs/references/data-types/segment.md +0 -0
  63. {datachain-0.24.1 → datachain-0.24.3}/docs/references/data-types/tarvfile.md +0 -0
  64. {datachain-0.24.1 → datachain-0.24.3}/docs/references/data-types/textfile.md +0 -0
  65. {datachain-0.24.1 → datachain-0.24.3}/docs/references/data-types/videofile.md +0 -0
  66. {datachain-0.24.1 → datachain-0.24.3}/docs/references/datachain.md +0 -0
  67. {datachain-0.24.1 → datachain-0.24.3}/docs/references/func.md +0 -0
  68. {datachain-0.24.1 → datachain-0.24.3}/docs/references/index.md +0 -0
  69. {datachain-0.24.1 → datachain-0.24.3}/docs/references/toolkit.md +0 -0
  70. {datachain-0.24.1 → datachain-0.24.3}/docs/references/torch.md +0 -0
  71. {datachain-0.24.1 → datachain-0.24.3}/docs/references/udf.md +0 -0
  72. {datachain-0.24.1 → datachain-0.24.3}/docs/tutorials.md +0 -0
  73. {datachain-0.24.1 → datachain-0.24.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  74. {datachain-0.24.1 → datachain-0.24.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  75. {datachain-0.24.1 → datachain-0.24.3}/examples/computer_vision/openimage-detect.py +0 -0
  76. {datachain-0.24.1 → datachain-0.24.3}/examples/computer_vision/ultralytics-bbox.py +0 -0
  77. {datachain-0.24.1 → datachain-0.24.3}/examples/computer_vision/ultralytics-pose.py +0 -0
  78. {datachain-0.24.1 → datachain-0.24.3}/examples/computer_vision/ultralytics-segment.py +0 -0
  79. {datachain-0.24.1 → datachain-0.24.3}/examples/get_started/common_sql_functions.py +0 -0
  80. {datachain-0.24.1 → datachain-0.24.3}/examples/get_started/json-csv-reader.py +0 -0
  81. {datachain-0.24.1 → datachain-0.24.3}/examples/get_started/torch-loader.py +0 -0
  82. {datachain-0.24.1 → datachain-0.24.3}/examples/get_started/udfs/parallel.py +0 -0
  83. {datachain-0.24.1 → datachain-0.24.3}/examples/get_started/udfs/simple.py +0 -0
  84. {datachain-0.24.1 → datachain-0.24.3}/examples/get_started/udfs/stateful.py +0 -0
  85. {datachain-0.24.1 → datachain-0.24.3}/examples/incremental_processing/delta.py +0 -0
  86. {datachain-0.24.1 → datachain-0.24.3}/examples/incremental_processing/retry.py +0 -0
  87. {datachain-0.24.1 → datachain-0.24.3}/examples/incremental_processing/utils.py +0 -0
  88. {datachain-0.24.1 → datachain-0.24.3}/examples/llm_and_nlp/claude-query.py +0 -0
  89. {datachain-0.24.1 → datachain-0.24.3}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  90. {datachain-0.24.1 → datachain-0.24.3}/examples/multimodal/clip_inference.py +0 -0
  91. {datachain-0.24.1 → datachain-0.24.3}/examples/multimodal/hf_pipeline.py +0 -0
  92. {datachain-0.24.1 → datachain-0.24.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
  93. {datachain-0.24.1 → datachain-0.24.3}/examples/multimodal/wds.py +0 -0
  94. {datachain-0.24.1 → datachain-0.24.3}/examples/multimodal/wds_filtered.py +0 -0
  95. {datachain-0.24.1 → datachain-0.24.3}/mkdocs.yml +0 -0
  96. {datachain-0.24.1 → datachain-0.24.3}/noxfile.py +0 -0
  97. {datachain-0.24.1 → datachain-0.24.3}/pyproject.toml +0 -0
  98. {datachain-0.24.1 → datachain-0.24.3}/setup.cfg +0 -0
  99. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/__init__.py +0 -0
  100. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/__main__.py +0 -0
  101. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/asyn.py +0 -0
  102. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cache.py +0 -0
  103. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/catalog/__init__.py +0 -0
  104. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/catalog/datasource.py +0 -0
  105. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/catalog/loader.py +0 -0
  106. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/__init__.py +0 -0
  107. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/commands/__init__.py +0 -0
  108. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/commands/datasets.py +0 -0
  109. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/commands/du.py +0 -0
  110. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/commands/index.py +0 -0
  111. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/commands/ls.py +0 -0
  112. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/commands/misc.py +0 -0
  113. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/commands/query.py +0 -0
  114. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/commands/show.py +0 -0
  115. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/parser/__init__.py +0 -0
  116. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/parser/job.py +0 -0
  117. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/parser/studio.py +0 -0
  118. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/parser/utils.py +0 -0
  119. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/cli/utils.py +0 -0
  120. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/client/__init__.py +0 -0
  121. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/client/azure.py +0 -0
  122. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/client/fileslice.py +0 -0
  123. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/client/fsspec.py +0 -0
  124. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/client/gcs.py +0 -0
  125. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/client/hf.py +0 -0
  126. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/client/local.py +0 -0
  127. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/client/s3.py +0 -0
  128. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/config.py +0 -0
  129. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/data_storage/__init__.py +0 -0
  130. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/data_storage/db_engine.py +0 -0
  131. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/data_storage/job.py +0 -0
  132. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/data_storage/schema.py +0 -0
  133. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/data_storage/serializer.py +0 -0
  134. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/dataset.py +0 -0
  135. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/delta.py +0 -0
  136. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/diff/__init__.py +0 -0
  137. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/error.py +0 -0
  138. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/fs/__init__.py +0 -0
  139. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/fs/reference.py +0 -0
  140. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/fs/utils.py +0 -0
  141. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/func/__init__.py +0 -0
  142. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/func/aggregate.py +0 -0
  143. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/func/array.py +0 -0
  144. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/func/base.py +0 -0
  145. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/func/conditional.py +0 -0
  146. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/func/func.py +0 -0
  147. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/func/numeric.py +0 -0
  148. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/func/path.py +0 -0
  149. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/func/random.py +0 -0
  150. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/func/string.py +0 -0
  151. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/func/window.py +0 -0
  152. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/job.py +0 -0
  153. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/__init__.py +0 -0
  154. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/arrow.py +0 -0
  155. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/clip.py +0 -0
  156. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/convert/__init__.py +0 -0
  157. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/convert/flatten.py +0 -0
  158. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
  159. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
  160. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/convert/unflatten.py +0 -0
  161. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  162. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/data_model.py +0 -0
  163. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dataset_info.py +0 -0
  164. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/__init__.py +0 -0
  165. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/csv.py +0 -0
  166. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/database.py +0 -0
  167. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/datachain.py +0 -0
  168. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/datasets.py +0 -0
  169. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/hf.py +0 -0
  170. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/json.py +0 -0
  171. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/listings.py +0 -0
  172. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/pandas.py +0 -0
  173. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/parquet.py +0 -0
  174. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/records.py +0 -0
  175. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/storage.py +0 -0
  176. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/utils.py +0 -0
  177. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/dc/values.py +0 -0
  178. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/file.py +0 -0
  179. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/hf.py +0 -0
  180. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/image.py +0 -0
  181. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/listing.py +0 -0
  182. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/listing_info.py +0 -0
  183. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/meta_formats.py +0 -0
  184. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/model_store.py +0 -0
  185. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/namespaces.py +0 -0
  186. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/projects.py +0 -0
  187. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/pytorch.py +0 -0
  188. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/settings.py +0 -0
  189. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/signal_schema.py +0 -0
  190. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/tar.py +0 -0
  191. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/text.py +0 -0
  192. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/udf.py +0 -0
  193. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/udf_signature.py +0 -0
  194. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/utils.py +0 -0
  195. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/video.py +0 -0
  196. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/webdataset.py +0 -0
  197. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/lib/webdataset_laion.py +0 -0
  198. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/model/__init__.py +0 -0
  199. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/model/bbox.py +0 -0
  200. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/model/pose.py +0 -0
  201. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/model/segment.py +0 -0
  202. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/model/ultralytics/__init__.py +0 -0
  203. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/model/ultralytics/bbox.py +0 -0
  204. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/model/ultralytics/pose.py +0 -0
  205. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/model/ultralytics/segment.py +0 -0
  206. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/model/utils.py +0 -0
  207. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/namespace.py +0 -0
  208. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/node.py +0 -0
  209. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/nodes_fetcher.py +0 -0
  210. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/nodes_thread_pool.py +0 -0
  211. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/progress.py +0 -0
  212. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/project.py +0 -0
  213. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/py.typed +0 -0
  214. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/query/__init__.py +0 -0
  215. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/query/batch.py +0 -0
  216. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/query/dispatch.py +0 -0
  217. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/query/metrics.py +0 -0
  218. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/query/params.py +0 -0
  219. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/query/queue.py +0 -0
  220. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/query/schema.py +0 -0
  221. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/query/session.py +0 -0
  222. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/query/udf.py +0 -0
  223. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/query/utils.py +0 -0
  224. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/remote/__init__.py +0 -0
  225. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/remote/studio.py +0 -0
  226. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/script_meta.py +0 -0
  227. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/semver.py +0 -0
  228. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/__init__.py +0 -0
  229. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/default/__init__.py +0 -0
  230. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/default/base.py +0 -0
  231. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/functions/__init__.py +0 -0
  232. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/functions/aggregate.py +0 -0
  233. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/functions/array.py +0 -0
  234. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/functions/conditional.py +0 -0
  235. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/functions/numeric.py +0 -0
  236. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/functions/path.py +0 -0
  237. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/functions/random.py +0 -0
  238. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/functions/string.py +0 -0
  239. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/selectable.py +0 -0
  240. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/sqlite/__init__.py +0 -0
  241. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/sqlite/base.py +0 -0
  242. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/sqlite/types.py +0 -0
  243. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/sqlite/vector.py +0 -0
  244. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/types.py +0 -0
  245. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/sql/utils.py +0 -0
  246. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/studio.py +0 -0
  247. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/telemetry.py +0 -0
  248. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/toolkit/__init__.py +0 -0
  249. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/toolkit/split.py +0 -0
  250. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/torch/__init__.py +0 -0
  251. {datachain-0.24.1 → datachain-0.24.3}/src/datachain/utils.py +0 -0
  252. {datachain-0.24.1 → datachain-0.24.3}/src/datachain.egg-info/SOURCES.txt +0 -0
  253. {datachain-0.24.1 → datachain-0.24.3}/src/datachain.egg-info/dependency_links.txt +0 -0
  254. {datachain-0.24.1 → datachain-0.24.3}/src/datachain.egg-info/entry_points.txt +0 -0
  255. {datachain-0.24.1 → datachain-0.24.3}/src/datachain.egg-info/requires.txt +0 -0
  256. {datachain-0.24.1 → datachain-0.24.3}/src/datachain.egg-info/top_level.txt +0 -0
  257. {datachain-0.24.1 → datachain-0.24.3}/tests/__init__.py +0 -0
  258. {datachain-0.24.1 → datachain-0.24.3}/tests/benchmarks/__init__.py +0 -0
  259. {datachain-0.24.1 → datachain-0.24.3}/tests/benchmarks/conftest.py +0 -0
  260. {datachain-0.24.1 → datachain-0.24.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  261. {datachain-0.24.1 → datachain-0.24.3}/tests/benchmarks/datasets/.dvc/config +0 -0
  262. {datachain-0.24.1 → datachain-0.24.3}/tests/benchmarks/datasets/.gitignore +0 -0
  263. {datachain-0.24.1 → datachain-0.24.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  264. {datachain-0.24.1 → datachain-0.24.3}/tests/benchmarks/test_datachain.py +0 -0
  265. {datachain-0.24.1 → datachain-0.24.3}/tests/benchmarks/test_ls.py +0 -0
  266. {datachain-0.24.1 → datachain-0.24.3}/tests/benchmarks/test_version.py +0 -0
  267. {datachain-0.24.1 → datachain-0.24.3}/tests/conftest.py +0 -0
  268. {datachain-0.24.1 → datachain-0.24.3}/tests/data.py +0 -0
  269. {datachain-0.24.1 → datachain-0.24.3}/tests/examples/__init__.py +0 -0
  270. {datachain-0.24.1 → datachain-0.24.3}/tests/examples/test_examples.py +0 -0
  271. {datachain-0.24.1 → datachain-0.24.3}/tests/examples/test_wds_e2e.py +0 -0
  272. {datachain-0.24.1 → datachain-0.24.3}/tests/examples/wds_data.py +0 -0
  273. {datachain-0.24.1 → datachain-0.24.3}/tests/func/__init__.py +0 -0
  274. {datachain-0.24.1 → datachain-0.24.3}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  275. {datachain-0.24.1 → datachain-0.24.3}/tests/func/data/lena.jpg +0 -0
  276. {datachain-0.24.1 → datachain-0.24.3}/tests/func/fake-service-account-credentials.json +0 -0
  277. {datachain-0.24.1 → datachain-0.24.3}/tests/func/functions/__init__.py +0 -0
  278. {datachain-0.24.1 → datachain-0.24.3}/tests/func/functions/test_aggregate.py +0 -0
  279. {datachain-0.24.1 → datachain-0.24.3}/tests/func/functions/test_array.py +0 -0
  280. {datachain-0.24.1 → datachain-0.24.3}/tests/func/functions/test_conditional.py +0 -0
  281. {datachain-0.24.1 → datachain-0.24.3}/tests/func/functions/test_numeric.py +0 -0
  282. {datachain-0.24.1 → datachain-0.24.3}/tests/func/functions/test_path.py +0 -0
  283. {datachain-0.24.1 → datachain-0.24.3}/tests/func/functions/test_random.py +0 -0
  284. {datachain-0.24.1 → datachain-0.24.3}/tests/func/functions/test_string.py +0 -0
  285. {datachain-0.24.1 → datachain-0.24.3}/tests/func/model/__init__.py +0 -0
  286. {datachain-0.24.1 → datachain-0.24.3}/tests/func/model/data/running-mask0.png +0 -0
  287. {datachain-0.24.1 → datachain-0.24.3}/tests/func/model/data/running-mask1.png +0 -0
  288. {datachain-0.24.1 → datachain-0.24.3}/tests/func/model/data/running.jpg +0 -0
  289. {datachain-0.24.1 → datachain-0.24.3}/tests/func/model/data/ships.jpg +0 -0
  290. {datachain-0.24.1 → datachain-0.24.3}/tests/func/model/test_yolo.py +0 -0
  291. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_batching.py +0 -0
  292. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_catalog.py +0 -0
  293. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_client.py +0 -0
  294. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_cloud_transfer.py +0 -0
  295. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_data_storage.py +0 -0
  296. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_datachain_merge.py +0 -0
  297. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_dataset_query.py +0 -0
  298. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_datasets.py +0 -0
  299. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_feature_pickling.py +0 -0
  300. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_file.py +0 -0
  301. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_hf.py +0 -0
  302. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_hidden_field.py +0 -0
  303. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_image.py +0 -0
  304. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_listing.py +0 -0
  305. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_ls.py +0 -0
  306. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_meta_formats.py +0 -0
  307. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_metastore.py +0 -0
  308. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_metrics.py +0 -0
  309. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_pull.py +0 -0
  310. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_pytorch.py +0 -0
  311. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_query.py +0 -0
  312. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_read_database.py +0 -0
  313. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_read_dataset_remote.py +0 -0
  314. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  315. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_retry.py +0 -0
  316. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_session.py +0 -0
  317. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_toolkit.py +0 -0
  318. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_video.py +0 -0
  319. {datachain-0.24.1 → datachain-0.24.3}/tests/func/test_warehouse.py +0 -0
  320. {datachain-0.24.1 → datachain-0.24.3}/tests/scripts/feature_class.py +0 -0
  321. {datachain-0.24.1 → datachain-0.24.3}/tests/scripts/feature_class_exception.py +0 -0
  322. {datachain-0.24.1 → datachain-0.24.3}/tests/scripts/feature_class_parallel.py +0 -0
  323. {datachain-0.24.1 → datachain-0.24.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  324. {datachain-0.24.1 → datachain-0.24.3}/tests/scripts/name_len_slow.py +0 -0
  325. {datachain-0.24.1 → datachain-0.24.3}/tests/test_atomicity.py +0 -0
  326. {datachain-0.24.1 → datachain-0.24.3}/tests/test_cli_e2e.py +0 -0
  327. {datachain-0.24.1 → datachain-0.24.3}/tests/test_cli_studio.py +0 -0
  328. {datachain-0.24.1 → datachain-0.24.3}/tests/test_import_time.py +0 -0
  329. {datachain-0.24.1 → datachain-0.24.3}/tests/test_query_e2e.py +0 -0
  330. {datachain-0.24.1 → datachain-0.24.3}/tests/test_telemetry.py +0 -0
  331. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/__init__.py +0 -0
  332. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/__init__.py +0 -0
  333. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/conftest.py +0 -0
  334. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_arrow.py +0 -0
  335. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_clip.py +0 -0
  336. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  337. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_datachain_merge.py +0 -0
  338. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_diff.py +0 -0
  339. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_feature.py +0 -0
  340. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_feature_utils.py +0 -0
  341. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_file.py +0 -0
  342. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_hf.py +0 -0
  343. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_image.py +0 -0
  344. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_listing_info.py +0 -0
  345. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_namespace.py +0 -0
  346. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_project.py +0 -0
  347. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_python_to_sql.py +0 -0
  348. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_schema.py +0 -0
  349. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_signal_schema.py +0 -0
  350. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_sql_to_python.py +0 -0
  351. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_text.py +0 -0
  352. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_udf.py +0 -0
  353. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_udf_signature.py +0 -0
  354. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_utils.py +0 -0
  355. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/lib/test_webdataset.py +0 -0
  356. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/model/__init__.py +0 -0
  357. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/model/test_bbox.py +0 -0
  358. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/model/test_pose.py +0 -0
  359. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/model/test_segment.py +0 -0
  360. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/model/test_utils.py +0 -0
  361. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/sql/__init__.py +0 -0
  362. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/sql/sqlite/__init__.py +0 -0
  363. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/sql/sqlite/test_types.py +0 -0
  364. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
  365. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/sql/test_array.py +0 -0
  366. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/sql/test_conditional.py +0 -0
  367. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/sql/test_path.py +0 -0
  368. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/sql/test_random.py +0 -0
  369. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/sql/test_selectable.py +0 -0
  370. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/sql/test_string.py +0 -0
  371. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_asyn.py +0 -0
  372. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_cache.py +0 -0
  373. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_catalog.py +0 -0
  374. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_catalog_loader.py +0 -0
  375. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_cli_parsing.py +0 -0
  376. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_client.py +0 -0
  377. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_client_gcs.py +0 -0
  378. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_client_s3.py +0 -0
  379. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_config.py +0 -0
  380. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_data_storage.py +0 -0
  381. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_database_engine.py +0 -0
  382. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_dataset.py +0 -0
  383. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_dispatch.py +0 -0
  384. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_fileslice.py +0 -0
  385. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_func.py +0 -0
  386. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_listing.py +0 -0
  387. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_metastore.py +0 -0
  388. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_module_exports.py +0 -0
  389. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_pytorch.py +0 -0
  390. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_query.py +0 -0
  391. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_query_metrics.py +0 -0
  392. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_query_params.py +0 -0
  393. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_script_meta.py +0 -0
  394. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_semver.py +0 -0
  395. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_serializer.py +0 -0
  396. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_session.py +0 -0
  397. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_utils.py +0 -0
  398. {datachain-0.24.1 → datachain-0.24.3}/tests/unit/test_warehouse.py +0 -0
  399. {datachain-0.24.1 → datachain-0.24.3}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.12.0'
27
+ rev: 'v0.12.1'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -35,7 +35,7 @@ repos:
35
35
  - id: codespell
36
36
  additional_dependencies: ["tomli"]
37
37
  - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
38
- rev: v2.14.0
38
+ rev: v2.15.0
39
39
  hooks:
40
40
  - id: pretty-format-toml
41
41
  args: [--autofix, --no-sort]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.24.1
3
+ Version: 0.24.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -1098,9 +1098,18 @@ class Catalog:
1098
1098
  ) -> DatasetRecord:
1099
1099
  from datachain.lib.listing import is_listing_dataset
1100
1100
 
1101
+ project = project or self.metastore.default_project
1102
+
1101
1103
  if is_listing_dataset(name):
1102
1104
  project = self.metastore.listing_project
1103
- return self.metastore.get_dataset(name, project.id if project else None)
1105
+
1106
+ try:
1107
+ return self.metastore.get_dataset(name, project.id if project else None)
1108
+ except DatasetNotFoundError:
1109
+ raise DatasetNotFoundError(
1110
+ f"Dataset {name} not found in namespace {project.namespace.name}"
1111
+ f" and project {project.name}"
1112
+ ) from None
1104
1113
 
1105
1114
  def get_dataset_with_remote_fallback(
1106
1115
  self,
@@ -1124,7 +1133,7 @@ class Catalog:
1124
1133
  raise DatasetNotFoundError(
1125
1134
  f"Dataset {name}"
1126
1135
  + (f" version {version} " if version else " ")
1127
- + "not found"
1136
+ + f"not found in namespace {namespace_name} and project {project_name}"
1128
1137
  )
1129
1138
 
1130
1139
  if pull_dataset:
@@ -1194,14 +1194,16 @@ class AbstractDBMetastore(AbstractMetastore):
1194
1194
  Gets a single dataset in project by dataset name.
1195
1195
  """
1196
1196
  project_id = project_id or self.default_project.id
1197
+
1197
1198
  d = self._datasets
1198
1199
  query = self._base_dataset_query()
1199
1200
  query = query.where(d.c.name == name, d.c.project_id == project_id) # type: ignore [attr-defined]
1200
1201
  ds = self._parse_dataset(self.db.execute(query, conn=conn))
1201
1202
  if not ds:
1202
1203
  raise DatasetNotFoundError(
1203
- f"Dataset {name} not found in project {project_id}"
1204
+ f"Dataset {name} not found in project with id {project_id}"
1204
1205
  )
1206
+
1205
1207
  return ds
1206
1208
 
1207
1209
  def remove_dataset_version(
@@ -774,7 +774,15 @@ class SQLiteWarehouse(AbstractWarehouse):
774
774
  query: Select,
775
775
  progress_cb: Optional[Callable[[int], None]] = None,
776
776
  ) -> None:
777
- if len(query._group_by_clause) > 0:
777
+ col_id = (
778
+ query.selected_columns.sys__id
779
+ if "sys__id" in query.selected_columns
780
+ else None
781
+ )
782
+
783
+ # If there is no sys__id column, we cannot copy the table in batches,
784
+ # and we need to copy all rows at once. Same if there is a group by clause.
785
+ if col_id is None or len(query._group_by_clause) > 0:
778
786
  select_q = query.with_only_columns(
779
787
  *[c for c in query.selected_columns if c.name != "sys__id"]
780
788
  )
@@ -782,12 +790,7 @@ class SQLiteWarehouse(AbstractWarehouse):
782
790
  self.db.execute(q)
783
791
  return
784
792
 
785
- if "sys__id" in query.selected_columns:
786
- col_id = query.selected_columns.sys__id
787
- else:
788
- col_id = sqlalchemy.column("sys__id")
789
793
  select_ids = query.with_only_columns(col_id)
790
-
791
794
  ids = self.db.execute(select_ids).fetchall()
792
795
 
793
796
  select_q = (
@@ -218,7 +218,7 @@ class AbstractWarehouse(ABC, Serializable):
218
218
  limit = query._limit
219
219
  paginated_query = query.limit(page_size)
220
220
 
221
- offset = 0
221
+ offset = query._offset or 0
222
222
  num_yielded = 0
223
223
 
224
224
  # Ensure we're using a thread-local connection
@@ -234,13 +234,13 @@ class AbstractWarehouse(ABC, Serializable):
234
234
  # Cursor results are not thread-safe, so we convert them to a list
235
235
  results = list(wh.dataset_rows_select(paginated_query.offset(offset)))
236
236
 
237
- processed = False
237
+ processed = 0
238
238
  for row in results:
239
- processed = True
239
+ processed += 1
240
240
  yield row
241
241
  num_yielded += 1
242
242
 
243
- if not processed:
243
+ if processed < page_size:
244
244
  break # no more results
245
245
  offset += page_size
246
246
 
@@ -343,6 +343,8 @@ class AbstractWarehouse(ABC, Serializable):
343
343
  if (id_col := get_query_id_column(query)) is None:
344
344
  raise RuntimeError("sys__id column not found in query")
345
345
 
346
+ query = query._clone().offset(None).limit(None).order_by(None)
347
+
346
348
  if is_batched:
347
349
  for batch in ids:
348
350
  yield list(self.dataset_rows_select(query.where(id_col.in_(batch))))
@@ -65,10 +65,17 @@ class Listing:
65
65
 
66
66
  @cached_property
67
67
  def dataset(self) -> "DatasetRecord":
68
+ from datachain.error import DatasetNotFoundError
69
+
68
70
  assert self.dataset_name
69
- return self.metastore.get_dataset(
70
- self.dataset_name, self.metastore.listing_project.id
71
- )
71
+ project = self.metastore.listing_project
72
+ try:
73
+ return self.metastore.get_dataset(self.dataset_name, project.id)
74
+ except DatasetNotFoundError:
75
+ raise DatasetNotFoundError(
76
+ f"Dataset {self.dataset_name} not found in namespace"
77
+ f" {project.namespace.name} and project {project.name}"
78
+ ) from None
72
79
 
73
80
  @cached_property
74
81
  def dataset_rows(self):
@@ -11,6 +11,7 @@ from collections.abc import Generator, Iterable, Iterator, Sequence
11
11
  from copy import copy
12
12
  from functools import wraps
13
13
  from secrets import token_hex
14
+ from types import GeneratorType
14
15
  from typing import (
15
16
  TYPE_CHECKING,
16
17
  Any,
@@ -557,8 +558,8 @@ class UDFStep(Step, ABC):
557
558
  """
558
559
  assert self.partition_by is not None
559
560
 
560
- if isinstance(self.partition_by, Sequence):
561
- list_partition_by = self.partition_by
561
+ if isinstance(self.partition_by, (list, tuple, GeneratorType)):
562
+ list_partition_by = list(self.partition_by)
562
563
  else:
563
564
  list_partition_by = [self.partition_by]
564
565
 
@@ -575,7 +576,10 @@ class UDFStep(Step, ABC):
575
576
  f.dense_rank().over(order_by=partition_by).label(PARTITION_COLUMN_ID),
576
577
  ]
577
578
  self.catalog.warehouse.db.execute(
578
- tbl.insert().from_select(cols, query.with_only_columns(*cols))
579
+ tbl.insert().from_select(
580
+ cols,
581
+ query.offset(None).limit(None).with_only_columns(*cols),
582
+ )
579
583
  )
580
584
 
581
585
  return tbl
@@ -601,13 +605,10 @@ class UDFStep(Step, ABC):
601
605
  if self.partition_by is not None:
602
606
  partition_tbl = self.create_partitions_table(query)
603
607
  temp_tables.append(partition_tbl.name)
604
-
605
- subq = query.subquery()
606
- query = (
607
- sqlalchemy.select(*subq.c)
608
- .outerjoin(partition_tbl, partition_tbl.c.sys__id == subq.c.sys__id)
609
- .add_columns(*partition_columns())
610
- )
608
+ query = query.outerjoin(
609
+ partition_tbl,
610
+ partition_tbl.c.sys__id == query.selected_columns.sys__id,
611
+ ).add_columns(*partition_columns())
611
612
 
612
613
  query, tables = self.process_input_query(query)
613
614
  temp_tables.extend(t.name for t in tables)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.24.1
3
+ Version: 0.24.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -236,6 +236,22 @@ def test_read_storage_dependencies(cloud_test_catalog, cloud_type):
236
236
  assert dependencies[0].name == dep_name
237
237
 
238
238
 
239
+ def test_persist_after_mutate(test_session):
240
+ chain = (
241
+ dc.read_values(fib=[1, 1, 2, 3, 5, 8, 13, 21], session=test_session)
242
+ .map(mod3=lambda fib: fib % 3, output=int)
243
+ .group_by(
244
+ cnt=dc.func.count(),
245
+ partition_by="mod3",
246
+ )
247
+ .mutate(x=1)
248
+ .persist()
249
+ )
250
+
251
+ assert chain.count() == 3
252
+ assert set(chain.to_values("mod3")) == {0, 1, 2}
253
+
254
+
239
255
  def test_persist_not_affects_dependencies(tmp_dir, test_session):
240
256
  for i in range(4):
241
257
  (tmp_dir / f"file{i}.txt").write_text(f"file{i}")
@@ -2324,3 +2340,72 @@ def test_agg(catalog_tmpfile, parallel):
2324
2340
  ],
2325
2341
  "parent",
2326
2342
  )
2343
+
2344
+
2345
+ @pytest.mark.parametrize("parallel", [1, 2])
2346
+ @pytest.mark.parametrize(
2347
+ "offset,limit,files",
2348
+ [
2349
+ (None, 1000, [f"file{i:02d}" for i in range(100)]),
2350
+ (None, 3, ["file00", "file01", "file02"]),
2351
+ (0, 3, ["file00", "file01", "file02"]),
2352
+ (97, 1000, ["file97", "file98", "file99"]),
2353
+ (1, 2, ["file01", "file02"]),
2354
+ (50, 3, ["file50", "file51", "file52"]),
2355
+ (None, 0, []),
2356
+ (50, 0, []),
2357
+ ],
2358
+ )
2359
+ def test_agg_offset_limit(catalog_tmpfile, parallel, offset, limit, files):
2360
+ def process(filename: list[str]) -> Iterator[tuple[str, int]]:
2361
+ yield filename[0], len(filename)
2362
+
2363
+ ds = dc.read_values(
2364
+ filename=[f"file{i:02d}" for i in range(100)],
2365
+ value=list(range(100)),
2366
+ session=catalog_tmpfile.session,
2367
+ )
2368
+ if offset is not None:
2369
+ ds = ds.offset(offset)
2370
+ if limit is not None:
2371
+ ds = ds.limit(limit)
2372
+ ds = (
2373
+ ds.settings(parallel=parallel)
2374
+ .agg(
2375
+ process,
2376
+ output={"filename": str, "count": int},
2377
+ partition_by="filename",
2378
+ )
2379
+ .save("my-ds")
2380
+ )
2381
+
2382
+ records = list(ds.to_records())
2383
+ assert len(records) == len(files)
2384
+ assert all(row["count"] == 1 for row in records)
2385
+ assert sorted(row["filename"] for row in records) == sorted(files)
2386
+
2387
+
2388
+ @pytest.mark.parametrize("parallel", [1, 2])
2389
+ @pytest.mark.parametrize("sample", [0, 1, 3, 10, 50, 100])
2390
+ def test_agg_sample(catalog_tmpfile, parallel, sample):
2391
+ def process(filename: list[str]) -> Iterator[tuple[str, int]]:
2392
+ yield filename[0], len(filename)
2393
+
2394
+ ds = (
2395
+ dc.read_values(
2396
+ filename=[f"file{i:02d}" for i in range(100)],
2397
+ session=catalog_tmpfile.session,
2398
+ )
2399
+ .sample(sample)
2400
+ .settings(parallel=parallel)
2401
+ .agg(
2402
+ process,
2403
+ output={"filename": str, "count": int},
2404
+ partition_by="filename",
2405
+ )
2406
+ .save("my-ds")
2407
+ )
2408
+
2409
+ records = list(ds.to_records())
2410
+ assert len(records) == sample
2411
+ assert all(row["count"] == 1 for row in records)
@@ -248,6 +248,9 @@ def test_delta_update_check_num_calls(test_session, tmp_dir, tmp_path, capsys):
248
248
 
249
249
 
250
250
  def test_delta_update_no_diff(test_session, tmp_dir, tmp_path):
251
+ catalog = test_session.catalog
252
+ default_namespace_name = catalog.metastore.default_namespace_name
253
+ default_project_name = catalog.metastore.default_project_name
251
254
  ds_name = "delta_ds"
252
255
  path = tmp_dir.as_uri()
253
256
  tmp_dir = tmp_dir / "images"
@@ -296,7 +299,10 @@ def test_delta_update_no_diff(test_session, tmp_dir, tmp_path):
296
299
  with pytest.raises(DatasetNotFoundError) as exc_info:
297
300
  dc.read_dataset(ds_name, version="1.0.1")
298
301
 
299
- assert str(exc_info.value) == f"Dataset {ds_name} version 1.0.1 not found"
302
+ assert str(exc_info.value) == (
303
+ f"Dataset {ds_name} version 1.0.1 not found in namespace "
304
+ f"{default_namespace_name} and project {default_project_name}"
305
+ )
300
306
 
301
307
 
302
308
  @pytest.fixture
@@ -16,6 +16,7 @@ from pydantic import BaseModel
16
16
 
17
17
  import datachain as dc
18
18
  from datachain import Column
19
+ from datachain.data_storage import AbstractMetastore
19
20
  from datachain.error import (
20
21
  DatasetInvalidVersionError,
21
22
  DatasetNotFoundError,
@@ -3428,6 +3429,29 @@ def test_save_to_non_default_namespace_and_project(
3428
3429
  dc.read_dataset(name="fibonacci")
3429
3430
 
3430
3431
 
3432
+ def test_dataset_not_found_in_default_project(test_session):
3433
+ metastore = test_session.catalog.metastore
3434
+ with pytest.raises(DatasetNotFoundError) as excinfo:
3435
+ with patch.object(AbstractMetastore, "is_local_dataset", return_value=True):
3436
+ dc.read_dataset("fibonacci")
3437
+ assert str(excinfo.value) == (
3438
+ f"Dataset fibonacci not found in namespace {metastore.default_namespace_name}"
3439
+ f" and project {metastore.default_project_name}"
3440
+ )
3441
+
3442
+
3443
+ @pytest.mark.parametrize("project_created", (True, False))
3444
+ def test_dataset_not_found_in_non_default_project(test_session, project_created):
3445
+ if project_created:
3446
+ dc.create_project("dev", "numbers")
3447
+ with pytest.raises(DatasetNotFoundError) as excinfo:
3448
+ with patch.object(AbstractMetastore, "is_local_dataset", return_value=True):
3449
+ dc.read_dataset("dev.numbers.fibonacci")
3450
+ assert str(excinfo.value) == (
3451
+ "Dataset fibonacci not found in namespace dev and project numbers"
3452
+ )
3453
+
3454
+
3431
3455
  @pytest.mark.parametrize("use_settings", (True, False))
3432
3456
  @pytest.mark.parametrize("project_created_upfront", (True, False))
3433
3457
  def test_save_specify_only_non_default_project(
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes