datachain 0.24.2__tar.gz → 0.24.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (399)
  1. {datachain-0.24.2 → datachain-0.24.4}/.github/workflows/tests-studio.yml +1 -1
  2. {datachain-0.24.2 → datachain-0.24.4}/PKG-INFO +1 -1
  3. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/catalog/catalog.py +19 -2
  4. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/metastore.py +3 -1
  5. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/sqlite.py +9 -6
  6. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/dataset.py +1 -1
  7. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/datachain.py +26 -1
  8. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/datasets.py +1 -0
  9. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/listing.py +10 -3
  10. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/namespace.py +1 -1
  11. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/project.py +1 -1
  12. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/dataset.py +5 -1
  13. {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/PKG-INFO +1 -1
  14. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_datachain.py +16 -0
  15. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_delta.py +7 -1
  16. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_read_dataset_remote.py +49 -4
  17. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_datachain.py +81 -0
  18. {datachain-0.24.2 → datachain-0.24.4}/.cruft.json +0 -0
  19. {datachain-0.24.2 → datachain-0.24.4}/.gitattributes +0 -0
  20. {datachain-0.24.2 → datachain-0.24.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  21. {datachain-0.24.2 → datachain-0.24.4}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  22. {datachain-0.24.2 → datachain-0.24.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  23. {datachain-0.24.2 → datachain-0.24.4}/.github/codecov.yaml +0 -0
  24. {datachain-0.24.2 → datachain-0.24.4}/.github/dependabot.yml +0 -0
  25. {datachain-0.24.2 → datachain-0.24.4}/.github/workflows/benchmarks.yml +0 -0
  26. {datachain-0.24.2 → datachain-0.24.4}/.github/workflows/release.yml +0 -0
  27. {datachain-0.24.2 → datachain-0.24.4}/.github/workflows/tests.yml +0 -0
  28. {datachain-0.24.2 → datachain-0.24.4}/.github/workflows/update-template.yaml +0 -0
  29. {datachain-0.24.2 → datachain-0.24.4}/.gitignore +0 -0
  30. {datachain-0.24.2 → datachain-0.24.4}/.pre-commit-config.yaml +0 -0
  31. {datachain-0.24.2 → datachain-0.24.4}/CODE_OF_CONDUCT.rst +0 -0
  32. {datachain-0.24.2 → datachain-0.24.4}/LICENSE +0 -0
  33. {datachain-0.24.2 → datachain-0.24.4}/README.rst +0 -0
  34. {datachain-0.24.2 → datachain-0.24.4}/docs/assets/captioned_cartoons.png +0 -0
  35. {datachain-0.24.2 → datachain-0.24.4}/docs/assets/datachain-white.svg +0 -0
  36. {datachain-0.24.2 → datachain-0.24.4}/docs/assets/datachain.svg +0 -0
  37. {datachain-0.24.2 → datachain-0.24.4}/docs/commands/auth/login.md +0 -0
  38. {datachain-0.24.2 → datachain-0.24.4}/docs/commands/auth/logout.md +0 -0
  39. {datachain-0.24.2 → datachain-0.24.4}/docs/commands/auth/team.md +0 -0
  40. {datachain-0.24.2 → datachain-0.24.4}/docs/commands/auth/token.md +0 -0
  41. {datachain-0.24.2 → datachain-0.24.4}/docs/commands/index.md +0 -0
  42. {datachain-0.24.2 → datachain-0.24.4}/docs/commands/job/cancel.md +0 -0
  43. {datachain-0.24.2 → datachain-0.24.4}/docs/commands/job/clusters.md +0 -0
  44. {datachain-0.24.2 → datachain-0.24.4}/docs/commands/job/logs.md +0 -0
  45. {datachain-0.24.2 → datachain-0.24.4}/docs/commands/job/ls.md +0 -0
  46. {datachain-0.24.2 → datachain-0.24.4}/docs/commands/job/run.md +0 -0
  47. {datachain-0.24.2 → datachain-0.24.4}/docs/contributing.md +0 -0
  48. {datachain-0.24.2 → datachain-0.24.4}/docs/css/github-permalink-style.css +0 -0
  49. {datachain-0.24.2 → datachain-0.24.4}/docs/examples.md +0 -0
  50. {datachain-0.24.2 → datachain-0.24.4}/docs/guide/db_migrations.md +0 -0
  51. {datachain-0.24.2 → datachain-0.24.4}/docs/guide/delta.md +0 -0
  52. {datachain-0.24.2 → datachain-0.24.4}/docs/guide/env.md +0 -0
  53. {datachain-0.24.2 → datachain-0.24.4}/docs/guide/index.md +0 -0
  54. {datachain-0.24.2 → datachain-0.24.4}/docs/guide/namespaces.md +0 -0
  55. {datachain-0.24.2 → datachain-0.24.4}/docs/guide/processing.md +0 -0
  56. {datachain-0.24.2 → datachain-0.24.4}/docs/guide/remotes.md +0 -0
  57. {datachain-0.24.2 → datachain-0.24.4}/docs/guide/retry.md +0 -0
  58. {datachain-0.24.2 → datachain-0.24.4}/docs/index.md +0 -0
  59. {datachain-0.24.2 → datachain-0.24.4}/docs/overrides/main.html +0 -0
  60. {datachain-0.24.2 → datachain-0.24.4}/docs/quick-start.md +0 -0
  61. {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/arrowrow.md +0 -0
  62. {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/bbox.md +0 -0
  63. {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/file.md +0 -0
  64. {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/imagefile.md +0 -0
  65. {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/index.md +0 -0
  66. {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/pose.md +0 -0
  67. {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/segment.md +0 -0
  68. {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/tarvfile.md +0 -0
  69. {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/textfile.md +0 -0
  70. {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/videofile.md +0 -0
  71. {datachain-0.24.2 → datachain-0.24.4}/docs/references/datachain.md +0 -0
  72. {datachain-0.24.2 → datachain-0.24.4}/docs/references/func.md +0 -0
  73. {datachain-0.24.2 → datachain-0.24.4}/docs/references/index.md +0 -0
  74. {datachain-0.24.2 → datachain-0.24.4}/docs/references/toolkit.md +0 -0
  75. {datachain-0.24.2 → datachain-0.24.4}/docs/references/torch.md +0 -0
  76. {datachain-0.24.2 → datachain-0.24.4}/docs/references/udf.md +0 -0
  77. {datachain-0.24.2 → datachain-0.24.4}/docs/tutorials.md +0 -0
  78. {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  79. {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  80. {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/openimage-detect.py +0 -0
  81. {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/ultralytics-bbox.py +0 -0
  82. {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/ultralytics-pose.py +0 -0
  83. {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/ultralytics-segment.py +0 -0
  84. {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/common_sql_functions.py +0 -0
  85. {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/json-csv-reader.py +0 -0
  86. {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/torch-loader.py +0 -0
  87. {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/udfs/parallel.py +0 -0
  88. {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/udfs/simple.py +0 -0
  89. {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/udfs/stateful.py +0 -0
  90. {datachain-0.24.2 → datachain-0.24.4}/examples/incremental_processing/delta.py +0 -0
  91. {datachain-0.24.2 → datachain-0.24.4}/examples/incremental_processing/retry.py +0 -0
  92. {datachain-0.24.2 → datachain-0.24.4}/examples/incremental_processing/utils.py +0 -0
  93. {datachain-0.24.2 → datachain-0.24.4}/examples/llm_and_nlp/claude-query.py +0 -0
  94. {datachain-0.24.2 → datachain-0.24.4}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  95. {datachain-0.24.2 → datachain-0.24.4}/examples/multimodal/clip_inference.py +0 -0
  96. {datachain-0.24.2 → datachain-0.24.4}/examples/multimodal/hf_pipeline.py +0 -0
  97. {datachain-0.24.2 → datachain-0.24.4}/examples/multimodal/openai_image_desc_lib.py +0 -0
  98. {datachain-0.24.2 → datachain-0.24.4}/examples/multimodal/wds.py +0 -0
  99. {datachain-0.24.2 → datachain-0.24.4}/examples/multimodal/wds_filtered.py +0 -0
  100. {datachain-0.24.2 → datachain-0.24.4}/mkdocs.yml +0 -0
  101. {datachain-0.24.2 → datachain-0.24.4}/noxfile.py +0 -0
  102. {datachain-0.24.2 → datachain-0.24.4}/pyproject.toml +0 -0
  103. {datachain-0.24.2 → datachain-0.24.4}/setup.cfg +0 -0
  104. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/__init__.py +0 -0
  105. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/__main__.py +0 -0
  106. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/asyn.py +0 -0
  107. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cache.py +0 -0
  108. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/catalog/__init__.py +0 -0
  109. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/catalog/datasource.py +0 -0
  110. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/catalog/loader.py +0 -0
  111. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/__init__.py +0 -0
  112. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/__init__.py +0 -0
  113. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/datasets.py +0 -0
  114. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/du.py +0 -0
  115. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/index.py +0 -0
  116. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/ls.py +0 -0
  117. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/misc.py +0 -0
  118. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/query.py +0 -0
  119. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/show.py +0 -0
  120. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/parser/__init__.py +0 -0
  121. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/parser/job.py +0 -0
  122. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/parser/studio.py +0 -0
  123. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/parser/utils.py +0 -0
  124. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/utils.py +0 -0
  125. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/__init__.py +0 -0
  126. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/azure.py +0 -0
  127. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/fileslice.py +0 -0
  128. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/fsspec.py +0 -0
  129. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/gcs.py +0 -0
  130. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/hf.py +0 -0
  131. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/local.py +0 -0
  132. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/s3.py +0 -0
  133. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/config.py +0 -0
  134. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/__init__.py +0 -0
  135. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/db_engine.py +0 -0
  136. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/job.py +0 -0
  137. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/schema.py +0 -0
  138. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/serializer.py +0 -0
  139. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/warehouse.py +0 -0
  140. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/delta.py +0 -0
  141. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/diff/__init__.py +0 -0
  142. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/error.py +0 -0
  143. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/fs/__init__.py +0 -0
  144. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/fs/reference.py +0 -0
  145. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/fs/utils.py +0 -0
  146. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/__init__.py +0 -0
  147. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/aggregate.py +0 -0
  148. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/array.py +0 -0
  149. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/base.py +0 -0
  150. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/conditional.py +0 -0
  151. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/func.py +0 -0
  152. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/numeric.py +0 -0
  153. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/path.py +0 -0
  154. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/random.py +0 -0
  155. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/string.py +0 -0
  156. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/window.py +0 -0
  157. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/job.py +0 -0
  158. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/__init__.py +0 -0
  159. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/arrow.py +0 -0
  160. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/clip.py +0 -0
  161. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/__init__.py +0 -0
  162. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/flatten.py +0 -0
  163. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/python_to_sql.py +0 -0
  164. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/sql_to_python.py +0 -0
  165. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/unflatten.py +0 -0
  166. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  167. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/data_model.py +0 -0
  168. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dataset_info.py +0 -0
  169. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/__init__.py +0 -0
  170. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/csv.py +0 -0
  171. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/database.py +0 -0
  172. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/hf.py +0 -0
  173. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/json.py +0 -0
  174. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/listings.py +0 -0
  175. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/pandas.py +0 -0
  176. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/parquet.py +0 -0
  177. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/records.py +0 -0
  178. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/storage.py +0 -0
  179. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/utils.py +0 -0
  180. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/values.py +0 -0
  181. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/file.py +0 -0
  182. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/hf.py +0 -0
  183. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/image.py +0 -0
  184. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/listing.py +0 -0
  185. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/listing_info.py +0 -0
  186. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/meta_formats.py +0 -0
  187. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/model_store.py +0 -0
  188. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/namespaces.py +0 -0
  189. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/projects.py +0 -0
  190. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/pytorch.py +0 -0
  191. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/settings.py +0 -0
  192. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/signal_schema.py +0 -0
  193. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/tar.py +0 -0
  194. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/text.py +0 -0
  195. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/udf.py +0 -0
  196. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/udf_signature.py +0 -0
  197. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/utils.py +0 -0
  198. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/video.py +0 -0
  199. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/webdataset.py +0 -0
  200. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/webdataset_laion.py +0 -0
  201. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/__init__.py +0 -0
  202. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/bbox.py +0 -0
  203. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/pose.py +0 -0
  204. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/segment.py +0 -0
  205. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/ultralytics/__init__.py +0 -0
  206. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/ultralytics/bbox.py +0 -0
  207. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/ultralytics/pose.py +0 -0
  208. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/ultralytics/segment.py +0 -0
  209. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/utils.py +0 -0
  210. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/node.py +0 -0
  211. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/nodes_fetcher.py +0 -0
  212. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/nodes_thread_pool.py +0 -0
  213. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/progress.py +0 -0
  214. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/py.typed +0 -0
  215. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/__init__.py +0 -0
  216. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/batch.py +0 -0
  217. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/dispatch.py +0 -0
  218. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/metrics.py +0 -0
  219. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/params.py +0 -0
  220. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/queue.py +0 -0
  221. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/schema.py +0 -0
  222. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/session.py +0 -0
  223. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/udf.py +0 -0
  224. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/utils.py +0 -0
  225. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/remote/__init__.py +0 -0
  226. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/remote/studio.py +0 -0
  227. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/script_meta.py +0 -0
  228. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/semver.py +0 -0
  229. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/__init__.py +0 -0
  230. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/default/__init__.py +0 -0
  231. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/default/base.py +0 -0
  232. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/__init__.py +0 -0
  233. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/aggregate.py +0 -0
  234. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/array.py +0 -0
  235. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/conditional.py +0 -0
  236. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/numeric.py +0 -0
  237. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/path.py +0 -0
  238. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/random.py +0 -0
  239. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/string.py +0 -0
  240. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/selectable.py +0 -0
  241. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/sqlite/__init__.py +0 -0
  242. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/sqlite/base.py +0 -0
  243. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/sqlite/types.py +0 -0
  244. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/sqlite/vector.py +0 -0
  245. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/types.py +0 -0
  246. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/utils.py +0 -0
  247. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/studio.py +0 -0
  248. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/telemetry.py +0 -0
  249. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/toolkit/__init__.py +0 -0
  250. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/toolkit/split.py +0 -0
  251. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/torch/__init__.py +0 -0
  252. {datachain-0.24.2 → datachain-0.24.4}/src/datachain/utils.py +0 -0
  253. {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/SOURCES.txt +0 -0
  254. {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/dependency_links.txt +0 -0
  255. {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/entry_points.txt +0 -0
  256. {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/requires.txt +0 -0
  257. {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/top_level.txt +0 -0
  258. {datachain-0.24.2 → datachain-0.24.4}/tests/__init__.py +0 -0
  259. {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/__init__.py +0 -0
  260. {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/conftest.py +0 -0
  261. {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  262. {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/datasets/.dvc/config +0 -0
  263. {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/datasets/.gitignore +0 -0
  264. {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  265. {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/test_datachain.py +0 -0
  266. {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/test_ls.py +0 -0
  267. {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/test_version.py +0 -0
  268. {datachain-0.24.2 → datachain-0.24.4}/tests/conftest.py +0 -0
  269. {datachain-0.24.2 → datachain-0.24.4}/tests/data.py +0 -0
  270. {datachain-0.24.2 → datachain-0.24.4}/tests/examples/__init__.py +0 -0
  271. {datachain-0.24.2 → datachain-0.24.4}/tests/examples/test_examples.py +0 -0
  272. {datachain-0.24.2 → datachain-0.24.4}/tests/examples/test_wds_e2e.py +0 -0
  273. {datachain-0.24.2 → datachain-0.24.4}/tests/examples/wds_data.py +0 -0
  274. {datachain-0.24.2 → datachain-0.24.4}/tests/func/__init__.py +0 -0
  275. {datachain-0.24.2 → datachain-0.24.4}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  276. {datachain-0.24.2 → datachain-0.24.4}/tests/func/data/lena.jpg +0 -0
  277. {datachain-0.24.2 → datachain-0.24.4}/tests/func/fake-service-account-credentials.json +0 -0
  278. {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/__init__.py +0 -0
  279. {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_aggregate.py +0 -0
  280. {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_array.py +0 -0
  281. {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_conditional.py +0 -0
  282. {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_numeric.py +0 -0
  283. {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_path.py +0 -0
  284. {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_random.py +0 -0
  285. {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_string.py +0 -0
  286. {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/__init__.py +0 -0
  287. {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/data/running-mask0.png +0 -0
  288. {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/data/running-mask1.png +0 -0
  289. {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/data/running.jpg +0 -0
  290. {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/data/ships.jpg +0 -0
  291. {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/test_yolo.py +0 -0
  292. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_batching.py +0 -0
  293. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_catalog.py +0 -0
  294. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_client.py +0 -0
  295. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_cloud_transfer.py +0 -0
  296. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_data_storage.py +0 -0
  297. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_datachain_merge.py +0 -0
  298. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_dataset_query.py +0 -0
  299. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_datasets.py +0 -0
  300. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_feature_pickling.py +0 -0
  301. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_file.py +0 -0
  302. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_hf.py +0 -0
  303. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_hidden_field.py +0 -0
  304. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_image.py +0 -0
  305. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_listing.py +0 -0
  306. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_ls.py +0 -0
  307. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_meta_formats.py +0 -0
  308. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_metastore.py +0 -0
  309. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_metrics.py +0 -0
  310. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_pull.py +0 -0
  311. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_pytorch.py +0 -0
  312. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_query.py +0 -0
  313. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_read_database.py +0 -0
  314. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  315. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_retry.py +0 -0
  316. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_session.py +0 -0
  317. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_toolkit.py +0 -0
  318. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_video.py +0 -0
  319. {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_warehouse.py +0 -0
  320. {datachain-0.24.2 → datachain-0.24.4}/tests/scripts/feature_class.py +0 -0
  321. {datachain-0.24.2 → datachain-0.24.4}/tests/scripts/feature_class_exception.py +0 -0
  322. {datachain-0.24.2 → datachain-0.24.4}/tests/scripts/feature_class_parallel.py +0 -0
  323. {datachain-0.24.2 → datachain-0.24.4}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  324. {datachain-0.24.2 → datachain-0.24.4}/tests/scripts/name_len_slow.py +0 -0
  325. {datachain-0.24.2 → datachain-0.24.4}/tests/test_atomicity.py +0 -0
  326. {datachain-0.24.2 → datachain-0.24.4}/tests/test_cli_e2e.py +0 -0
  327. {datachain-0.24.2 → datachain-0.24.4}/tests/test_cli_studio.py +0 -0
  328. {datachain-0.24.2 → datachain-0.24.4}/tests/test_import_time.py +0 -0
  329. {datachain-0.24.2 → datachain-0.24.4}/tests/test_query_e2e.py +0 -0
  330. {datachain-0.24.2 → datachain-0.24.4}/tests/test_telemetry.py +0 -0
  331. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/__init__.py +0 -0
  332. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/__init__.py +0 -0
  333. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/conftest.py +0 -0
  334. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_arrow.py +0 -0
  335. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_clip.py +0 -0
  336. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  337. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_datachain_merge.py +0 -0
  338. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_diff.py +0 -0
  339. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_feature.py +0 -0
  340. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_feature_utils.py +0 -0
  341. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_file.py +0 -0
  342. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_hf.py +0 -0
  343. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_image.py +0 -0
  344. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_listing_info.py +0 -0
  345. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_namespace.py +0 -0
  346. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_project.py +0 -0
  347. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_python_to_sql.py +0 -0
  348. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_schema.py +0 -0
  349. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_signal_schema.py +0 -0
  350. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_sql_to_python.py +0 -0
  351. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_text.py +0 -0
  352. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_udf.py +0 -0
  353. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_udf_signature.py +0 -0
  354. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_utils.py +0 -0
  355. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_webdataset.py +0 -0
  356. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/model/__init__.py +0 -0
  357. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/model/test_bbox.py +0 -0
  358. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/model/test_pose.py +0 -0
  359. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/model/test_segment.py +0 -0
  360. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/model/test_utils.py +0 -0
  361. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/__init__.py +0 -0
  362. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/sqlite/__init__.py +0 -0
  363. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/sqlite/test_types.py +0 -0
  364. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/sqlite/test_utils.py +0 -0
  365. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_array.py +0 -0
  366. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_conditional.py +0 -0
  367. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_path.py +0 -0
  368. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_random.py +0 -0
  369. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_selectable.py +0 -0
  370. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_string.py +0 -0
  371. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_asyn.py +0 -0
  372. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_cache.py +0 -0
  373. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_catalog.py +0 -0
  374. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_catalog_loader.py +0 -0
  375. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_cli_parsing.py +0 -0
  376. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_client.py +0 -0
  377. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_client_gcs.py +0 -0
  378. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_client_s3.py +0 -0
  379. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_config.py +0 -0
  380. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_data_storage.py +0 -0
  381. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_database_engine.py +0 -0
  382. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_dataset.py +0 -0
  383. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_dispatch.py +0 -0
  384. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_fileslice.py +0 -0
  385. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_func.py +0 -0
  386. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_listing.py +0 -0
  387. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_metastore.py +0 -0
  388. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_module_exports.py +0 -0
  389. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_pytorch.py +0 -0
  390. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_query.py +0 -0
  391. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_query_metrics.py +0 -0
  392. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_query_params.py +0 -0
  393. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_script_meta.py +0 -0
  394. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_semver.py +0 -0
  395. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_serializer.py +0 -0
  396. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_session.py +0 -0
  397. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_utils.py +0 -0
  398. {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_warehouse.py +0 -0
  399. {datachain-0.24.2 → datachain-0.24.4}/tests/utils.py +0 -0
@@ -98,7 +98,7 @@ jobs:
98
98
  - name: Run tests
99
99
  # Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
100
100
  run: >
101
- DATACHAIN_METASTORE_ARG_PROJECT=john
101
+ DATACHAIN_METASTORE_ARG_USERNAME=john
102
102
  PYTHONPATH="$(pwd)/..:${PYTHONPATH}"
103
103
  pytest
104
104
  --config-file=pyproject.toml -rs
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.24.2
3
+ Version: 0.24.4
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -1098,9 +1098,18 @@ class Catalog:
1098
1098
  ) -> DatasetRecord:
1099
1099
  from datachain.lib.listing import is_listing_dataset
1100
1100
 
1101
+ project = project or self.metastore.default_project
1102
+
1101
1103
  if is_listing_dataset(name):
1102
1104
  project = self.metastore.listing_project
1103
- return self.metastore.get_dataset(name, project.id if project else None)
1105
+
1106
+ try:
1107
+ return self.metastore.get_dataset(name, project.id if project else None)
1108
+ except DatasetNotFoundError:
1109
+ raise DatasetNotFoundError(
1110
+ f"Dataset {name} not found in namespace {project.namespace.name}"
1111
+ f" and project {project.name}"
1112
+ ) from None
1104
1113
 
1105
1114
  def get_dataset_with_remote_fallback(
1106
1115
  self,
@@ -1111,6 +1120,14 @@ class Catalog:
1111
1120
  pull_dataset: bool = False,
1112
1121
  update: bool = False,
1113
1122
  ) -> DatasetRecord:
1123
+ # Intentionally ignore update flag if version is provided. Here only exact
1124
+ # version can be provided and update then doesn't make sense.
1125
+ # It corresponds to a query like this for example:
1126
+ #
1127
+ # dc.read_dataset("some.remote.dataset", version="1.0.0", update=True)
1128
+ if version:
1129
+ update = False
1130
+
1114
1131
  if self.metastore.is_local_dataset(namespace_name) or not update:
1115
1132
  try:
1116
1133
  project = self.metastore.get_project(project_name, namespace_name)
@@ -1124,7 +1141,7 @@ class Catalog:
1124
1141
  raise DatasetNotFoundError(
1125
1142
  f"Dataset {name}"
1126
1143
  + (f" version {version} " if version else " ")
1127
- + "not found"
1144
+ + f"not found in namespace {namespace_name} and project {project_name}"
1128
1145
  )
1129
1146
 
1130
1147
  if pull_dataset:
@@ -1194,14 +1194,16 @@ class AbstractDBMetastore(AbstractMetastore):
1194
1194
  Gets a single dataset in project by dataset name.
1195
1195
  """
1196
1196
  project_id = project_id or self.default_project.id
1197
+
1197
1198
  d = self._datasets
1198
1199
  query = self._base_dataset_query()
1199
1200
  query = query.where(d.c.name == name, d.c.project_id == project_id) # type: ignore [attr-defined]
1200
1201
  ds = self._parse_dataset(self.db.execute(query, conn=conn))
1201
1202
  if not ds:
1202
1203
  raise DatasetNotFoundError(
1203
- f"Dataset {name} not found in project {project_id}"
1204
+ f"Dataset {name} not found in project with id {project_id}"
1204
1205
  )
1206
+
1205
1207
  return ds
1206
1208
 
1207
1209
  def remove_dataset_version(
@@ -774,7 +774,15 @@ class SQLiteWarehouse(AbstractWarehouse):
774
774
  query: Select,
775
775
  progress_cb: Optional[Callable[[int], None]] = None,
776
776
  ) -> None:
777
- if len(query._group_by_clause) > 0:
777
+ col_id = (
778
+ query.selected_columns.sys__id
779
+ if "sys__id" in query.selected_columns
780
+ else None
781
+ )
782
+
783
+ # If there is no sys__id column, we cannot copy the table in batches,
784
+ # and we need to copy all rows at once. Same if there is a group by clause.
785
+ if col_id is None or len(query._group_by_clause) > 0:
778
786
  select_q = query.with_only_columns(
779
787
  *[c for c in query.selected_columns if c.name != "sys__id"]
780
788
  )
@@ -782,12 +790,7 @@ class SQLiteWarehouse(AbstractWarehouse):
782
790
  self.db.execute(q)
783
791
  return
784
792
 
785
- if "sys__id" in query.selected_columns:
786
- col_id = query.selected_columns.sys__id
787
- else:
788
- col_id = sqlalchemy.column("sys__id")
789
793
  select_ids = query.with_only_columns(col_id)
790
-
791
794
  ids = self.db.execute(select_ids).fetchall()
792
795
 
793
796
  select_q = (
@@ -32,7 +32,7 @@ QUERY_DATASET_PREFIX = "ds_query_"
32
32
  LISTING_PREFIX = "lst__"
33
33
 
34
34
  DEFAULT_DATASET_VERSION = "1.0.0"
35
- DATASET_NAME_RESERVED_CHARS = ["."]
35
+ DATASET_NAME_RESERVED_CHARS = [".", "@"]
36
36
  DATASET_NAME_REPLACEMENT_CHAR = "_"
37
37
 
38
38
 
@@ -21,6 +21,7 @@ from typing import (
21
21
  import orjson
22
22
  import sqlalchemy
23
23
  from pydantic import BaseModel
24
+ from sqlalchemy.sql.elements import ColumnElement
24
25
  from tqdm import tqdm
25
26
 
26
27
  from datachain import semver
@@ -806,11 +807,35 @@ class DataChain:
806
807
  chain.save("new_dataset")
807
808
  ```
808
809
  """
810
+ # Convert string partition_by parameters to Column objects
811
+ processed_partition_by = partition_by
812
+ if partition_by is not None:
813
+ if isinstance(partition_by, (str, Function, ColumnElement)):
814
+ list_partition_by = [partition_by]
815
+ else:
816
+ list_partition_by = list(partition_by)
817
+
818
+ processed_partition_columns: list[ColumnElement] = []
819
+ for col in list_partition_by:
820
+ if isinstance(col, str):
821
+ col_db_name = ColumnMeta.to_db_name(col)
822
+ col_type = self.signals_schema.get_column_type(col_db_name)
823
+ column = Column(col_db_name, python_to_sql(col_type))
824
+ processed_partition_columns.append(column)
825
+ elif isinstance(col, Function):
826
+ column = col.get_column(self.signals_schema)
827
+ processed_partition_columns.append(column)
828
+ else:
829
+ # Assume it's already a ColumnElement
830
+ processed_partition_columns.append(col)
831
+
832
+ processed_partition_by = processed_partition_columns
833
+
809
834
  udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
810
835
  return self._evolve(
811
836
  query=self._query.generate(
812
837
  udf_obj.to_udf_wrapper(),
813
- partition_by=partition_by,
838
+ partition_by=processed_partition_by,
814
839
  **self._settings.to_dict(),
815
840
  ),
816
841
  signal_schema=udf_obj.output,
@@ -189,6 +189,7 @@ def read_dataset(
189
189
  namespace_name=namespace_name,
190
190
  version=version, # type: ignore[arg-type]
191
191
  session=session,
192
+ update=update,
192
193
  )
193
194
 
194
195
  signals_schema = SignalSchema({"sys": Sys})
@@ -65,10 +65,17 @@ class Listing:
65
65
 
66
66
  @cached_property
67
67
  def dataset(self) -> "DatasetRecord":
68
+ from datachain.error import DatasetNotFoundError
69
+
68
70
  assert self.dataset_name
69
- return self.metastore.get_dataset(
70
- self.dataset_name, self.metastore.listing_project.id
71
- )
71
+ project = self.metastore.listing_project
72
+ try:
73
+ return self.metastore.get_dataset(self.dataset_name, project.id)
74
+ except DatasetNotFoundError:
75
+ raise DatasetNotFoundError(
76
+ f"Dataset {self.dataset_name} not found in namespace"
77
+ f" {project.namespace.name} and project {project.name}"
78
+ ) from None
72
79
 
73
80
  @cached_property
74
81
  def dataset_rows(self):
@@ -6,7 +6,7 @@ from typing import Any, Optional, TypeVar
6
6
  from datachain.error import InvalidNamespaceNameError
7
7
 
8
8
  N = TypeVar("N", bound="Namespace")
9
- NAMESPACE_NAME_RESERVED_CHARS = ["."]
9
+ NAMESPACE_NAME_RESERVED_CHARS = [".", "@"]
10
10
 
11
11
 
12
12
  @dataclass(frozen=True)
@@ -7,7 +7,7 @@ from datachain.error import InvalidProjectNameError
7
7
  from datachain.namespace import Namespace
8
8
 
9
9
  P = TypeVar("P", bound="Project")
10
- PROJECT_NAME_RESERVED_CHARS = ["."]
10
+ PROJECT_NAME_RESERVED_CHARS = [".", "@"]
11
11
 
12
12
 
13
13
  @dataclass(frozen=True)
@@ -82,7 +82,10 @@ if TYPE_CHECKING:
82
82
  INSERT_BATCH_SIZE = 10000
83
83
 
84
84
  PartitionByType = Union[
85
- Function, ColumnElement, Sequence[Union[Function, ColumnElement]]
85
+ str,
86
+ Function,
87
+ ColumnElement,
88
+ Sequence[Union[str, Function, ColumnElement]],
86
89
  ]
87
90
  JoinPredicateType = Union[str, ColumnClause, ColumnElement]
88
91
  DatasetDependencyType = tuple["DatasetRecord", str]
@@ -1142,6 +1145,7 @@ class DatasetQuery:
1142
1145
  project_name=project_name,
1143
1146
  version=version,
1144
1147
  pull_dataset=True,
1148
+ update=update,
1145
1149
  )
1146
1150
  )
1147
1151
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.24.2
3
+ Version: 0.24.4
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -236,6 +236,22 @@ def test_read_storage_dependencies(cloud_test_catalog, cloud_type):
236
236
  assert dependencies[0].name == dep_name
237
237
 
238
238
 
239
+ def test_persist_after_mutate(test_session):
240
+ chain = (
241
+ dc.read_values(fib=[1, 1, 2, 3, 5, 8, 13, 21], session=test_session)
242
+ .map(mod3=lambda fib: fib % 3, output=int)
243
+ .group_by(
244
+ cnt=dc.func.count(),
245
+ partition_by="mod3",
246
+ )
247
+ .mutate(x=1)
248
+ .persist()
249
+ )
250
+
251
+ assert chain.count() == 3
252
+ assert set(chain.to_values("mod3")) == {0, 1, 2}
253
+
254
+
239
255
  def test_persist_not_affects_dependencies(tmp_dir, test_session):
240
256
  for i in range(4):
241
257
  (tmp_dir / f"file{i}.txt").write_text(f"file{i}")
@@ -248,6 +248,9 @@ def test_delta_update_check_num_calls(test_session, tmp_dir, tmp_path, capsys):
248
248
 
249
249
 
250
250
  def test_delta_update_no_diff(test_session, tmp_dir, tmp_path):
251
+ catalog = test_session.catalog
252
+ default_namespace_name = catalog.metastore.default_namespace_name
253
+ default_project_name = catalog.metastore.default_project_name
251
254
  ds_name = "delta_ds"
252
255
  path = tmp_dir.as_uri()
253
256
  tmp_dir = tmp_dir / "images"
@@ -296,7 +299,10 @@ def test_delta_update_no_diff(test_session, tmp_dir, tmp_path):
296
299
  with pytest.raises(DatasetNotFoundError) as exc_info:
297
300
  dc.read_dataset(ds_name, version="1.0.1")
298
301
 
299
- assert str(exc_info.value) == f"Dataset {ds_name} version 1.0.1 not found"
302
+ assert str(exc_info.value) == (
303
+ f"Dataset {ds_name} version 1.0.1 not found in namespace "
304
+ f"{default_namespace_name} and project {default_project_name}"
305
+ )
300
306
 
301
307
 
302
308
  @pytest.fixture
@@ -362,8 +362,16 @@ def test_read_dataset_remote_update_flag(
362
362
  assert dc.datasets().to_values("version") == ["1.0.0"]
363
363
  assert ds1.to_values("version")[0] == "1.0.0"
364
364
 
365
+ # Read without update and version returns a cached version
366
+ ds1 = dc.read_dataset(
367
+ f"{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs",
368
+ session=test_session,
369
+ )
370
+ assert dc.datasets().to_values("version") == ["1.0.0"]
371
+ assert ds1.to_values("version")[0] == "1.0.0"
372
+
365
373
  # Second read with update=True with the exact version
366
- # returns the same
374
+ # returns the same dataset version
367
375
  ds2 = dc.read_dataset(
368
376
  f"{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs",
369
377
  version="1.0.0",
@@ -385,9 +393,7 @@ def test_read_dataset_remote_update_flag(
385
393
  assert dc.datasets().to_values("version") == ["1.0.0"]
386
394
  assert ds3.to_values("version")[0] == "1.0.0"
387
395
 
388
- # Finally, read with update=False even with version specifier
389
- # that allows for newer version still bring the same version
390
- # as the one already downloaded
396
+ # Finally, read with update=True brings the latest version
391
397
  ds4 = dc.read_dataset(
392
398
  f"{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs",
393
399
  version=">=1.0.0",
@@ -399,6 +405,45 @@ def test_read_dataset_remote_update_flag(
399
405
  assert dc.datasets().to_values("version") == ["1.0.0", "2.0.0"]
400
406
 
401
407
 
408
+ @skip_if_not_sqlite
409
+ def test_read_dataset_remote_update_flag_no_version(
410
+ studio_token,
411
+ test_session,
412
+ remote_dataset_multi_version,
413
+ mock_dataset_info_endpoint,
414
+ mock_export_endpoint_with_urls,
415
+ mock_export_status_completed,
416
+ mock_s3_parquet_download,
417
+ mock_dataset_rows_fetcher_status_check,
418
+ requests_mock,
419
+ ):
420
+ """Test read_dataset with update=True flag to force remote check."""
421
+
422
+ # Mock the Studio API responses
423
+ mock_dataset_info_endpoint(remote_dataset_multi_version)
424
+ mock_s3_parquet_download()
425
+
426
+ # First read - downloads version 1.0.0
427
+ ds1 = dc.read_dataset(
428
+ f"{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs",
429
+ version="1.0.0",
430
+ session=test_session,
431
+ )
432
+ assert dc.datasets().to_values("version") == ["1.0.0"]
433
+ assert ds1.to_values("version")[0] == "1.0.0"
434
+
435
+ # Read with update=True w/o version specifier also
436
+ # checks the most recent remote version and brings it
437
+ ds4 = dc.read_dataset(
438
+ f"{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs",
439
+ update=True,
440
+ session=test_session,
441
+ )
442
+
443
+ assert ds4.to_values("version")[0] == "2.0.0"
444
+ assert dc.datasets().to_values("version") == ["1.0.0", "2.0.0"]
445
+
446
+
402
447
  @skip_if_not_sqlite
403
448
  def test_read_dataset_remote_version_specifiers(
404
449
  studio_token,
@@ -16,6 +16,7 @@ from pydantic import BaseModel
16
16
 
17
17
  import datachain as dc
18
18
  from datachain import Column
19
+ from datachain.data_storage import AbstractMetastore
19
20
  from datachain.error import (
20
21
  DatasetInvalidVersionError,
21
22
  DatasetNotFoundError,
@@ -3428,6 +3429,29 @@ def test_save_to_non_default_namespace_and_project(
3428
3429
  dc.read_dataset(name="fibonacci")
3429
3430
 
3430
3431
 
3432
+ def test_dataset_not_found_in_default_project(test_session):
3433
+ metastore = test_session.catalog.metastore
3434
+ with pytest.raises(DatasetNotFoundError) as excinfo:
3435
+ with patch.object(AbstractMetastore, "is_local_dataset", return_value=True):
3436
+ dc.read_dataset("fibonacci")
3437
+ assert str(excinfo.value) == (
3438
+ f"Dataset fibonacci not found in namespace {metastore.default_namespace_name}"
3439
+ f" and project {metastore.default_project_name}"
3440
+ )
3441
+
3442
+
3443
+ @pytest.mark.parametrize("project_created", (True, False))
3444
+ def test_dataset_not_found_in_non_default_project(test_session, project_created):
3445
+ if project_created:
3446
+ dc.create_project("dev", "numbers")
3447
+ with pytest.raises(DatasetNotFoundError) as excinfo:
3448
+ with patch.object(AbstractMetastore, "is_local_dataset", return_value=True):
3449
+ dc.read_dataset("dev.numbers.fibonacci")
3450
+ assert str(excinfo.value) == (
3451
+ "Dataset fibonacci not found in namespace dev and project numbers"
3452
+ )
3453
+
3454
+
3431
3455
  @pytest.mark.parametrize("use_settings", (True, False))
3432
3456
  @pytest.mark.parametrize("project_created_upfront", (True, False))
3433
3457
  def test_save_specify_only_non_default_project(
@@ -3571,3 +3595,60 @@ def test_save_create_project_not_allowed(test_session, allow_create_project):
3571
3595
  dc.read_values(fib=[1, 1, 2, 3, 5, 8], session=test_session).save(
3572
3596
  "dev.numbers.fibonacci"
3573
3597
  )
3598
+
3599
+
3600
+ def test_agg_partition_by_string_notation(test_session):
3601
+ """Test that agg method supports string notation for partition_by."""
3602
+
3603
+ class _ImageGroup(BaseModel):
3604
+ name: str
3605
+ size: int
3606
+
3607
+ def func(key, val) -> Iterator[tuple[File, _ImageGroup]]:
3608
+ n = "-".join(key)
3609
+ v = sum(val)
3610
+ yield File(path=n), _ImageGroup(name=n, size=v)
3611
+
3612
+ keys = ["n1", "n2", "n1"]
3613
+ values = [1, 5, 9]
3614
+
3615
+ # Test using string notation (NEW functionality)
3616
+ ds = dc.read_values(key=keys, val=values, session=test_session).agg(
3617
+ x=func,
3618
+ partition_by="key", # String notation instead of C("key")
3619
+ )
3620
+
3621
+ assert ds.order_by("x_1.name").to_values("x_1.name") == ["n1-n1", "n2"]
3622
+ assert ds.order_by("x_1.size").to_values("x_1.size") == [5, 10]
3623
+
3624
+
3625
+ def test_agg_partition_by_string_sequence(test_session):
3626
+ """Test that agg method supports sequence of strings for partition_by."""
3627
+
3628
+ class _ImageGroup(BaseModel):
3629
+ name: str
3630
+ size: int
3631
+
3632
+ def func(key1, key2, val) -> Iterator[tuple[File, _ImageGroup]]:
3633
+ n = f"{key1[0]}-{key2[0]}"
3634
+ v = sum(val)
3635
+ yield File(path=n), _ImageGroup(name=n, size=v)
3636
+
3637
+ key1_values = ["a", "a", "b"]
3638
+ key2_values = ["x", "y", "x"]
3639
+ values = [1, 5, 9]
3640
+
3641
+ # Test using sequence of strings (NEW functionality)
3642
+ ds = dc.read_values(
3643
+ key1=key1_values, key2=key2_values, val=values, session=test_session
3644
+ ).agg(
3645
+ x=func,
3646
+ partition_by=["key1", "key2"], # Sequence of strings
3647
+ )
3648
+
3649
+ result_names = ds.order_by("x_1.name").to_values("x_1.name")
3650
+ result_sizes = ds.order_by("x_1.size").to_values("x_1.size")
3651
+
3652
+ # Should have 3 partitions: (a,x), (a,y), (b,x)
3653
+ assert len(result_names) == 3
3654
+ assert len(result_sizes) == 3
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes