datachain 0.22.0__tar.gz → 0.24.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (399) hide show
  1. {datachain-0.22.0 → datachain-0.24.0}/PKG-INFO +1 -1
  2. {datachain-0.22.0 → datachain-0.24.0}/docs/guide/env.md +4 -0
  3. {datachain-0.22.0 → datachain-0.24.0}/docs/guide/namespaces.md +43 -1
  4. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/catalog/catalog.py +58 -13
  5. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/commands/datasets.py +4 -10
  6. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/data_storage/metastore.py +13 -2
  7. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/data_storage/sqlite.py +6 -2
  8. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/dataset.py +37 -6
  9. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/datachain.py +6 -12
  10. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/datasets.py +60 -44
  11. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/listings.py +2 -6
  12. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/records.py +1 -1
  13. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/projects.py +1 -1
  14. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/signal_schema.py +8 -0
  15. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/query/dataset.py +2 -8
  16. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/remote/studio.py +4 -3
  17. {datachain-0.22.0 → datachain-0.24.0}/src/datachain.egg-info/PKG-INFO +1 -1
  18. {datachain-0.22.0 → datachain-0.24.0}/src/datachain.egg-info/SOURCES.txt +2 -0
  19. {datachain-0.22.0 → datachain-0.24.0}/tests/conftest.py +169 -0
  20. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_datachain.py +46 -0
  21. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_dataset_query.py +2 -4
  22. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_delta.py +3 -3
  23. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_ls.py +4 -2
  24. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_pull.py +61 -199
  25. datachain-0.24.0/tests/func/test_read_dataset_remote.py +555 -0
  26. datachain-0.24.0/tests/func/test_read_dataset_version_specifiers.py +88 -0
  27. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_retry.py +2 -2
  28. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_datachain.py +121 -2
  29. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_dataset.py +1 -1
  30. {datachain-0.22.0 → datachain-0.24.0}/.cruft.json +0 -0
  31. {datachain-0.22.0 → datachain-0.24.0}/.gitattributes +0 -0
  32. {datachain-0.22.0 → datachain-0.24.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  33. {datachain-0.22.0 → datachain-0.24.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  34. {datachain-0.22.0 → datachain-0.24.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  35. {datachain-0.22.0 → datachain-0.24.0}/.github/codecov.yaml +0 -0
  36. {datachain-0.22.0 → datachain-0.24.0}/.github/dependabot.yml +0 -0
  37. {datachain-0.22.0 → datachain-0.24.0}/.github/workflows/benchmarks.yml +0 -0
  38. {datachain-0.22.0 → datachain-0.24.0}/.github/workflows/release.yml +0 -0
  39. {datachain-0.22.0 → datachain-0.24.0}/.github/workflows/tests-studio.yml +0 -0
  40. {datachain-0.22.0 → datachain-0.24.0}/.github/workflows/tests.yml +0 -0
  41. {datachain-0.22.0 → datachain-0.24.0}/.github/workflows/update-template.yaml +0 -0
  42. {datachain-0.22.0 → datachain-0.24.0}/.gitignore +0 -0
  43. {datachain-0.22.0 → datachain-0.24.0}/.pre-commit-config.yaml +0 -0
  44. {datachain-0.22.0 → datachain-0.24.0}/CODE_OF_CONDUCT.rst +0 -0
  45. {datachain-0.22.0 → datachain-0.24.0}/LICENSE +0 -0
  46. {datachain-0.22.0 → datachain-0.24.0}/README.rst +0 -0
  47. {datachain-0.22.0 → datachain-0.24.0}/docs/assets/captioned_cartoons.png +0 -0
  48. {datachain-0.22.0 → datachain-0.24.0}/docs/assets/datachain-white.svg +0 -0
  49. {datachain-0.22.0 → datachain-0.24.0}/docs/assets/datachain.svg +0 -0
  50. {datachain-0.22.0 → datachain-0.24.0}/docs/commands/auth/login.md +0 -0
  51. {datachain-0.22.0 → datachain-0.24.0}/docs/commands/auth/logout.md +0 -0
  52. {datachain-0.22.0 → datachain-0.24.0}/docs/commands/auth/team.md +0 -0
  53. {datachain-0.22.0 → datachain-0.24.0}/docs/commands/auth/token.md +0 -0
  54. {datachain-0.22.0 → datachain-0.24.0}/docs/commands/index.md +0 -0
  55. {datachain-0.22.0 → datachain-0.24.0}/docs/commands/job/cancel.md +0 -0
  56. {datachain-0.22.0 → datachain-0.24.0}/docs/commands/job/clusters.md +0 -0
  57. {datachain-0.22.0 → datachain-0.24.0}/docs/commands/job/logs.md +0 -0
  58. {datachain-0.22.0 → datachain-0.24.0}/docs/commands/job/ls.md +0 -0
  59. {datachain-0.22.0 → datachain-0.24.0}/docs/commands/job/run.md +0 -0
  60. {datachain-0.22.0 → datachain-0.24.0}/docs/contributing.md +0 -0
  61. {datachain-0.22.0 → datachain-0.24.0}/docs/css/github-permalink-style.css +0 -0
  62. {datachain-0.22.0 → datachain-0.24.0}/docs/examples.md +0 -0
  63. {datachain-0.22.0 → datachain-0.24.0}/docs/guide/db_migrations.md +0 -0
  64. {datachain-0.22.0 → datachain-0.24.0}/docs/guide/delta.md +0 -0
  65. {datachain-0.22.0 → datachain-0.24.0}/docs/guide/index.md +0 -0
  66. {datachain-0.22.0 → datachain-0.24.0}/docs/guide/processing.md +0 -0
  67. {datachain-0.22.0 → datachain-0.24.0}/docs/guide/remotes.md +0 -0
  68. {datachain-0.22.0 → datachain-0.24.0}/docs/guide/retry.md +0 -0
  69. {datachain-0.22.0 → datachain-0.24.0}/docs/index.md +0 -0
  70. {datachain-0.22.0 → datachain-0.24.0}/docs/overrides/main.html +0 -0
  71. {datachain-0.22.0 → datachain-0.24.0}/docs/quick-start.md +0 -0
  72. {datachain-0.22.0 → datachain-0.24.0}/docs/references/data-types/arrowrow.md +0 -0
  73. {datachain-0.22.0 → datachain-0.24.0}/docs/references/data-types/bbox.md +0 -0
  74. {datachain-0.22.0 → datachain-0.24.0}/docs/references/data-types/file.md +0 -0
  75. {datachain-0.22.0 → datachain-0.24.0}/docs/references/data-types/imagefile.md +0 -0
  76. {datachain-0.22.0 → datachain-0.24.0}/docs/references/data-types/index.md +0 -0
  77. {datachain-0.22.0 → datachain-0.24.0}/docs/references/data-types/pose.md +0 -0
  78. {datachain-0.22.0 → datachain-0.24.0}/docs/references/data-types/segment.md +0 -0
  79. {datachain-0.22.0 → datachain-0.24.0}/docs/references/data-types/tarvfile.md +0 -0
  80. {datachain-0.22.0 → datachain-0.24.0}/docs/references/data-types/textfile.md +0 -0
  81. {datachain-0.22.0 → datachain-0.24.0}/docs/references/data-types/videofile.md +0 -0
  82. {datachain-0.22.0 → datachain-0.24.0}/docs/references/datachain.md +0 -0
  83. {datachain-0.22.0 → datachain-0.24.0}/docs/references/func.md +0 -0
  84. {datachain-0.22.0 → datachain-0.24.0}/docs/references/index.md +0 -0
  85. {datachain-0.22.0 → datachain-0.24.0}/docs/references/toolkit.md +0 -0
  86. {datachain-0.22.0 → datachain-0.24.0}/docs/references/torch.md +0 -0
  87. {datachain-0.22.0 → datachain-0.24.0}/docs/references/udf.md +0 -0
  88. {datachain-0.22.0 → datachain-0.24.0}/docs/tutorials.md +0 -0
  89. {datachain-0.22.0 → datachain-0.24.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  90. {datachain-0.22.0 → datachain-0.24.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  91. {datachain-0.22.0 → datachain-0.24.0}/examples/computer_vision/openimage-detect.py +0 -0
  92. {datachain-0.22.0 → datachain-0.24.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  93. {datachain-0.22.0 → datachain-0.24.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  94. {datachain-0.22.0 → datachain-0.24.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  95. {datachain-0.22.0 → datachain-0.24.0}/examples/get_started/common_sql_functions.py +0 -0
  96. {datachain-0.22.0 → datachain-0.24.0}/examples/get_started/json-csv-reader.py +0 -0
  97. {datachain-0.22.0 → datachain-0.24.0}/examples/get_started/torch-loader.py +0 -0
  98. {datachain-0.22.0 → datachain-0.24.0}/examples/get_started/udfs/parallel.py +0 -0
  99. {datachain-0.22.0 → datachain-0.24.0}/examples/get_started/udfs/simple.py +0 -0
  100. {datachain-0.22.0 → datachain-0.24.0}/examples/get_started/udfs/stateful.py +0 -0
  101. {datachain-0.22.0 → datachain-0.24.0}/examples/incremental_processing/delta.py +0 -0
  102. {datachain-0.22.0 → datachain-0.24.0}/examples/incremental_processing/retry.py +0 -0
  103. {datachain-0.22.0 → datachain-0.24.0}/examples/incremental_processing/utils.py +0 -0
  104. {datachain-0.22.0 → datachain-0.24.0}/examples/llm_and_nlp/claude-query.py +0 -0
  105. {datachain-0.22.0 → datachain-0.24.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  106. {datachain-0.22.0 → datachain-0.24.0}/examples/multimodal/clip_inference.py +0 -0
  107. {datachain-0.22.0 → datachain-0.24.0}/examples/multimodal/hf_pipeline.py +0 -0
  108. {datachain-0.22.0 → datachain-0.24.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  109. {datachain-0.22.0 → datachain-0.24.0}/examples/multimodal/wds.py +0 -0
  110. {datachain-0.22.0 → datachain-0.24.0}/examples/multimodal/wds_filtered.py +0 -0
  111. {datachain-0.22.0 → datachain-0.24.0}/mkdocs.yml +0 -0
  112. {datachain-0.22.0 → datachain-0.24.0}/noxfile.py +0 -0
  113. {datachain-0.22.0 → datachain-0.24.0}/pyproject.toml +0 -0
  114. {datachain-0.22.0 → datachain-0.24.0}/setup.cfg +0 -0
  115. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/__init__.py +0 -0
  116. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/__main__.py +0 -0
  117. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/asyn.py +0 -0
  118. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cache.py +0 -0
  119. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/catalog/__init__.py +0 -0
  120. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/catalog/datasource.py +0 -0
  121. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/catalog/loader.py +0 -0
  122. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/__init__.py +0 -0
  123. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/commands/__init__.py +0 -0
  124. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/commands/du.py +0 -0
  125. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/commands/index.py +0 -0
  126. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/commands/ls.py +0 -0
  127. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/commands/misc.py +0 -0
  128. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/commands/query.py +0 -0
  129. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/commands/show.py +0 -0
  130. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/parser/__init__.py +0 -0
  131. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/parser/job.py +0 -0
  132. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/parser/studio.py +0 -0
  133. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/parser/utils.py +0 -0
  134. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/cli/utils.py +0 -0
  135. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/client/__init__.py +0 -0
  136. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/client/azure.py +0 -0
  137. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/client/fileslice.py +0 -0
  138. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/client/fsspec.py +0 -0
  139. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/client/gcs.py +0 -0
  140. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/client/hf.py +0 -0
  141. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/client/local.py +0 -0
  142. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/client/s3.py +0 -0
  143. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/config.py +0 -0
  144. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/data_storage/__init__.py +0 -0
  145. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/data_storage/db_engine.py +0 -0
  146. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/data_storage/job.py +0 -0
  147. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/data_storage/schema.py +0 -0
  148. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/data_storage/serializer.py +0 -0
  149. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/data_storage/warehouse.py +0 -0
  150. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/delta.py +0 -0
  151. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/diff/__init__.py +0 -0
  152. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/error.py +0 -0
  153. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/fs/__init__.py +0 -0
  154. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/fs/reference.py +0 -0
  155. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/fs/utils.py +0 -0
  156. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/func/__init__.py +0 -0
  157. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/func/aggregate.py +0 -0
  158. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/func/array.py +0 -0
  159. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/func/base.py +0 -0
  160. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/func/conditional.py +0 -0
  161. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/func/func.py +0 -0
  162. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/func/numeric.py +0 -0
  163. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/func/path.py +0 -0
  164. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/func/random.py +0 -0
  165. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/func/string.py +0 -0
  166. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/func/window.py +0 -0
  167. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/job.py +0 -0
  168. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/__init__.py +0 -0
  169. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/arrow.py +0 -0
  170. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/clip.py +0 -0
  171. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/convert/__init__.py +0 -0
  172. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/convert/flatten.py +0 -0
  173. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  174. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  175. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/convert/unflatten.py +0 -0
  176. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  177. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/data_model.py +0 -0
  178. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dataset_info.py +0 -0
  179. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/__init__.py +0 -0
  180. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/csv.py +0 -0
  181. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/database.py +0 -0
  182. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/hf.py +0 -0
  183. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/json.py +0 -0
  184. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/pandas.py +0 -0
  185. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/parquet.py +0 -0
  186. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/storage.py +0 -0
  187. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/utils.py +0 -0
  188. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/dc/values.py +0 -0
  189. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/file.py +0 -0
  190. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/hf.py +0 -0
  191. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/image.py +0 -0
  192. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/listing.py +0 -0
  193. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/listing_info.py +0 -0
  194. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/meta_formats.py +0 -0
  195. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/model_store.py +0 -0
  196. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/namespaces.py +0 -0
  197. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/pytorch.py +0 -0
  198. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/settings.py +0 -0
  199. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/tar.py +0 -0
  200. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/text.py +0 -0
  201. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/udf.py +0 -0
  202. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/udf_signature.py +0 -0
  203. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/utils.py +0 -0
  204. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/video.py +0 -0
  205. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/webdataset.py +0 -0
  206. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/lib/webdataset_laion.py +0 -0
  207. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/listing.py +0 -0
  208. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/model/__init__.py +0 -0
  209. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/model/bbox.py +0 -0
  210. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/model/pose.py +0 -0
  211. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/model/segment.py +0 -0
  212. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  213. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  214. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/model/ultralytics/pose.py +0 -0
  215. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/model/ultralytics/segment.py +0 -0
  216. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/model/utils.py +0 -0
  217. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/namespace.py +0 -0
  218. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/node.py +0 -0
  219. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/nodes_fetcher.py +0 -0
  220. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/nodes_thread_pool.py +0 -0
  221. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/progress.py +0 -0
  222. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/project.py +0 -0
  223. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/py.typed +0 -0
  224. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/query/__init__.py +0 -0
  225. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/query/batch.py +0 -0
  226. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/query/dispatch.py +0 -0
  227. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/query/metrics.py +0 -0
  228. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/query/params.py +0 -0
  229. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/query/queue.py +0 -0
  230. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/query/schema.py +0 -0
  231. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/query/session.py +0 -0
  232. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/query/udf.py +0 -0
  233. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/query/utils.py +0 -0
  234. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/remote/__init__.py +0 -0
  235. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/script_meta.py +0 -0
  236. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/semver.py +0 -0
  237. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/__init__.py +0 -0
  238. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/default/__init__.py +0 -0
  239. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/default/base.py +0 -0
  240. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/functions/__init__.py +0 -0
  241. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/functions/aggregate.py +0 -0
  242. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/functions/array.py +0 -0
  243. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/functions/conditional.py +0 -0
  244. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/functions/numeric.py +0 -0
  245. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/functions/path.py +0 -0
  246. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/functions/random.py +0 -0
  247. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/functions/string.py +0 -0
  248. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/selectable.py +0 -0
  249. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  250. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/sqlite/base.py +0 -0
  251. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/sqlite/types.py +0 -0
  252. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/sqlite/vector.py +0 -0
  253. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/types.py +0 -0
  254. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/sql/utils.py +0 -0
  255. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/studio.py +0 -0
  256. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/telemetry.py +0 -0
  257. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/toolkit/__init__.py +0 -0
  258. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/toolkit/split.py +0 -0
  259. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/torch/__init__.py +0 -0
  260. {datachain-0.22.0 → datachain-0.24.0}/src/datachain/utils.py +0 -0
  261. {datachain-0.22.0 → datachain-0.24.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  262. {datachain-0.22.0 → datachain-0.24.0}/src/datachain.egg-info/entry_points.txt +0 -0
  263. {datachain-0.22.0 → datachain-0.24.0}/src/datachain.egg-info/requires.txt +0 -0
  264. {datachain-0.22.0 → datachain-0.24.0}/src/datachain.egg-info/top_level.txt +0 -0
  265. {datachain-0.22.0 → datachain-0.24.0}/tests/__init__.py +0 -0
  266. {datachain-0.22.0 → datachain-0.24.0}/tests/benchmarks/__init__.py +0 -0
  267. {datachain-0.22.0 → datachain-0.24.0}/tests/benchmarks/conftest.py +0 -0
  268. {datachain-0.22.0 → datachain-0.24.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  269. {datachain-0.22.0 → datachain-0.24.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  270. {datachain-0.22.0 → datachain-0.24.0}/tests/benchmarks/datasets/.gitignore +0 -0
  271. {datachain-0.22.0 → datachain-0.24.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  272. {datachain-0.22.0 → datachain-0.24.0}/tests/benchmarks/test_datachain.py +0 -0
  273. {datachain-0.22.0 → datachain-0.24.0}/tests/benchmarks/test_ls.py +0 -0
  274. {datachain-0.22.0 → datachain-0.24.0}/tests/benchmarks/test_version.py +0 -0
  275. {datachain-0.22.0 → datachain-0.24.0}/tests/data.py +0 -0
  276. {datachain-0.22.0 → datachain-0.24.0}/tests/examples/__init__.py +0 -0
  277. {datachain-0.22.0 → datachain-0.24.0}/tests/examples/test_examples.py +0 -0
  278. {datachain-0.22.0 → datachain-0.24.0}/tests/examples/test_wds_e2e.py +0 -0
  279. {datachain-0.22.0 → datachain-0.24.0}/tests/examples/wds_data.py +0 -0
  280. {datachain-0.22.0 → datachain-0.24.0}/tests/func/__init__.py +0 -0
  281. {datachain-0.22.0 → datachain-0.24.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  282. {datachain-0.22.0 → datachain-0.24.0}/tests/func/data/lena.jpg +0 -0
  283. {datachain-0.22.0 → datachain-0.24.0}/tests/func/fake-service-account-credentials.json +0 -0
  284. {datachain-0.22.0 → datachain-0.24.0}/tests/func/functions/__init__.py +0 -0
  285. {datachain-0.22.0 → datachain-0.24.0}/tests/func/functions/test_aggregate.py +0 -0
  286. {datachain-0.22.0 → datachain-0.24.0}/tests/func/functions/test_array.py +0 -0
  287. {datachain-0.22.0 → datachain-0.24.0}/tests/func/functions/test_conditional.py +0 -0
  288. {datachain-0.22.0 → datachain-0.24.0}/tests/func/functions/test_numeric.py +0 -0
  289. {datachain-0.22.0 → datachain-0.24.0}/tests/func/functions/test_path.py +0 -0
  290. {datachain-0.22.0 → datachain-0.24.0}/tests/func/functions/test_random.py +0 -0
  291. {datachain-0.22.0 → datachain-0.24.0}/tests/func/functions/test_string.py +0 -0
  292. {datachain-0.22.0 → datachain-0.24.0}/tests/func/model/__init__.py +0 -0
  293. {datachain-0.22.0 → datachain-0.24.0}/tests/func/model/data/running-mask0.png +0 -0
  294. {datachain-0.22.0 → datachain-0.24.0}/tests/func/model/data/running-mask1.png +0 -0
  295. {datachain-0.22.0 → datachain-0.24.0}/tests/func/model/data/running.jpg +0 -0
  296. {datachain-0.22.0 → datachain-0.24.0}/tests/func/model/data/ships.jpg +0 -0
  297. {datachain-0.22.0 → datachain-0.24.0}/tests/func/model/test_yolo.py +0 -0
  298. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_batching.py +0 -0
  299. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_catalog.py +0 -0
  300. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_client.py +0 -0
  301. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_cloud_transfer.py +0 -0
  302. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_data_storage.py +0 -0
  303. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_datachain_merge.py +0 -0
  304. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_datasets.py +0 -0
  305. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_feature_pickling.py +0 -0
  306. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_file.py +0 -0
  307. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_hf.py +0 -0
  308. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_hidden_field.py +0 -0
  309. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_image.py +0 -0
  310. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_listing.py +0 -0
  311. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_meta_formats.py +0 -0
  312. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_metastore.py +0 -0
  313. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_metrics.py +0 -0
  314. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_pytorch.py +0 -0
  315. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_query.py +0 -0
  316. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_read_database.py +0 -0
  317. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_session.py +0 -0
  318. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_toolkit.py +0 -0
  319. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_video.py +0 -0
  320. {datachain-0.22.0 → datachain-0.24.0}/tests/func/test_warehouse.py +0 -0
  321. {datachain-0.22.0 → datachain-0.24.0}/tests/scripts/feature_class.py +0 -0
  322. {datachain-0.22.0 → datachain-0.24.0}/tests/scripts/feature_class_exception.py +0 -0
  323. {datachain-0.22.0 → datachain-0.24.0}/tests/scripts/feature_class_parallel.py +0 -0
  324. {datachain-0.22.0 → datachain-0.24.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  325. {datachain-0.22.0 → datachain-0.24.0}/tests/scripts/name_len_slow.py +0 -0
  326. {datachain-0.22.0 → datachain-0.24.0}/tests/test_atomicity.py +0 -0
  327. {datachain-0.22.0 → datachain-0.24.0}/tests/test_cli_e2e.py +0 -0
  328. {datachain-0.22.0 → datachain-0.24.0}/tests/test_cli_studio.py +0 -0
  329. {datachain-0.22.0 → datachain-0.24.0}/tests/test_import_time.py +0 -0
  330. {datachain-0.22.0 → datachain-0.24.0}/tests/test_query_e2e.py +0 -0
  331. {datachain-0.22.0 → datachain-0.24.0}/tests/test_telemetry.py +0 -0
  332. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/__init__.py +0 -0
  333. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/__init__.py +0 -0
  334. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/conftest.py +0 -0
  335. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_arrow.py +0 -0
  336. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_clip.py +0 -0
  337. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  338. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  339. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_diff.py +0 -0
  340. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_feature.py +0 -0
  341. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_feature_utils.py +0 -0
  342. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_file.py +0 -0
  343. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_hf.py +0 -0
  344. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_image.py +0 -0
  345. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_listing_info.py +0 -0
  346. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_namespace.py +0 -0
  347. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_project.py +0 -0
  348. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  349. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_schema.py +0 -0
  350. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_signal_schema.py +0 -0
  351. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  352. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_text.py +0 -0
  353. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_udf.py +0 -0
  354. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_udf_signature.py +0 -0
  355. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_utils.py +0 -0
  356. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/lib/test_webdataset.py +0 -0
  357. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/model/__init__.py +0 -0
  358. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/model/test_bbox.py +0 -0
  359. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/model/test_pose.py +0 -0
  360. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/model/test_segment.py +0 -0
  361. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/model/test_utils.py +0 -0
  362. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/sql/__init__.py +0 -0
  363. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  364. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  365. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  366. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/sql/test_array.py +0 -0
  367. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/sql/test_conditional.py +0 -0
  368. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/sql/test_path.py +0 -0
  369. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/sql/test_random.py +0 -0
  370. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/sql/test_selectable.py +0 -0
  371. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/sql/test_string.py +0 -0
  372. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_asyn.py +0 -0
  373. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_cache.py +0 -0
  374. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_catalog.py +0 -0
  375. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_catalog_loader.py +0 -0
  376. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_cli_parsing.py +0 -0
  377. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_client.py +0 -0
  378. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_client_gcs.py +0 -0
  379. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_client_s3.py +0 -0
  380. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_config.py +0 -0
  381. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_data_storage.py +0 -0
  382. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_database_engine.py +0 -0
  383. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_dispatch.py +0 -0
  384. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_fileslice.py +0 -0
  385. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_func.py +0 -0
  386. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_listing.py +0 -0
  387. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_metastore.py +0 -0
  388. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_module_exports.py +0 -0
  389. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_pytorch.py +0 -0
  390. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_query.py +0 -0
  391. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_query_metrics.py +0 -0
  392. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_query_params.py +0 -0
  393. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_script_meta.py +0 -0
  394. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_semver.py +0 -0
  395. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_serializer.py +0 -0
  396. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_session.py +0 -0
  397. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_utils.py +0 -0
  398. {datachain-0.22.0 → datachain-0.24.0}/tests/unit/test_warehouse.py +0 -0
  399. {datachain-0.22.0 → datachain-0.24.0}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.22.0
3
+ Version: 0.24.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -15,4 +15,8 @@ List of environment variables used to configure DataChain behavior.
15
15
  - `DATACHAIN_STUDIO_TOKEN` – Authentication token for Studio.
16
16
  - `DATACHAIN_STUDIO_TEAM` – Studio team name.
17
17
 
18
+ ### Namespaces and projects
19
+ - `DATACHAIN_NAMESPACE` – Namespace name to use as default.
20
+ - `DATACHAIN_PROJECT` – Project name to use as default, or a namespace name and project name combined as `namespace.project`. Example: `DATACHAIN_PROJECT=dev.analytics`
21
+
18
22
  Note: Some environment variables are used internally and may not be documented here. For the most up-to-date list, refer to the source code.
@@ -82,6 +82,49 @@ This is equivalent to saving to `dev.analytics.metrics`.
82
82
 
83
83
  In CLI, `.settings()` is only supported when both `namespace` and `project` are set to `"local"`.
84
84
 
85
+ ## Setting Namespace and Project via Environment Variables
86
+
87
+ In addition to using `.settings()`, you can configure the namespace and project using environment variables:
88
+
89
+ - `DATACHAIN_NAMESPACE` sets the namespace.
90
+ - `DATACHAIN_PROJECT` sets the project name, or both the namespace and project using the format `namespace.project`.
91
+
92
+ ### Examples
93
+
94
+ ```
95
+ # Set namespace only
96
+ export DATACHAIN_NAMESPACE=dev
97
+
98
+ # Set project only
99
+ export DATACHAIN_PROJECT=analytics
100
+
101
+ # Set both namespace and project
102
+ export DATACHAIN_PROJECT=dev.analytics
103
+ ```
104
+
105
+ ## How Namespace and Project Are Resolved
106
+
107
+ When determining which namespace and project to use, DataChain applies the following precedence:
108
+
109
+ 1. **Fully qualified dataset name**
110
+ If the dataset name includes both the namespace and project, these values take highest precedence.
111
+ ```python
112
+ dc.read_dataset("dev.analytics.metrics")
113
+ ```
114
+ 2. **Explicit settings in code**
115
+ Values provided via `.settings()` or passed directly to `read_dataset()` or similar methods.
116
+ ```python
117
+ dc.settings(namespace="dev", project="analytics")
118
+ dc.read_dataset("metrics", namespace="dev", project="analytics")
119
+ ```
120
+ 3. **Environment variables**
121
+ Namespace and project set using environment variables:
122
+ ```console
123
+ export DATACHAIN_PROJECT=dev.analytics
124
+ ```
125
+ 4. **Defaults**
126
+ If none of the above are provided, DataChain falls back to the default namespace and project.
127
+
85
128
  ## Reading a Dataset from a Project
86
129
 
87
130
  To read a dataset from a specific namespace and project:
@@ -116,4 +159,3 @@ dc.read_values(scores=[0.8, 1.5, 2.1]).save("metrics")
116
159
 
117
160
  ds = dc.read_dataset("local.local.metrics")
118
161
  ds.show()
119
- ```
@@ -49,6 +49,7 @@ from datachain.error import (
49
49
  DatasetInvalidVersionError,
50
50
  DatasetNotFoundError,
51
51
  DatasetVersionNotFoundError,
52
+ NamespaceNotFoundError,
52
53
  ProjectNotFoundError,
53
54
  QueryScriptCancelError,
54
55
  QueryScriptRunError,
@@ -1059,6 +1060,39 @@ class Catalog:
1059
1060
 
1060
1061
  return self.get_dataset(name, project)
1061
1062
 
1063
+ def get_full_dataset_name(
1064
+ self,
1065
+ name: str,
1066
+ project_name: Optional[str] = None,
1067
+ namespace_name: Optional[str] = None,
1068
+ ) -> tuple[str, str, str]:
1069
+ """
1070
+ Returns dataset name together with separated namespace and project name.
1071
+ It takes into account all the ways namespace and project can be added.
1072
+ """
1073
+ parsed_namespace_name, parsed_project_name, name = parse_dataset_name(name)
1074
+
1075
+ namespace_env = os.environ.get("DATACHAIN_NAMESPACE")
1076
+ project_env = os.environ.get("DATACHAIN_PROJECT")
1077
+ if project_env and len(project_env.split(".")) == 2:
1078
+ # we allow setting both namespace and project in DATACHAIN_PROJECT
1079
+ namespace_env, project_env = project_env.split(".")
1080
+
1081
+ namespace_name = (
1082
+ parsed_namespace_name
1083
+ or namespace_name
1084
+ or namespace_env
1085
+ or self.metastore.default_namespace_name
1086
+ )
1087
+ project_name = (
1088
+ parsed_project_name
1089
+ or project_name
1090
+ or project_env
1091
+ or self.metastore.default_project_name
1092
+ )
1093
+
1094
+ return namespace_name, project_name, name
1095
+
1062
1096
  def get_dataset(
1063
1097
  self, name: str, project: Optional[Project] = None
1064
1098
  ) -> DatasetRecord:
@@ -1074,21 +1108,26 @@ class Catalog:
1074
1108
  namespace_name: str,
1075
1109
  project_name: str,
1076
1110
  version: Optional[str] = None,
1111
+ pull_dataset: bool = False,
1112
+ update: bool = False,
1077
1113
  ) -> DatasetRecord:
1078
- try:
1079
- project = self.metastore.get_project(project_name, namespace_name)
1080
- ds = self.get_dataset(name, project)
1081
- if version and not ds.has_version(version):
1082
- raise DatasetVersionNotFoundError(
1083
- f"Dataset {name} does not have version {version}"
1084
- )
1085
- return ds
1114
+ if self.metastore.is_local_dataset(namespace_name) or not update:
1115
+ try:
1116
+ project = self.metastore.get_project(project_name, namespace_name)
1117
+ ds = self.get_dataset(name, project)
1118
+ if not version or ds.has_version(version):
1119
+ return ds
1120
+ except (NamespaceNotFoundError, ProjectNotFoundError, DatasetNotFoundError):
1121
+ pass
1122
+
1123
+ if self.metastore.is_local_dataset(namespace_name):
1124
+ raise DatasetNotFoundError(
1125
+ f"Dataset {name}"
1126
+ + (f" version {version} " if version else " ")
1127
+ + "not found"
1128
+ )
1086
1129
 
1087
- except (
1088
- ProjectNotFoundError,
1089
- DatasetNotFoundError,
1090
- DatasetVersionNotFoundError,
1091
- ):
1130
+ if pull_dataset:
1092
1131
  print("Dataset not found in local catalog, trying to get from studio")
1093
1132
  remote_ds_uri = create_dataset_uri(
1094
1133
  name, namespace_name, project_name, version
@@ -1103,6 +1142,8 @@ class Catalog:
1103
1142
  name, self.metastore.get_project(project_name, namespace_name)
1104
1143
  )
1105
1144
 
1145
+ return self.get_remote_dataset(namespace_name, project_name, name)
1146
+
1106
1147
  def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
1107
1148
  """Returns dataset that contains version with specific uuid"""
1108
1149
  for dataset in self.ls_datasets():
@@ -1119,6 +1160,10 @@ class Catalog:
1119
1160
 
1120
1161
  info_response = studio_client.dataset_info(namespace, project, name)
1121
1162
  if not info_response.ok:
1163
+ if info_response.status == 404:
1164
+ raise DatasetNotFoundError(
1165
+ f"Dataset {namespace}.{project}.{name} not found"
1166
+ )
1122
1167
  raise DataChainError(info_response.message)
1123
1168
 
1124
1169
  dataset_info = info_response.data
@@ -8,7 +8,6 @@ if TYPE_CHECKING:
8
8
 
9
9
  from datachain.cli.utils import determine_flavors
10
10
  from datachain.config import Config
11
- from datachain.dataset import parse_dataset_name
12
11
  from datachain.error import DataChainError, DatasetNotFoundError
13
12
  from datachain.studio import list_datasets as list_datasets_studio
14
13
 
@@ -106,9 +105,8 @@ def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
106
105
 
107
106
 
108
107
  def list_datasets_local_versions(catalog: "Catalog", name: str):
109
- namespace_name, project_name, name = parse_dataset_name(name)
110
- namespace_name = namespace_name or catalog.metastore.default_namespace_name
111
- project_name = project_name or catalog.metastore.default_project_name
108
+ namespace_name, project_name, name = catalog.get_full_dataset_name(name)
109
+
112
110
  project = catalog.metastore.get_project(project_name, namespace_name)
113
111
  ds = catalog.get_dataset(name, project)
114
112
  for v in ds.versions:
@@ -137,9 +135,7 @@ def rm_dataset(
137
135
  studio: Optional[bool] = False,
138
136
  team: Optional[str] = None,
139
137
  ):
140
- namespace_name, project_name, name = parse_dataset_name(name)
141
- namespace_name = namespace_name or catalog.metastore.default_namespace_name
142
- project_name = project_name or catalog.metastore.default_project_name
138
+ namespace_name, project_name, name = catalog.get_full_dataset_name(name)
143
139
 
144
140
  if not catalog.metastore.is_local_dataset(namespace_name) and studio:
145
141
  from datachain.studio import remove_studio_dataset
@@ -166,9 +162,7 @@ def edit_dataset(
166
162
  attrs: Optional[list[str]] = None,
167
163
  team: Optional[str] = None,
168
164
  ):
169
- namespace_name, project_name, name = parse_dataset_name(name)
170
- namespace_name = namespace_name or catalog.metastore.default_namespace_name
171
- project_name = project_name or catalog.metastore.default_project_name
165
+ namespace_name, project_name, name = catalog.get_full_dataset_name(name)
172
166
 
173
167
  if catalog.metastore.is_local_dataset(namespace_name):
174
168
  try:
@@ -132,6 +132,7 @@ class AbstractMetastore(ABC, Serializable):
132
132
  description: Optional[str] = None,
133
133
  uuid: Optional[str] = None,
134
134
  ignore_if_exists: bool = True,
135
+ validate: bool = True,
135
136
  **kwargs,
136
137
  ) -> Namespace:
137
138
  """Creates new namespace"""
@@ -192,6 +193,7 @@ class AbstractMetastore(ABC, Serializable):
192
193
  description: Optional[str] = None,
193
194
  uuid: Optional[str] = None,
194
195
  ignore_if_exists: bool = True,
196
+ validate: bool = True,
195
197
  **kwargs,
196
198
  ) -> Project:
197
199
  """Creates new project in specific namespace"""
@@ -725,8 +727,11 @@ class AbstractDBMetastore(AbstractMetastore):
725
727
  description: Optional[str] = None,
726
728
  uuid: Optional[str] = None,
727
729
  ignore_if_exists: bool = True,
730
+ validate: bool = True,
728
731
  **kwargs,
729
732
  ) -> Namespace:
733
+ if validate:
734
+ Namespace.validate_name(name)
730
735
  query = self._namespaces_insert().values(
731
736
  name=name,
732
737
  uuid=uuid or str(uuid4()),
@@ -775,12 +780,15 @@ class AbstractDBMetastore(AbstractMetastore):
775
780
  description: Optional[str] = None,
776
781
  uuid: Optional[str] = None,
777
782
  ignore_if_exists: bool = True,
783
+ validate: bool = True,
778
784
  **kwargs,
779
785
  ) -> Project:
786
+ if validate:
787
+ Project.validate_name(name)
780
788
  try:
781
789
  namespace = self.get_namespace(namespace_name)
782
790
  except NamespaceNotFoundError:
783
- namespace = self.create_namespace(namespace_name)
791
+ namespace = self.create_namespace(namespace_name, validate=validate)
784
792
 
785
793
  query = self._projects_insert().values(
786
794
  namespace_id=namespace.id,
@@ -817,11 +825,14 @@ class AbstractDBMetastore(AbstractMetastore):
817
825
  """Gets a single project inside some namespace by name"""
818
826
  n = self._namespaces
819
827
  p = self._projects
828
+ validate = True
829
+
820
830
  if self._is_listing_project(name, namespace_name) or self._is_default_project(
821
831
  name, namespace_name
822
832
  ):
823
833
  # we are always creating default and listing projects if they don't exist
824
834
  create = True
835
+ validate = False
825
836
 
826
837
  query = self._projects_select(
827
838
  *(getattr(n.c, f) for f in self._namespaces_fields),
@@ -834,7 +845,7 @@ class AbstractDBMetastore(AbstractMetastore):
834
845
  rows = list(self.db.execute(query, conn=conn))
835
846
  if not rows:
836
847
  if create:
837
- return self.create_project(namespace_name, name)
848
+ return self.create_project(namespace_name, name, validate=validate)
838
849
  raise ProjectNotFoundError(
839
850
  f"Project {name} in namespace {namespace_name} not found."
840
851
  )
@@ -468,8 +468,12 @@ class SQLiteMetastore(AbstractDBMetastore):
468
468
  be created implicitly though, to keep the same fully qualified name with
469
469
  Studio dataset.
470
470
  """
471
- system_namespace = self.create_namespace(Namespace.system(), "System namespace")
472
- self.create_project(system_namespace.name, Project.listing(), "Listing project")
471
+ system_namespace = self.create_namespace(
472
+ Namespace.system(), "System namespace", validate=False
473
+ )
474
+ self.create_project(
475
+ system_namespace.name, Project.listing(), "Listing project", validate=False
476
+ )
473
477
 
474
478
  def _check_schema_version(self) -> None:
475
479
  """
@@ -12,6 +12,9 @@ from typing import (
12
12
  )
13
13
  from urllib.parse import urlparse
14
14
 
15
+ from packaging.specifiers import SpecifierSet
16
+ from packaging.version import Version
17
+
15
18
  from datachain import semver
16
19
  from datachain.error import DatasetVersionNotFoundError, InvalidDatasetNameError
17
20
  from datachain.namespace import Namespace
@@ -81,8 +84,10 @@ def create_dataset_uri(
81
84
  def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
82
85
  """Parses dataset name and returns namespace, project and name"""
83
86
  if not name:
84
- raise ValueError("Name must be defined to parse it")
87
+ raise InvalidDatasetNameError("Name must be defined to parse it")
85
88
  split = name.split(".")
89
+ if len(split) > 3:
90
+ raise InvalidDatasetNameError(f"Invalid dataset name {name}")
86
91
  name = split[-1]
87
92
  project_name = split[-2] if len(split) > 1 else None
88
93
  namespace_name = split[-3] if len(split) > 2 else None
@@ -659,13 +664,39 @@ class DatasetRecord:
659
664
  return None
660
665
  return max(versions).version
661
666
 
662
- @property
663
- def prev_version(self) -> Optional[str]:
664
- """Returns previous version of a dataset"""
665
- if len(self.versions) == 1:
667
+ def latest_compatible_version(self, version_spec: str) -> Optional[str]:
668
+ """
669
+ Returns the latest version that matches the given version specifier.
670
+
671
+ Supports Python version specifiers like:
672
+ - ">=1.0.0,<2.0.0" (compatible release range)
673
+ - "~=1.4.2" (compatible release clause)
674
+ - "==1.2.*" (prefix matching)
675
+ - ">1.0.0" (exclusive ordered comparison)
676
+ - ">=1.0.0" (inclusive ordered comparison)
677
+ - "!=1.3.0" (version exclusion)
678
+
679
+ Args:
680
+ version_spec: Version specifier string following PEP 440
681
+
682
+ Returns:
683
+ Latest compatible version string, or None if no compatible version found
684
+ """
685
+ spec_set = SpecifierSet(version_spec)
686
+
687
+ # Convert dataset versions to packaging.Version objects
688
+ # and filter compatible ones
689
+ compatible_versions = []
690
+ for v in self.versions:
691
+ pkg_version = Version(v.version)
692
+ if spec_set.contains(pkg_version):
693
+ compatible_versions.append(v)
694
+
695
+ if not compatible_versions:
666
696
  return None
667
697
 
668
- return sorted(self.versions)[-2].version
698
+ # Return the latest compatible version
699
+ return max(compatible_versions).version
669
700
 
670
701
  @classmethod
671
702
  def from_dict(cls, d: dict[str, Any]) -> "DatasetRecord":
@@ -24,7 +24,7 @@ from pydantic import BaseModel
24
24
  from tqdm import tqdm
25
25
 
26
26
  from datachain import semver
27
- from datachain.dataset import DatasetRecord, parse_dataset_name
27
+ from datachain.dataset import DatasetRecord
28
28
  from datachain.delta import delta_disabled
29
29
  from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
30
30
  from datachain.func import literal
@@ -557,6 +557,7 @@ class DataChain:
557
557
  update_version: which part of the dataset version to automatically increase.
558
558
  Available values: `major`, `minor` or `patch`. Default is `patch`.
559
559
  """
560
+ catalog = self.session.catalog
560
561
  if version is not None:
561
562
  semver.validate(version)
562
563
 
@@ -570,17 +571,10 @@ class DataChain:
570
571
  " patch"
571
572
  )
572
573
 
573
- namespace_name, project_name, name = parse_dataset_name(name)
574
-
575
- namespace_name = (
576
- namespace_name
577
- or self._settings.namespace
578
- or self.session.catalog.metastore.default_namespace_name
579
- )
580
- project_name = (
581
- project_name
582
- or self._settings.project
583
- or self.session.catalog.metastore.default_project_name
574
+ namespace_name, project_name, name = catalog.get_full_dataset_name(
575
+ name,
576
+ namespace_name=self._settings.namespace,
577
+ project_name=self._settings.project,
584
578
  )
585
579
 
586
580
  try:
@@ -1,16 +1,12 @@
1
1
  from collections.abc import Sequence
2
2
  from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
3
3
 
4
- from datachain.dataset import parse_dataset_name
5
4
  from datachain.error import (
6
5
  DatasetNotFoundError,
7
6
  DatasetVersionNotFoundError,
8
7
  ProjectNotFoundError,
9
8
  )
10
9
  from datachain.lib.dataset_info import DatasetInfo
11
- from datachain.lib.file import (
12
- File,
13
- )
14
10
  from datachain.lib.projects import get as get_project
15
11
  from datachain.lib.settings import Settings
16
12
  from datachain.lib.signal_schema import SignalSchema
@@ -35,7 +31,6 @@ def read_dataset(
35
31
  version: Optional[Union[str, int]] = None,
36
32
  session: Optional[Session] = None,
37
33
  settings: Optional[dict] = None,
38
- fallback_to_studio: bool = True,
39
34
  delta: Optional[bool] = False,
40
35
  delta_on: Optional[Union[str, Sequence[str]]] = (
41
36
  "file.path",
@@ -45,6 +40,7 @@ def read_dataset(
45
40
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
46
41
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
47
42
  delta_retry: Optional[Union[bool, str]] = None,
43
+ update: bool = False,
48
44
  ) -> "DataChain":
49
45
  """Get data from a saved Dataset. It returns the chain itself.
50
46
  If dataset or version is not found locally, it will try to pull it from Studio.
@@ -56,11 +52,12 @@ def read_dataset(
56
52
  set; otherwise, default values will be applied.
57
53
  namespace : optional name of namespace in which dataset to read is created
58
54
  project : optional name of project in which dataset to read is created
59
- version : dataset version
55
+ version : dataset version. Supports:
56
+ - Exact version strings: "1.2.3"
57
+ - Legacy integer versions: 1, 2, 3 (finds the latest version within that major version)
58
+ - Version specifiers (PEP 440): ">=1.0.0,<2.0.0", "~=1.4.2", "==1.2.*", etc.
60
59
  session : Session to use for the chain.
61
60
  settings : Settings to use for the chain.
62
- fallback_to_studio : Try to pull dataset from Studio if not found locally.
63
- Default is True.
64
61
  delta: If True, only process new or changed files instead of reprocessing
65
62
  everything. This saves time by skipping files that were already processed in
66
63
  previous versions. The optimization is working when a new version of the
@@ -80,6 +77,10 @@ def read_dataset(
80
77
  (error mode)
81
78
  - True: Reprocess records missing from the result dataset (missing mode)
82
79
  - None: No retry processing (default)
80
+ update: If True, always checks for newer versions available on Studio, even if
81
+ some version of the dataset exists locally already. If False (default), it
82
+ will only fetch the dataset from Studio if it is not found locally.
83
+
83
84
 
84
85
  Example:
85
86
  ```py
@@ -93,11 +94,22 @@ def read_dataset(
93
94
  ```
94
95
 
95
96
  ```py
96
- chain = dc.read_dataset("my_cats", fallback_to_studio=False)
97
+ chain = dc.read_dataset("my_cats", version="1.0.0")
97
98
  ```
98
99
 
99
100
  ```py
100
- chain = dc.read_dataset("my_cats", version="1.0.0")
101
+ # Using version specifiers (PEP 440)
102
+ chain = dc.read_dataset("my_cats", version=">=1.0.0,<2.0.0")
103
+ ```
104
+
105
+ ```py
106
+ # Legacy integer version support (finds latest in major version)
107
+ chain = dc.read_dataset("my_cats", version=1) # Latest 1.x.x version
108
+ ```
109
+
110
+ ```py
111
+ # Always check for newer versions matching a version specifier from Studio
112
+ chain = dc.read_dataset("my_cats", version=">=1.0.0", update=True)
101
113
  ```
102
114
 
103
115
  ```py
@@ -114,7 +126,6 @@ def read_dataset(
114
126
  version="1.0.0",
115
127
  session=session,
116
128
  settings=settings,
117
- fallback_to_studio=True,
118
129
  )
119
130
  ```
120
131
  """
@@ -122,41 +133,49 @@ def read_dataset(
122
133
 
123
134
  from .datachain import DataChain
124
135
 
136
+ telemetry.send_event_once("class", "datachain_init", name=name, version=version)
137
+
125
138
  session = Session.get(session)
126
139
  catalog = session.catalog
127
140
 
128
- namespace_name, project_name, name = parse_dataset_name(name)
129
- namespace_name = (
130
- namespace_name or namespace or catalog.metastore.default_namespace_name
141
+ namespace_name, project_name, name = catalog.get_full_dataset_name(
142
+ name,
143
+ project_name=project,
144
+ namespace_name=namespace,
131
145
  )
132
- project_name = project_name or project or catalog.metastore.default_project_name
133
146
 
134
147
  if version is not None:
148
+ dataset = session.catalog.get_dataset_with_remote_fallback(
149
+ name, namespace_name, project_name, update=update
150
+ )
151
+
152
+ # Convert legacy integer versions to version specifiers
153
+ # For backward compatibility we still allow users to put version as integer
154
+ # in which case we convert it to a version specifier that finds the latest
155
+ # version where major part is equal to that input version.
156
+ # For example if user sets version=2, we convert it to ">=2.0.0,<3.0.0"
157
+ # which will find something like 2.4.3 (assuming 2.4.3 is the biggest among
158
+ # all 2.* dataset versions)
159
+ if isinstance(version, int):
160
+ version_spec = f">={version}.0.0,<{version + 1}.0.0"
161
+ else:
162
+ version_spec = str(version)
163
+
164
+ from packaging.specifiers import InvalidSpecifier, SpecifierSet
165
+
135
166
  try:
136
- # for backward compatibility we still allow users to put version as integer
137
- # in which case we are trying to find latest version where major part is
138
- # equal to that input version. For example if user sets version=2, we could
139
- # continue with something like 2.4.3 (assuming 2.4.3 is the biggest among
140
- # all 2.* dataset versions). If dataset doesn't have any versions where
141
- # major part is equal to that input, exception is thrown.
142
- major = int(version)
143
- try:
144
- ds_project = get_project(project_name, namespace_name, session=session)
145
- except ProjectNotFoundError:
146
- raise DatasetNotFoundError(
147
- f"Dataset {name} not found in namespace {namespace_name} and",
148
- f" project {project_name}",
149
- ) from None
150
-
151
- dataset = session.catalog.get_dataset(name, ds_project)
152
- latest_major = dataset.latest_major_version(major)
153
- if not latest_major:
167
+ # Try to parse as version specifier
168
+ SpecifierSet(version_spec)
169
+ # If it's a valid specifier set, find the latest compatible version
170
+ latest_compatible = dataset.latest_compatible_version(version_spec)
171
+ if not latest_compatible:
154
172
  raise DatasetVersionNotFoundError(
155
- f"Dataset {name} does not have version {version}"
173
+ f"No dataset {name} version matching specifier {version_spec}"
156
174
  )
157
- version = latest_major
158
- except ValueError:
159
- # version is in new semver string format, continuing as normal
175
+ version = latest_compatible
176
+ except InvalidSpecifier:
177
+ # If not a valid specifier, treat as exact version string
178
+ # This handles cases like "1.2.3" which are exact versions, not specifiers
160
179
  pass
161
180
 
162
181
  if settings:
@@ -170,11 +189,8 @@ def read_dataset(
170
189
  namespace_name=namespace_name,
171
190
  version=version, # type: ignore[arg-type]
172
191
  session=session,
173
- indexing_column_types=File._datachain_column_types,
174
- fallback_to_studio=fallback_to_studio,
175
192
  )
176
193
 
177
- telemetry.send_event_once("class", "datachain_init", name=name, version=version)
178
194
  signals_schema = SignalSchema({"sys": Sys})
179
195
  if query.feature_schema:
180
196
  signals_schema |= SignalSchema.deserialize(query.feature_schema)
@@ -320,11 +336,11 @@ def delete_dataset(
320
336
  session = Session.get(session, in_memory=in_memory)
321
337
  catalog = session.catalog
322
338
 
323
- namespace_name, project_name, name = parse_dataset_name(name)
324
- namespace_name = (
325
- namespace_name or namespace or catalog.metastore.default_namespace_name
339
+ namespace_name, project_name, name = catalog.get_full_dataset_name(
340
+ name,
341
+ project_name=project,
342
+ namespace_name=namespace,
326
343
  )
327
- project_name = project_name or project or catalog.metastore.default_project_name
328
344
 
329
345
  if not catalog.metastore.is_local_dataset(namespace_name) and studio:
330
346
  return remove_studio_dataset(
@@ -127,12 +127,8 @@ def read_listing_dataset(
127
127
  if version is None:
128
128
  version = dataset.latest_version
129
129
 
130
- query = DatasetQuery(
131
- name=name,
132
- session=session,
133
- indexing_column_types=File._datachain_column_types,
134
- fallback_to_studio=False,
135
- )
130
+ query = DatasetQuery(name=name, session=session)
131
+
136
132
  if settings:
137
133
  cfg = {**settings}
138
134
  if "prefetch" not in cfg:
@@ -97,4 +97,4 @@ def read_records(
97
97
  for chunk in batched(records, INSERT_BATCH_SIZE):
98
98
  warehouse.insert_rows(table, chunk)
99
99
  warehouse.insert_rows_done(table)
100
- return read_dataset(name=dsr.name, session=session, settings=settings)
100
+ return read_dataset(name=dsr.full_name, session=session, settings=settings)
@@ -54,7 +54,7 @@ def get(name: str, namespace: str, session: Optional[Session]) -> Project:
54
54
  ```py
55
55
  import datachain as dc
56
56
  from datachain.lib.projects import get as get_project
57
- project = get_project("my-project", "local")
57
+ project = get_project("my-project", "local")
58
58
  ```
59
59
  """
60
60
  return Session.get(session).catalog.metastore.get_project(name, namespace)
@@ -25,6 +25,7 @@ from pydantic import BaseModel, Field, create_model
25
25
  from sqlalchemy import ColumnElement
26
26
  from typing_extensions import Literal as LiteralEx
27
27
 
28
+ from datachain.func import literal
28
29
  from datachain.func.func import Func
29
30
  from datachain.lib.convert.python_to_sql import python_to_sql
30
31
  from datachain.lib.convert.sql_to_python import sql_to_python
@@ -659,6 +660,7 @@ class SignalSchema:
659
660
 
660
661
  def mutate(self, args_map: dict) -> "SignalSchema":
661
662
  new_values = self.values.copy()
663
+ primitives = (bool, str, int, float)
662
664
 
663
665
  for name, value in args_map.items():
664
666
  if isinstance(value, Column) and value.name in self.values:
@@ -679,6 +681,12 @@ class SignalSchema:
679
681
  # adding new signal with function
680
682
  new_values[name] = value.get_result_type(self)
681
683
  continue
684
+ if isinstance(value, primitives):
685
+ # For primitives, store the type, not the value
686
+ val = literal(value)
687
+ val.type = python_to_sql(type(value))()
688
+ new_values[name] = sql_to_python(val)
689
+ continue
682
690
  if isinstance(value, ColumnElement):
683
691
  # adding new signal
684
692
  new_values[name] = sql_to_python(value)