datachain 0.24.6__tar.gz → 0.25.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (399) hide show
  1. {datachain-0.24.6 → datachain-0.25.1}/.github/workflows/benchmarks.yml +0 -3
  2. {datachain-0.24.6 → datachain-0.25.1}/.github/workflows/tests-studio.yml +15 -2
  3. {datachain-0.24.6 → datachain-0.25.1}/.github/workflows/tests.yml +16 -12
  4. {datachain-0.24.6 → datachain-0.25.1}/PKG-INFO +3 -2
  5. {datachain-0.24.6 → datachain-0.25.1}/pyproject.toml +3 -1
  6. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/__init__.py +2 -0
  7. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/catalog/catalog.py +3 -20
  8. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/data_storage/metastore.py +30 -1
  9. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/data_storage/warehouse.py +16 -17
  10. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/arrow.py +9 -0
  11. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/__init__.py +2 -1
  12. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/datasets.py +55 -0
  13. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/hf.py +18 -21
  14. {datachain-0.24.6 → datachain-0.25.1}/src/datachain.egg-info/PKG-INFO +3 -2
  15. {datachain-0.24.6 → datachain-0.25.1}/src/datachain.egg-info/requires.txt +4 -1
  16. {datachain-0.24.6 → datachain-0.25.1}/tests/conftest.py +6 -0
  17. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_datasets.py +101 -14
  18. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_hf.py +16 -1
  19. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_datachain.py +23 -1
  20. {datachain-0.24.6 → datachain-0.25.1}/tests/utils.py +8 -0
  21. {datachain-0.24.6 → datachain-0.25.1}/.cruft.json +0 -0
  22. {datachain-0.24.6 → datachain-0.25.1}/.gitattributes +0 -0
  23. {datachain-0.24.6 → datachain-0.25.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  24. {datachain-0.24.6 → datachain-0.25.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  25. {datachain-0.24.6 → datachain-0.25.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  26. {datachain-0.24.6 → datachain-0.25.1}/.github/codecov.yaml +0 -0
  27. {datachain-0.24.6 → datachain-0.25.1}/.github/dependabot.yml +0 -0
  28. {datachain-0.24.6 → datachain-0.25.1}/.github/workflows/release.yml +0 -0
  29. {datachain-0.24.6 → datachain-0.25.1}/.github/workflows/update-template.yaml +0 -0
  30. {datachain-0.24.6 → datachain-0.25.1}/.gitignore +0 -0
  31. {datachain-0.24.6 → datachain-0.25.1}/.pre-commit-config.yaml +0 -0
  32. {datachain-0.24.6 → datachain-0.25.1}/CODE_OF_CONDUCT.rst +0 -0
  33. {datachain-0.24.6 → datachain-0.25.1}/LICENSE +0 -0
  34. {datachain-0.24.6 → datachain-0.25.1}/README.rst +0 -0
  35. {datachain-0.24.6 → datachain-0.25.1}/docs/assets/captioned_cartoons.png +0 -0
  36. {datachain-0.24.6 → datachain-0.25.1}/docs/assets/datachain-white.svg +0 -0
  37. {datachain-0.24.6 → datachain-0.25.1}/docs/assets/datachain.svg +0 -0
  38. {datachain-0.24.6 → datachain-0.25.1}/docs/commands/auth/login.md +0 -0
  39. {datachain-0.24.6 → datachain-0.25.1}/docs/commands/auth/logout.md +0 -0
  40. {datachain-0.24.6 → datachain-0.25.1}/docs/commands/auth/team.md +0 -0
  41. {datachain-0.24.6 → datachain-0.25.1}/docs/commands/auth/token.md +0 -0
  42. {datachain-0.24.6 → datachain-0.25.1}/docs/commands/index.md +0 -0
  43. {datachain-0.24.6 → datachain-0.25.1}/docs/commands/job/cancel.md +0 -0
  44. {datachain-0.24.6 → datachain-0.25.1}/docs/commands/job/clusters.md +0 -0
  45. {datachain-0.24.6 → datachain-0.25.1}/docs/commands/job/logs.md +0 -0
  46. {datachain-0.24.6 → datachain-0.25.1}/docs/commands/job/ls.md +0 -0
  47. {datachain-0.24.6 → datachain-0.25.1}/docs/commands/job/run.md +0 -0
  48. {datachain-0.24.6 → datachain-0.25.1}/docs/contributing.md +0 -0
  49. {datachain-0.24.6 → datachain-0.25.1}/docs/css/github-permalink-style.css +0 -0
  50. {datachain-0.24.6 → datachain-0.25.1}/docs/examples.md +0 -0
  51. {datachain-0.24.6 → datachain-0.25.1}/docs/guide/db_migrations.md +0 -0
  52. {datachain-0.24.6 → datachain-0.25.1}/docs/guide/delta.md +0 -0
  53. {datachain-0.24.6 → datachain-0.25.1}/docs/guide/env.md +0 -0
  54. {datachain-0.24.6 → datachain-0.25.1}/docs/guide/index.md +0 -0
  55. {datachain-0.24.6 → datachain-0.25.1}/docs/guide/namespaces.md +0 -0
  56. {datachain-0.24.6 → datachain-0.25.1}/docs/guide/processing.md +0 -0
  57. {datachain-0.24.6 → datachain-0.25.1}/docs/guide/remotes.md +0 -0
  58. {datachain-0.24.6 → datachain-0.25.1}/docs/guide/retry.md +0 -0
  59. {datachain-0.24.6 → datachain-0.25.1}/docs/index.md +0 -0
  60. {datachain-0.24.6 → datachain-0.25.1}/docs/overrides/main.html +0 -0
  61. {datachain-0.24.6 → datachain-0.25.1}/docs/quick-start.md +0 -0
  62. {datachain-0.24.6 → datachain-0.25.1}/docs/references/data-types/arrowrow.md +0 -0
  63. {datachain-0.24.6 → datachain-0.25.1}/docs/references/data-types/bbox.md +0 -0
  64. {datachain-0.24.6 → datachain-0.25.1}/docs/references/data-types/file.md +0 -0
  65. {datachain-0.24.6 → datachain-0.25.1}/docs/references/data-types/imagefile.md +0 -0
  66. {datachain-0.24.6 → datachain-0.25.1}/docs/references/data-types/index.md +0 -0
  67. {datachain-0.24.6 → datachain-0.25.1}/docs/references/data-types/pose.md +0 -0
  68. {datachain-0.24.6 → datachain-0.25.1}/docs/references/data-types/segment.md +0 -0
  69. {datachain-0.24.6 → datachain-0.25.1}/docs/references/data-types/tarvfile.md +0 -0
  70. {datachain-0.24.6 → datachain-0.25.1}/docs/references/data-types/textfile.md +0 -0
  71. {datachain-0.24.6 → datachain-0.25.1}/docs/references/data-types/videofile.md +0 -0
  72. {datachain-0.24.6 → datachain-0.25.1}/docs/references/datachain.md +0 -0
  73. {datachain-0.24.6 → datachain-0.25.1}/docs/references/func.md +0 -0
  74. {datachain-0.24.6 → datachain-0.25.1}/docs/references/index.md +0 -0
  75. {datachain-0.24.6 → datachain-0.25.1}/docs/references/toolkit.md +0 -0
  76. {datachain-0.24.6 → datachain-0.25.1}/docs/references/torch.md +0 -0
  77. {datachain-0.24.6 → datachain-0.25.1}/docs/references/udf.md +0 -0
  78. {datachain-0.24.6 → datachain-0.25.1}/docs/tutorials.md +0 -0
  79. {datachain-0.24.6 → datachain-0.25.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  80. {datachain-0.24.6 → datachain-0.25.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  81. {datachain-0.24.6 → datachain-0.25.1}/examples/computer_vision/openimage-detect.py +0 -0
  82. {datachain-0.24.6 → datachain-0.25.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
  83. {datachain-0.24.6 → datachain-0.25.1}/examples/computer_vision/ultralytics-pose.py +0 -0
  84. {datachain-0.24.6 → datachain-0.25.1}/examples/computer_vision/ultralytics-segment.py +0 -0
  85. {datachain-0.24.6 → datachain-0.25.1}/examples/get_started/common_sql_functions.py +0 -0
  86. {datachain-0.24.6 → datachain-0.25.1}/examples/get_started/json-csv-reader.py +0 -0
  87. {datachain-0.24.6 → datachain-0.25.1}/examples/get_started/torch-loader.py +0 -0
  88. {datachain-0.24.6 → datachain-0.25.1}/examples/get_started/udfs/parallel.py +0 -0
  89. {datachain-0.24.6 → datachain-0.25.1}/examples/get_started/udfs/simple.py +0 -0
  90. {datachain-0.24.6 → datachain-0.25.1}/examples/get_started/udfs/stateful.py +0 -0
  91. {datachain-0.24.6 → datachain-0.25.1}/examples/incremental_processing/delta.py +0 -0
  92. {datachain-0.24.6 → datachain-0.25.1}/examples/incremental_processing/retry.py +0 -0
  93. {datachain-0.24.6 → datachain-0.25.1}/examples/incremental_processing/utils.py +0 -0
  94. {datachain-0.24.6 → datachain-0.25.1}/examples/llm_and_nlp/claude-query.py +0 -0
  95. {datachain-0.24.6 → datachain-0.25.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  96. {datachain-0.24.6 → datachain-0.25.1}/examples/multimodal/clip_inference.py +0 -0
  97. {datachain-0.24.6 → datachain-0.25.1}/examples/multimodal/hf_pipeline.py +0 -0
  98. {datachain-0.24.6 → datachain-0.25.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  99. {datachain-0.24.6 → datachain-0.25.1}/examples/multimodal/wds.py +0 -0
  100. {datachain-0.24.6 → datachain-0.25.1}/examples/multimodal/wds_filtered.py +0 -0
  101. {datachain-0.24.6 → datachain-0.25.1}/mkdocs.yml +0 -0
  102. {datachain-0.24.6 → datachain-0.25.1}/noxfile.py +0 -0
  103. {datachain-0.24.6 → datachain-0.25.1}/setup.cfg +0 -0
  104. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/__main__.py +0 -0
  105. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/asyn.py +0 -0
  106. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cache.py +0 -0
  107. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/catalog/__init__.py +0 -0
  108. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/catalog/datasource.py +0 -0
  109. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/catalog/loader.py +0 -0
  110. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/__init__.py +0 -0
  111. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/commands/__init__.py +0 -0
  112. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/commands/datasets.py +0 -0
  113. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/commands/du.py +0 -0
  114. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/commands/index.py +0 -0
  115. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/commands/ls.py +0 -0
  116. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/commands/misc.py +0 -0
  117. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/commands/query.py +0 -0
  118. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/commands/show.py +0 -0
  119. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/parser/__init__.py +0 -0
  120. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/parser/job.py +0 -0
  121. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/parser/studio.py +0 -0
  122. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/parser/utils.py +0 -0
  123. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/cli/utils.py +0 -0
  124. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/client/__init__.py +0 -0
  125. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/client/azure.py +0 -0
  126. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/client/fileslice.py +0 -0
  127. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/client/fsspec.py +0 -0
  128. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/client/gcs.py +0 -0
  129. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/client/hf.py +0 -0
  130. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/client/local.py +0 -0
  131. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/client/s3.py +0 -0
  132. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/config.py +0 -0
  133. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/data_storage/__init__.py +0 -0
  134. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/data_storage/db_engine.py +0 -0
  135. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/data_storage/job.py +0 -0
  136. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/data_storage/schema.py +0 -0
  137. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/data_storage/serializer.py +0 -0
  138. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/data_storage/sqlite.py +0 -0
  139. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/dataset.py +0 -0
  140. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/delta.py +0 -0
  141. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/diff/__init__.py +0 -0
  142. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/error.py +0 -0
  143. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/fs/__init__.py +0 -0
  144. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/fs/reference.py +0 -0
  145. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/fs/utils.py +0 -0
  146. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/func/__init__.py +0 -0
  147. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/func/aggregate.py +0 -0
  148. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/func/array.py +0 -0
  149. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/func/base.py +0 -0
  150. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/func/conditional.py +0 -0
  151. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/func/func.py +0 -0
  152. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/func/numeric.py +0 -0
  153. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/func/path.py +0 -0
  154. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/func/random.py +0 -0
  155. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/func/string.py +0 -0
  156. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/func/window.py +0 -0
  157. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/job.py +0 -0
  158. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/__init__.py +0 -0
  159. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/clip.py +0 -0
  160. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/convert/__init__.py +0 -0
  161. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/convert/flatten.py +0 -0
  162. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  163. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
  164. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/convert/unflatten.py +0 -0
  165. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  166. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/data_model.py +0 -0
  167. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dataset_info.py +0 -0
  168. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/csv.py +0 -0
  169. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/database.py +0 -0
  170. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/datachain.py +0 -0
  171. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/hf.py +0 -0
  172. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/json.py +0 -0
  173. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/listings.py +0 -0
  174. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/pandas.py +0 -0
  175. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/parquet.py +0 -0
  176. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/records.py +0 -0
  177. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/storage.py +0 -0
  178. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/utils.py +0 -0
  179. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/dc/values.py +0 -0
  180. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/file.py +0 -0
  181. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/image.py +0 -0
  182. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/listing.py +0 -0
  183. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/listing_info.py +0 -0
  184. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/meta_formats.py +0 -0
  185. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/model_store.py +0 -0
  186. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/namespaces.py +0 -0
  187. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/projects.py +0 -0
  188. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/pytorch.py +0 -0
  189. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/settings.py +0 -0
  190. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/signal_schema.py +0 -0
  191. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/tar.py +0 -0
  192. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/text.py +0 -0
  193. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/udf.py +0 -0
  194. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/udf_signature.py +0 -0
  195. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/utils.py +0 -0
  196. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/video.py +0 -0
  197. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/webdataset.py +0 -0
  198. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/lib/webdataset_laion.py +0 -0
  199. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/listing.py +0 -0
  200. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/model/__init__.py +0 -0
  201. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/model/bbox.py +0 -0
  202. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/model/pose.py +0 -0
  203. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/model/segment.py +0 -0
  204. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/model/ultralytics/__init__.py +0 -0
  205. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/model/ultralytics/bbox.py +0 -0
  206. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/model/ultralytics/pose.py +0 -0
  207. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/model/ultralytics/segment.py +0 -0
  208. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/model/utils.py +0 -0
  209. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/namespace.py +0 -0
  210. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/node.py +0 -0
  211. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/nodes_fetcher.py +0 -0
  212. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/nodes_thread_pool.py +0 -0
  213. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/progress.py +0 -0
  214. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/project.py +0 -0
  215. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/py.typed +0 -0
  216. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/query/__init__.py +0 -0
  217. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/query/batch.py +0 -0
  218. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/query/dataset.py +0 -0
  219. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/query/dispatch.py +0 -0
  220. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/query/metrics.py +0 -0
  221. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/query/params.py +0 -0
  222. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/query/queue.py +0 -0
  223. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/query/schema.py +0 -0
  224. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/query/session.py +0 -0
  225. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/query/udf.py +0 -0
  226. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/query/utils.py +0 -0
  227. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/remote/__init__.py +0 -0
  228. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/remote/studio.py +0 -0
  229. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/script_meta.py +0 -0
  230. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/semver.py +0 -0
  231. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/__init__.py +0 -0
  232. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/default/__init__.py +0 -0
  233. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/default/base.py +0 -0
  234. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/functions/__init__.py +0 -0
  235. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/functions/aggregate.py +0 -0
  236. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/functions/array.py +0 -0
  237. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/functions/conditional.py +0 -0
  238. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/functions/numeric.py +0 -0
  239. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/functions/path.py +0 -0
  240. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/functions/random.py +0 -0
  241. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/functions/string.py +0 -0
  242. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/selectable.py +0 -0
  243. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  244. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/sqlite/base.py +0 -0
  245. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/sqlite/types.py +0 -0
  246. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/sqlite/vector.py +0 -0
  247. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/types.py +0 -0
  248. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/sql/utils.py +0 -0
  249. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/studio.py +0 -0
  250. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/telemetry.py +0 -0
  251. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/toolkit/__init__.py +0 -0
  252. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/toolkit/split.py +0 -0
  253. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/torch/__init__.py +0 -0
  254. {datachain-0.24.6 → datachain-0.25.1}/src/datachain/utils.py +0 -0
  255. {datachain-0.24.6 → datachain-0.25.1}/src/datachain.egg-info/SOURCES.txt +0 -0
  256. {datachain-0.24.6 → datachain-0.25.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  257. {datachain-0.24.6 → datachain-0.25.1}/src/datachain.egg-info/entry_points.txt +0 -0
  258. {datachain-0.24.6 → datachain-0.25.1}/src/datachain.egg-info/top_level.txt +0 -0
  259. {datachain-0.24.6 → datachain-0.25.1}/tests/__init__.py +0 -0
  260. {datachain-0.24.6 → datachain-0.25.1}/tests/benchmarks/__init__.py +0 -0
  261. {datachain-0.24.6 → datachain-0.25.1}/tests/benchmarks/conftest.py +0 -0
  262. {datachain-0.24.6 → datachain-0.25.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  263. {datachain-0.24.6 → datachain-0.25.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  264. {datachain-0.24.6 → datachain-0.25.1}/tests/benchmarks/datasets/.gitignore +0 -0
  265. {datachain-0.24.6 → datachain-0.25.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  266. {datachain-0.24.6 → datachain-0.25.1}/tests/benchmarks/test_datachain.py +0 -0
  267. {datachain-0.24.6 → datachain-0.25.1}/tests/benchmarks/test_ls.py +0 -0
  268. {datachain-0.24.6 → datachain-0.25.1}/tests/benchmarks/test_version.py +0 -0
  269. {datachain-0.24.6 → datachain-0.25.1}/tests/data.py +0 -0
  270. {datachain-0.24.6 → datachain-0.25.1}/tests/examples/__init__.py +0 -0
  271. {datachain-0.24.6 → datachain-0.25.1}/tests/examples/test_examples.py +0 -0
  272. {datachain-0.24.6 → datachain-0.25.1}/tests/examples/test_wds_e2e.py +0 -0
  273. {datachain-0.24.6 → datachain-0.25.1}/tests/examples/wds_data.py +0 -0
  274. {datachain-0.24.6 → datachain-0.25.1}/tests/func/__init__.py +0 -0
  275. {datachain-0.24.6 → datachain-0.25.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  276. {datachain-0.24.6 → datachain-0.25.1}/tests/func/data/lena.jpg +0 -0
  277. {datachain-0.24.6 → datachain-0.25.1}/tests/func/fake-service-account-credentials.json +0 -0
  278. {datachain-0.24.6 → datachain-0.25.1}/tests/func/functions/__init__.py +0 -0
  279. {datachain-0.24.6 → datachain-0.25.1}/tests/func/functions/test_aggregate.py +0 -0
  280. {datachain-0.24.6 → datachain-0.25.1}/tests/func/functions/test_array.py +0 -0
  281. {datachain-0.24.6 → datachain-0.25.1}/tests/func/functions/test_conditional.py +0 -0
  282. {datachain-0.24.6 → datachain-0.25.1}/tests/func/functions/test_numeric.py +0 -0
  283. {datachain-0.24.6 → datachain-0.25.1}/tests/func/functions/test_path.py +0 -0
  284. {datachain-0.24.6 → datachain-0.25.1}/tests/func/functions/test_random.py +0 -0
  285. {datachain-0.24.6 → datachain-0.25.1}/tests/func/functions/test_string.py +0 -0
  286. {datachain-0.24.6 → datachain-0.25.1}/tests/func/model/__init__.py +0 -0
  287. {datachain-0.24.6 → datachain-0.25.1}/tests/func/model/data/running-mask0.png +0 -0
  288. {datachain-0.24.6 → datachain-0.25.1}/tests/func/model/data/running-mask1.png +0 -0
  289. {datachain-0.24.6 → datachain-0.25.1}/tests/func/model/data/running.jpg +0 -0
  290. {datachain-0.24.6 → datachain-0.25.1}/tests/func/model/data/ships.jpg +0 -0
  291. {datachain-0.24.6 → datachain-0.25.1}/tests/func/model/test_yolo.py +0 -0
  292. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_batching.py +0 -0
  293. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_catalog.py +0 -0
  294. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_client.py +0 -0
  295. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_cloud_transfer.py +0 -0
  296. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_data_storage.py +0 -0
  297. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_datachain.py +0 -0
  298. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_datachain_merge.py +0 -0
  299. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_dataset_query.py +0 -0
  300. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_delta.py +0 -0
  301. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_feature_pickling.py +0 -0
  302. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_file.py +0 -0
  303. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_hidden_field.py +0 -0
  304. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_image.py +0 -0
  305. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_listing.py +0 -0
  306. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_ls.py +0 -0
  307. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_meta_formats.py +0 -0
  308. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_metastore.py +0 -0
  309. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_metrics.py +0 -0
  310. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_pull.py +0 -0
  311. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_pytorch.py +0 -0
  312. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_query.py +0 -0
  313. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_read_database.py +0 -0
  314. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_read_dataset_remote.py +0 -0
  315. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  316. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_retry.py +0 -0
  317. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_session.py +0 -0
  318. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_toolkit.py +0 -0
  319. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_video.py +0 -0
  320. {datachain-0.24.6 → datachain-0.25.1}/tests/func/test_warehouse.py +0 -0
  321. {datachain-0.24.6 → datachain-0.25.1}/tests/scripts/feature_class.py +0 -0
  322. {datachain-0.24.6 → datachain-0.25.1}/tests/scripts/feature_class_exception.py +0 -0
  323. {datachain-0.24.6 → datachain-0.25.1}/tests/scripts/feature_class_parallel.py +0 -0
  324. {datachain-0.24.6 → datachain-0.25.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  325. {datachain-0.24.6 → datachain-0.25.1}/tests/scripts/name_len_slow.py +0 -0
  326. {datachain-0.24.6 → datachain-0.25.1}/tests/test_atomicity.py +0 -0
  327. {datachain-0.24.6 → datachain-0.25.1}/tests/test_cli_e2e.py +0 -0
  328. {datachain-0.24.6 → datachain-0.25.1}/tests/test_cli_studio.py +0 -0
  329. {datachain-0.24.6 → datachain-0.25.1}/tests/test_import_time.py +0 -0
  330. {datachain-0.24.6 → datachain-0.25.1}/tests/test_query_e2e.py +0 -0
  331. {datachain-0.24.6 → datachain-0.25.1}/tests/test_telemetry.py +0 -0
  332. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/__init__.py +0 -0
  333. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/__init__.py +0 -0
  334. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/conftest.py +0 -0
  335. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_arrow.py +0 -0
  336. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_clip.py +0 -0
  337. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  338. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  339. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_diff.py +0 -0
  340. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_feature.py +0 -0
  341. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_feature_utils.py +0 -0
  342. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_file.py +0 -0
  343. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_hf.py +0 -0
  344. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_image.py +0 -0
  345. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_listing_info.py +0 -0
  346. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_namespace.py +0 -0
  347. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_project.py +0 -0
  348. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_python_to_sql.py +0 -0
  349. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_schema.py +0 -0
  350. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_signal_schema.py +0 -0
  351. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_sql_to_python.py +0 -0
  352. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_text.py +0 -0
  353. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_udf.py +0 -0
  354. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_udf_signature.py +0 -0
  355. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_utils.py +0 -0
  356. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/lib/test_webdataset.py +0 -0
  357. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/model/__init__.py +0 -0
  358. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/model/test_bbox.py +0 -0
  359. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/model/test_pose.py +0 -0
  360. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/model/test_segment.py +0 -0
  361. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/model/test_utils.py +0 -0
  362. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/sql/__init__.py +0 -0
  363. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  364. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/sql/sqlite/test_types.py +0 -0
  365. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  366. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/sql/test_array.py +0 -0
  367. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/sql/test_conditional.py +0 -0
  368. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/sql/test_path.py +0 -0
  369. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/sql/test_random.py +0 -0
  370. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/sql/test_selectable.py +0 -0
  371. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/sql/test_string.py +0 -0
  372. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_asyn.py +0 -0
  373. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_cache.py +0 -0
  374. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_catalog.py +0 -0
  375. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_catalog_loader.py +0 -0
  376. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_cli_parsing.py +0 -0
  377. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_client.py +0 -0
  378. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_client_gcs.py +0 -0
  379. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_client_s3.py +0 -0
  380. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_config.py +0 -0
  381. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_data_storage.py +0 -0
  382. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_database_engine.py +0 -0
  383. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_dataset.py +0 -0
  384. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_dispatch.py +0 -0
  385. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_fileslice.py +0 -0
  386. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_func.py +0 -0
  387. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_listing.py +0 -0
  388. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_metastore.py +0 -0
  389. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_module_exports.py +0 -0
  390. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_pytorch.py +0 -0
  391. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_query.py +0 -0
  392. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_query_metrics.py +0 -0
  393. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_query_params.py +0 -0
  394. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_script_meta.py +0 -0
  395. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_semver.py +0 -0
  396. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_serializer.py +0 -0
  397. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_session.py +0 -0
  398. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_utils.py +0 -0
  399. {datachain-0.24.6 → datachain-0.25.1}/tests/unit/test_warehouse.py +0 -0
@@ -30,9 +30,6 @@ jobs:
30
30
  enable-cache: true
31
31
  cache-suffix: benchmarks
32
32
  cache-dependency-glob: pyproject.toml
33
- # revert after this is fixed
34
- # https://github.com/wntrblm/nox/issues/953
35
- version: ">=0.6,<0.7"
36
33
 
37
34
  - name: Install nox and dvc
38
35
  run: uv pip install dvc[gs] nox --system
@@ -75,8 +75,21 @@ jobs:
75
75
  path: './backend/datachain'
76
76
  fetch-depth: 0
77
77
 
78
- - name: Set up FFmpeg
79
- uses: AnimMouse/setup-ffmpeg@v1
78
+ - name: Install FFmpeg on Windows
79
+ if: runner.os == 'Windows'
80
+ run: choco install ffmpeg
81
+
82
+ - name: Install FFmpeg on macOS
83
+ if: runner.os == 'macOS'
84
+ run: |
85
+ brew install ffmpeg
86
+ echo 'DYLD_FALLBACK_LIBRARY_PATH=/opt/homebrew/lib' >> "$GITHUB_ENV"
87
+
88
+ - name: Install FFmpeg on Ubuntu
89
+ if: runner.os == 'Linux'
90
+ run: |
91
+ sudo apt update
92
+ sudo apt install -y ffmpeg
80
93
 
81
94
  - name: Set up Python ${{ matrix.pyv }}
82
95
  uses: actions/setup-python@v5
@@ -34,9 +34,6 @@ jobs:
34
34
  enable-cache: true
35
35
  cache-suffix: lint
36
36
  cache-dependency-glob: pyproject.toml
37
- # revert after this is fixed
38
- # https://github.com/wntrblm/nox/issues/953
39
- version: ">=0.6,<0.7"
40
37
 
41
38
  - name: Install nox
42
39
  run: uv pip install nox --system
@@ -81,9 +78,6 @@ jobs:
81
78
  fetch-depth: 0
82
79
  ref: ${{ github.event.pull_request.head.sha || github.ref }}
83
80
 
84
- - name: Set up FFmpeg
85
- uses: AnimMouse/setup-ffmpeg@v1
86
-
87
81
  - name: Set up Python ${{ matrix.pyv }}
88
82
  uses: actions/setup-python@v5
89
83
  with:
@@ -95,9 +89,22 @@ jobs:
95
89
  enable-cache: true
96
90
  cache-suffix: tests-${{ matrix.pyv }}
97
91
  cache-dependency-glob: pyproject.toml
98
- # revert after this is fixed
99
- # https://github.com/wntrblm/nox/issues/953
100
- version: ">=0.6,<0.7"
92
+
93
+ - name: Install FFmpeg on Windows
94
+ if: runner.os == 'Windows'
95
+ run: choco install ffmpeg
96
+
97
+ - name: Install FFmpeg on macOS
98
+ if: runner.os == 'macOS'
99
+ run: |
100
+ brew install ffmpeg
101
+ echo 'DYLD_FALLBACK_LIBRARY_PATH=/opt/homebrew/lib' >> "$GITHUB_ENV"
102
+
103
+ - name: Install FFmpeg on Ubuntu
104
+ if: runner.os == 'Linux'
105
+ run: |
106
+ sudo apt update
107
+ sudo apt install -y ffmpeg
101
108
 
102
109
  - name: Install nox
103
110
  run: uv pip install nox --system
@@ -165,9 +172,6 @@ jobs:
165
172
  enable-cache: true
166
173
  cache-suffix: examples-${{ matrix.pyv }}
167
174
  cache-dependency-glob: pyproject.toml
168
- # revert after this is fixed
169
- # https://github.com/wntrblm/nox/issues/953
170
- version: ">=0.6,<0.7"
171
175
 
172
176
  - name: Install nox
173
177
  run: uv pip install nox --system
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.24.6
3
+ Version: 0.25.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -70,7 +70,8 @@ Provides-Extra: vector
70
70
  Requires-Dist: usearch; extra == "vector"
71
71
  Provides-Extra: hf
72
72
  Requires-Dist: numba>=0.60.0; extra == "hf"
73
- Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
73
+ Requires-Dist: datasets[vision]>=4.0.0; extra == "hf"
74
+ Requires-Dist: datasets[audio]>=4.0.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
74
75
  Requires-Dist: fsspec>=2024.12.0; extra == "hf"
75
76
  Provides-Extra: video
76
77
  Requires-Dist: ffmpeg-python; extra == "video"
@@ -81,7 +81,9 @@ vector = [
81
81
  ]
82
82
  hf = [
83
83
  "numba>=0.60.0",
84
- "datasets[audio,vision]>=2.21.0",
84
+ "datasets[vision]>=4.0.0",
85
+ # https://github.com/pytorch/torchcodec/issues/640
86
+ "datasets[audio]>=4.0.0 ; (sys_platform == 'linux' or sys_platform == 'darwin')",
85
87
  "fsspec>=2024.12.0"
86
88
  ]
87
89
  video = [
@@ -7,6 +7,7 @@ from datachain.lib.dc import (
7
7
  datasets,
8
8
  delete_dataset,
9
9
  listings,
10
+ move_dataset,
10
11
  read_csv,
11
12
  read_database,
12
13
  read_dataset,
@@ -69,6 +70,7 @@ __all__ = [
69
70
  "is_chain_type",
70
71
  "listings",
71
72
  "metrics",
73
+ "move_dataset",
72
74
  "param",
73
75
  "read_csv",
74
76
  "read_database",
@@ -956,26 +956,9 @@ class Catalog:
956
956
  self, dataset: DatasetRecord, conn=None, **kwargs
957
957
  ) -> DatasetRecord:
958
958
  """Updates dataset fields."""
959
- old_name = None
960
- new_name = None
961
- if "name" in kwargs and kwargs["name"] != dataset.name:
962
- old_name = dataset.name
963
- new_name = kwargs["name"]
964
-
965
- dataset = self.metastore.update_dataset(dataset, conn=conn, **kwargs)
966
-
967
- if old_name and new_name:
968
- # updating name must result in updating dataset table names as well
969
- for version in [v.version for v in dataset.versions]:
970
- self.warehouse.rename_dataset_table(
971
- dataset,
972
- old_name,
973
- new_name,
974
- old_version=version,
975
- new_version=version,
976
- )
977
-
978
- return dataset
959
+ dataset_updated = self.metastore.update_dataset(dataset, conn=conn, **kwargs)
960
+ self.warehouse.rename_dataset_tables(dataset, dataset_updated)
961
+ return dataset_updated
979
962
 
980
963
  def remove_dataset_version(
981
964
  self, dataset: DatasetRecord, version: str, drop_rows: Optional[bool] = True
@@ -207,6 +207,10 @@ class AbstractMetastore(ABC, Serializable):
207
207
  It also creates project if not found and create flag is set to True.
208
208
  """
209
209
 
210
+ @abstractmethod
211
+ def get_project_by_id(self, project_id: int, conn=None) -> Project:
212
+ """Gets a single project by id"""
213
+
210
214
  @abstractmethod
211
215
  def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
212
216
  """Gets list of projects in some namespace or in general (in all namespaces)"""
@@ -851,6 +855,24 @@ class AbstractDBMetastore(AbstractMetastore):
851
855
  )
852
856
  return self.project_class.parse(*rows[0])
853
857
 
858
+ def get_project_by_id(self, project_id: int, conn=None) -> Project:
859
+ """Gets a single project by id"""
860
+ n = self._namespaces
861
+ p = self._projects
862
+
863
+ query = self._projects_select(
864
+ *(getattr(n.c, f) for f in self._namespaces_fields),
865
+ *(getattr(p.c, f) for f in self._projects_fields),
866
+ )
867
+ query = query.select_from(n.join(p, n.c.id == p.c.namespace_id)).where(
868
+ p.c.id == project_id
869
+ )
870
+
871
+ rows = list(self.db.execute(query, conn=conn))
872
+ if not rows:
873
+ raise ProjectNotFoundError(f"Project with id {project_id} not found.")
874
+ return self.project_class.parse(*rows[0])
875
+
854
876
  def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
855
877
  """
856
878
  Gets a list of projects inside some namespace, or in all namespaces
@@ -1008,6 +1030,11 @@ class AbstractDBMetastore(AbstractMetastore):
1008
1030
  else:
1009
1031
  values[field] = json.dumps(value)
1010
1032
  dataset_values[field] = DatasetRecord.parse_schema(value)
1033
+ elif field == "project_id":
1034
+ if not value:
1035
+ raise ValueError("Cannot set empty project_id for dataset")
1036
+ dataset_values["project"] = self.get_project_by_id(value)
1037
+ values[field] = value
1011
1038
  else:
1012
1039
  values[field] = value
1013
1040
  dataset_values[field] = value
@@ -1017,7 +1044,9 @@ class AbstractDBMetastore(AbstractMetastore):
1017
1044
 
1018
1045
  d = self._datasets
1019
1046
  self.db.execute(
1020
- self._datasets_update().where(d.c.name == dataset.name).values(values),
1047
+ self._datasets_update()
1048
+ .where(d.c.name == dataset.name, d.c.project_id == dataset.project.id)
1049
+ .values(values),
1021
1050
  conn=conn,
1022
1051
  ) # type: ignore [attr-defined]
1023
1052
 
@@ -356,24 +356,23 @@ class AbstractWarehouse(ABC, Serializable):
356
356
  self, dataset: DatasetRecord, version: str
357
357
  ) -> list[StorageURI]: ...
358
358
 
359
- def rename_dataset_table(
360
- self,
361
- dataset: DatasetRecord,
362
- old_name: str,
363
- new_name: str,
364
- old_version: str,
365
- new_version: str,
359
+ def rename_dataset_tables(
360
+ self, dataset: DatasetRecord, dataset_updated: DatasetRecord
366
361
  ) -> None:
367
- namespace = dataset.project.namespace.name
368
- project = dataset.project.name
369
- old_ds_table_name = self._construct_dataset_table_name(
370
- namespace, project, old_name, old_version
371
- )
372
- new_ds_table_name = self._construct_dataset_table_name(
373
- namespace, project, new_name, new_version
374
- )
375
-
376
- self.db.rename_table(old_ds_table_name, new_ds_table_name)
362
+ """
363
+ Renames all dataset version tables when parts of the dataset that
364
+ are used in constructing table name are updated.
365
+ If nothing important is changed, nothing will be renamed (no DB calls
366
+ will be made at all).
367
+ """
368
+ for version in [v.version for v in dataset_updated.versions]:
369
+ if not dataset.has_version(version):
370
+ continue
371
+ src = self.dataset_table_name(dataset, version)
372
+ dest = self.dataset_table_name(dataset_updated, version)
373
+ if src == dest:
374
+ continue
375
+ self.db.rename_table(src, dest)
377
376
 
378
377
  def dataset_rows_count(self, dataset: DatasetRecord, version=None) -> int:
379
378
  """Returns total number of rows in a dataset"""
@@ -126,7 +126,16 @@ class ArrowGenerator(Generator):
126
126
  if isinstance(kwargs.get("format"), CsvFileFormat):
127
127
  kwargs["format"] = "csv"
128
128
  arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
129
+
130
+ if self.output_schema and hasattr(vals[0], "source"):
131
+ # if we are reading parquet file written by datachain it might have
132
+ # source inside of it already, so we should not duplicate it, instead
133
+ # we are re-creating it of the self.source flag
134
+ vals[0].source = arrow_file # type: ignore[attr-defined]
135
+
136
+ return vals
129
137
  return [arrow_file, *vals]
138
+
130
139
  return vals
131
140
 
132
141
  def _process_non_datachain_record(
@@ -1,7 +1,7 @@
1
1
  from .csv import read_csv
2
2
  from .database import read_database
3
3
  from .datachain import C, Column, DataChain
4
- from .datasets import datasets, delete_dataset, read_dataset
4
+ from .datasets import datasets, delete_dataset, move_dataset, read_dataset
5
5
  from .hf import read_hf
6
6
  from .json import read_json
7
7
  from .listings import listings
@@ -22,6 +22,7 @@ __all__ = [
22
22
  "datasets",
23
23
  "delete_dataset",
24
24
  "listings",
25
+ "move_dataset",
25
26
  "read_csv",
26
27
  "read_database",
27
28
  "read_dataset",
@@ -361,3 +361,58 @@ def delete_dataset(
361
361
  else:
362
362
  version = None
363
363
  catalog.remove_dataset(name, ds_project, version=version, force=force)
364
+
365
+
366
+ def move_dataset(
367
+ src: str,
368
+ dest: str,
369
+ session: Optional[Session] = None,
370
+ in_memory: bool = False,
371
+ ) -> None:
372
+ """Moves an entire dataset between namespaces and projects.
373
+
374
+ Args:
375
+ src: The source dataset name. This can be a fully qualified name that includes
376
+ the namespace and project, or a regular name. If a regular name is used,
377
+ default values will be applied. The source dataset will no longer exist
378
+ after the move.
379
+ dest: The destination dataset name. This can also be a fully qualified
380
+ name with a namespace and project, or just a regular name (default values
381
+ will be used in that case). The original dataset will be moved here.
382
+ session: An optional session instance. If not provided, the default session
383
+ will be used.
384
+ in_memory: If True, creates an in-memory session. Defaults to False.
385
+
386
+ Returns:
387
+ None
388
+
389
+ Examples:
390
+ ```python
391
+ import datachain as dc
392
+ dc.move_dataset("cats", "new_cats")
393
+ ```
394
+
395
+ ```python
396
+ import datachain as dc
397
+ dc.move_dataset("dev.animals.cats", "prod.animals.cats")
398
+ ```
399
+ """
400
+ session = Session.get(session, in_memory=in_memory)
401
+ catalog = session.catalog
402
+
403
+ namespace, project, name = catalog.get_full_dataset_name(src)
404
+ dest_namespace, dest_project, dest_name = catalog.get_full_dataset_name(dest)
405
+
406
+ dataset = catalog.get_dataset(
407
+ name, catalog.metastore.get_project(project, namespace)
408
+ )
409
+
410
+ catalog.update_dataset(
411
+ dataset,
412
+ name=dest_name,
413
+ project_id=catalog.metastore.get_project(
414
+ dest_project,
415
+ dest_namespace,
416
+ create=catalog.metastore.project_allowed_to_create,
417
+ ).id,
418
+ )
@@ -11,7 +11,7 @@ try:
11
11
  Image,
12
12
  IterableDataset,
13
13
  IterableDatasetDict,
14
- Sequence,
14
+ List,
15
15
  Value,
16
16
  load_dataset,
17
17
  )
@@ -59,7 +59,6 @@ class HFImage(DataModel):
59
59
 
60
60
 
61
61
  class HFAudio(DataModel):
62
- path: str
63
62
  array: list[float]
64
63
  sampling_rate: int
65
64
 
@@ -116,26 +115,24 @@ def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
116
115
  return {"": ds}
117
116
 
118
117
 
119
- def convert_feature(val: Any, feat: Any, anno: Any) -> Any: # noqa: PLR0911
120
- if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D)):
118
+ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
119
+ if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D, List)):
121
120
  return val
122
121
  if isinstance(feat, ClassLabel):
123
122
  return HFClassLabel(string=feat.names[val], integer=val)
124
- if isinstance(feat, Sequence):
125
- if isinstance(feat.feature, dict):
126
- sdict = {}
127
- for sname in val:
128
- sfeat = feat.feature[sname]
129
- sanno = anno.model_fields[sname].annotation
130
- sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
131
- return anno(**sdict)
132
- return val
123
+ if isinstance(feat, dict):
124
+ sdict = {}
125
+ for sname in val:
126
+ sfeat = feat[sname]
127
+ sanno = anno.model_fields[sname].annotation
128
+ sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
129
+ return anno(**sdict)
133
130
  if isinstance(feat, Image):
134
131
  if isinstance(val, dict):
135
132
  return HFImage(img=val["bytes"])
136
133
  return HFImage(img=image_to_bytes(val))
137
134
  if isinstance(feat, Audio):
138
- return HFAudio(**val)
135
+ return HFAudio(array=val["array"], sampling_rate=val["sampling_rate"])
139
136
 
140
137
 
141
138
  def get_output_schema(features: Features) -> dict[str, DataType]:
@@ -151,13 +148,13 @@ def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
151
148
  return arrow_type_mapper(val.pa_type)
152
149
  if isinstance(val, ClassLabel):
153
150
  return HFClassLabel
154
- if isinstance(val, Sequence):
155
- if isinstance(val.feature, dict):
156
- sequence_dict = {}
157
- for sname, sval in val.feature.items():
158
- dtype = _feature_to_chain_type(sname, sval)
159
- sequence_dict[sname] = list[dtype] # type: ignore[valid-type]
160
- return dict_to_data_model(name, sequence_dict) # type: ignore[arg-type]
151
+ if isinstance(val, dict):
152
+ sequence_dict = {}
153
+ for sname, sval in val.items():
154
+ dtype = _feature_to_chain_type(sname, sval)
155
+ sequence_dict[sname] = dtype # type: ignore[valid-type]
156
+ return dict_to_data_model(name, sequence_dict) # type: ignore[arg-type]
157
+ if isinstance(val, List):
161
158
  return list[_feature_to_chain_type(name, val.feature)] # type: ignore[arg-type,misc,return-value]
162
159
  if isinstance(val, Array2D):
163
160
  dtype = arrow_type_mapper(string_to_arrow(val.dtype))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.24.6
3
+ Version: 0.25.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -70,7 +70,8 @@ Provides-Extra: vector
70
70
  Requires-Dist: usearch; extra == "vector"
71
71
  Provides-Extra: hf
72
72
  Requires-Dist: numba>=0.60.0; extra == "hf"
73
- Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
73
+ Requires-Dist: datasets[vision]>=4.0.0; extra == "hf"
74
+ Requires-Dist: datasets[audio]>=4.0.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
74
75
  Requires-Dist: fsspec>=2024.12.0; extra == "hf"
75
76
  Provides-Extra: video
76
77
  Requires-Dist: ffmpeg-python; extra == "video"
@@ -63,9 +63,12 @@ open_clip_torch
63
63
 
64
64
  [hf]
65
65
  numba>=0.60.0
66
- datasets[audio,vision]>=2.21.0
66
+ datasets[vision]>=4.0.0
67
67
  fsspec>=2024.12.0
68
68
 
69
+ [hf:sys_platform == "linux" or sys_platform == "darwin"]
70
+ datasets[audio]>=4.0.0
71
+
69
72
  [remote]
70
73
  lz4
71
74
  requests>=2.22.0
@@ -576,6 +576,12 @@ def mock_allowed_to_create_namespace(allow_create_namespace):
576
576
  yield
577
577
 
578
578
 
579
+ @pytest.fixture
580
+ def mock_is_local_dataset():
581
+ with patch.object(AbstractMetastore, "is_local_dataset", return_value=True):
582
+ yield
583
+
584
+
579
585
  @pytest.fixture
580
586
  def project(test_session):
581
587
  return dc.create_project("dev", "animals", "Animals project")
@@ -11,12 +11,13 @@ from datachain.dataset import DatasetDependencyType, DatasetStatus
11
11
  from datachain.error import (
12
12
  DatasetInvalidVersionError,
13
13
  DatasetNotFoundError,
14
+ ProjectNotFoundError,
14
15
  )
15
16
  from datachain.lib.file import File
16
17
  from datachain.lib.listing import parse_listing_uri
17
18
  from datachain.query.dataset import DatasetQuery
18
19
  from datachain.sql.types import Float32, Int, Int64
19
- from tests.utils import assert_row_names, dataset_dependency_asdict
20
+ from tests.utils import assert_row_names, dataset_dependency_asdict, table_row_count
20
21
 
21
22
  FILE_SCHEMA = {
22
23
  f"file__{name}": _type if _type != Int else Int64
@@ -169,14 +170,6 @@ def test_get_dataset(cloud_test_catalog, dogs_dataset):
169
170
  catalog.get_dataset("wrong name", dogs_dataset.project)
170
171
 
171
172
 
172
- # Returns None if the table does not exist
173
- def get_table_row_count(db, table_name):
174
- if not db.has_table(table_name):
175
- return None
176
- query = sa.select(sa.func.count()).select_from(sa.table(table_name))
177
- return next(db.execute(query), (None,))[0]
178
-
179
-
180
173
  def test_create_dataset_from_sources(listed_bucket, cloud_test_catalog, project):
181
174
  dataset_name = uuid.uuid4().hex
182
175
  src_uri = cloud_test_catalog.src_uri
@@ -327,7 +320,7 @@ def test_remove_dataset(cloud_test_catalog, dogs_dataset):
327
320
  catalog.get_dataset(dogs_dataset.name, dogs_dataset.project)
328
321
 
329
322
  dataset_table_name = catalog.warehouse.dataset_table_name(dogs_dataset, "1.0.0")
330
- assert get_table_row_count(catalog.warehouse.db, dataset_table_name) is None
323
+ assert table_row_count(catalog.warehouse.db, dataset_table_name) is None
331
324
 
332
325
  assert (
333
326
  catalog.metastore.get_direct_dataset_dependencies(dogs_dataset, "1.0.0") == []
@@ -391,14 +384,108 @@ def test_edit_dataset(cloud_test_catalog, dogs_dataset):
391
384
  old_dataset_table_name = catalog.warehouse.dataset_table_name(dogs_dataset, "1.0.0")
392
385
  new_dataset_table_name = catalog.warehouse.dataset_table_name(dataset, "1.0.0")
393
386
 
394
- assert get_table_row_count(catalog.warehouse.db, old_dataset_table_name) is None
395
- expected_table_row_count = get_table_row_count(
387
+ assert table_row_count(catalog.warehouse.db, old_dataset_table_name) is None
388
+ expected_table_row_count = table_row_count(
396
389
  catalog.warehouse.db, new_dataset_table_name
397
390
  )
398
391
  assert expected_table_row_count
399
392
  assert dataset.get_version("1.0.0").num_objects == expected_table_row_count
400
393
 
401
394
 
395
+ @pytest.mark.parametrize(
396
+ "old_name,new_name",
397
+ [
398
+ ("old.old.numbers", "new.new.numbers"),
399
+ ("old.old.numbers", "new.new.numbers_new"),
400
+ ("old.old.numbers", "old.new.numbers"),
401
+ ("old.old.numbers", "old.old.numbers"),
402
+ ("numbers", "numbers2"),
403
+ ("numbers", "numbers"),
404
+ ],
405
+ )
406
+ def test_move_dataset(
407
+ test_session,
408
+ old_name,
409
+ new_name,
410
+ mock_is_local_dataset,
411
+ ):
412
+ catalog = test_session.catalog
413
+
414
+ # create 2 versions of dataset in old project
415
+ for _ in range(2):
416
+ (dc.read_values(num=[1, 2, 3], session=test_session).save(old_name))
417
+
418
+ dataset = dc.read_dataset(old_name).dataset
419
+
420
+ dc.move_dataset(old_name, new_name, session=test_session)
421
+
422
+ if old_name != new_name:
423
+ # check that old dataset doesn't exist any more
424
+ with pytest.raises(DatasetNotFoundError):
425
+ dc.read_dataset(old_name).save("wrong")
426
+
427
+ dataset_updated = dc.read_dataset(new_name).dataset
428
+
429
+ # check if dataset tables are renamed correctly as well
430
+ for version in [v.version for v in dataset.versions]:
431
+ old_table_name = catalog.warehouse.dataset_table_name(dataset, version)
432
+ new_table_name = catalog.warehouse.dataset_table_name(dataset_updated, version)
433
+ if old_name == new_name:
434
+ assert old_table_name == new_table_name
435
+ else:
436
+ assert table_row_count(catalog.warehouse.db, old_table_name) is None
437
+
438
+ assert table_row_count(catalog.warehouse.db, new_table_name) == 3
439
+
440
+
441
+ def test_move_dataset_then_save_into(test_session):
442
+ old_name = "old.old.numbers"
443
+ new_name = "new.new.numbers"
444
+
445
+ # create 2 versions of dataset in old project
446
+ for _ in range(2):
447
+ dc.read_values(num=[1, 2, 3], session=test_session).save(old_name)
448
+
449
+ dc.move_dataset(old_name, new_name, session=test_session)
450
+ dc.read_values(num=[1, 2, 3], session=test_session).save(new_name)
451
+
452
+ ds = dc.datasets(column="dataset", session=test_session)
453
+ datasets = [
454
+ d
455
+ for d in ds.to_values("dataset")
456
+ if d.name == "numbers" and d.project == "new" and d.namespace == "new"
457
+ ]
458
+
459
+ assert len(datasets) == 3
460
+
461
+
462
+ def test_move_dataset_wrong_old_project(test_session, project):
463
+ dc.read_values(num=[1, 2, 3], session=test_session).save("old.old.numbers")
464
+
465
+ with pytest.raises(ProjectNotFoundError):
466
+ dc.move_dataset("wrong.wrong.numbers", "new.new.numbers", session=test_session)
467
+
468
+
469
+ def test_move_dataset_error_in_session_moved_dataset_removed(catalog):
470
+ from datachain.query.session import Session
471
+
472
+ old_name = "old.old.numbers"
473
+ new_name = "new.new.numbers"
474
+
475
+ with pytest.raises(DatasetNotFoundError):
476
+ with Session("new", catalog=catalog) as test_session:
477
+ dc.read_values(num=[1, 2, 3]).save("aa")
478
+ dc.read_values(num=[1, 2, 3], session=test_session).save(old_name)
479
+ dc.move_dataset(old_name, new_name, session=test_session)
480
+
481
+ # throws DatasetNotFoundError
482
+ dc.read_dataset("wrong", session=test_session)
483
+
484
+ ds = dc.datasets(column="dataset")
485
+ datasets = [d for d in ds.to_values("dataset")] # noqa: C416
486
+ assert len(datasets) == 0
487
+
488
+
402
489
  def test_edit_dataset_same_name(cloud_test_catalog, dogs_dataset):
403
490
  dataset_new_name = dogs_dataset.name
404
491
  catalog = cloud_test_catalog.catalog
@@ -414,12 +501,12 @@ def test_edit_dataset_same_name(cloud_test_catalog, dogs_dataset):
414
501
  old_dataset_table_name = catalog.warehouse.dataset_table_name(dogs_dataset, "1.0.0")
415
502
  new_dataset_table_name = catalog.warehouse.dataset_table_name(dataset, "1.0.0")
416
503
 
417
- expected_table_row_count = get_table_row_count(
504
+ expected_table_row_count = table_row_count(
418
505
  catalog.warehouse.db, old_dataset_table_name
419
506
  )
420
507
  assert expected_table_row_count
421
508
  assert dataset.get_version("1.0.0").num_objects == expected_table_row_count
422
- assert expected_table_row_count == get_table_row_count(
509
+ assert expected_table_row_count == table_row_count(
423
510
  catalog.warehouse.db, new_dataset_table_name
424
511
  )
425
512