datachain 0.26.3__tar.gz → 0.27.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (407) hide show
  1. {datachain-0.26.3 → datachain-0.27.0}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.26.3 → datachain-0.27.0}/PKG-INFO +7 -2
  3. {datachain-0.26.3 → datachain-0.27.0}/README.rst +4 -1
  4. {datachain-0.26.3 → datachain-0.27.0}/docs/commands/job/run.md +62 -13
  5. {datachain-0.26.3 → datachain-0.27.0}/docs/examples.md +21 -31
  6. {datachain-0.26.3 → datachain-0.27.0}/pyproject.toml +2 -0
  7. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/parser/job.py +14 -1
  8. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/arrow.py +1 -1
  9. datachain-0.27.0/src/datachain/lib/audio.py +244 -0
  10. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/data_model.py +9 -1
  11. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/datachain.py +8 -4
  12. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/hf.py +20 -4
  13. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/storage.py +3 -3
  14. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/file.py +60 -8
  15. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/hf.py +17 -7
  16. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/video.py +4 -1
  17. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/remote/studio.py +4 -0
  18. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/studio.py +36 -0
  19. {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/PKG-INFO +7 -2
  20. {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/SOURCES.txt +1 -0
  21. {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/requires.txt +2 -0
  22. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_audio.py +3 -2
  23. datachain-0.27.0/tests/func/test_hf.py +142 -0
  24. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_image.py +28 -0
  25. datachain-0.27.0/tests/func/test_studio_datetime_parsing.py +107 -0
  26. {datachain-0.26.3 → datachain-0.27.0}/tests/test_cli_studio.py +47 -0
  27. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_audio.py +153 -34
  28. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_datachain.py +0 -18
  29. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_hf.py +3 -1
  30. datachain-0.26.3/src/datachain/lib/audio.py +0 -151
  31. datachain-0.26.3/tests/func/test_hf.py +0 -67
  32. {datachain-0.26.3 → datachain-0.27.0}/.cruft.json +0 -0
  33. {datachain-0.26.3 → datachain-0.27.0}/.gitattributes +0 -0
  34. {datachain-0.26.3 → datachain-0.27.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  35. {datachain-0.26.3 → datachain-0.27.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  36. {datachain-0.26.3 → datachain-0.27.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  37. {datachain-0.26.3 → datachain-0.27.0}/.github/codecov.yaml +0 -0
  38. {datachain-0.26.3 → datachain-0.27.0}/.github/dependabot.yml +0 -0
  39. {datachain-0.26.3 → datachain-0.27.0}/.github/workflows/benchmarks.yml +0 -0
  40. {datachain-0.26.3 → datachain-0.27.0}/.github/workflows/release.yml +0 -0
  41. {datachain-0.26.3 → datachain-0.27.0}/.github/workflows/tests-studio.yml +0 -0
  42. {datachain-0.26.3 → datachain-0.27.0}/.github/workflows/tests.yml +0 -0
  43. {datachain-0.26.3 → datachain-0.27.0}/.github/workflows/update-template.yaml +0 -0
  44. {datachain-0.26.3 → datachain-0.27.0}/.gitignore +0 -0
  45. {datachain-0.26.3 → datachain-0.27.0}/CODE_OF_CONDUCT.rst +0 -0
  46. {datachain-0.26.3 → datachain-0.27.0}/LICENSE +0 -0
  47. {datachain-0.26.3 → datachain-0.27.0}/docs/assets/captioned_cartoons.png +0 -0
  48. {datachain-0.26.3 → datachain-0.27.0}/docs/assets/datachain-white.svg +0 -0
  49. {datachain-0.26.3 → datachain-0.27.0}/docs/assets/datachain.svg +0 -0
  50. {datachain-0.26.3 → datachain-0.27.0}/docs/commands/auth/login.md +0 -0
  51. {datachain-0.26.3 → datachain-0.27.0}/docs/commands/auth/logout.md +0 -0
  52. {datachain-0.26.3 → datachain-0.27.0}/docs/commands/auth/team.md +0 -0
  53. {datachain-0.26.3 → datachain-0.27.0}/docs/commands/auth/token.md +0 -0
  54. {datachain-0.26.3 → datachain-0.27.0}/docs/commands/index.md +0 -0
  55. {datachain-0.26.3 → datachain-0.27.0}/docs/commands/job/cancel.md +0 -0
  56. {datachain-0.26.3 → datachain-0.27.0}/docs/commands/job/clusters.md +0 -0
  57. {datachain-0.26.3 → datachain-0.27.0}/docs/commands/job/logs.md +0 -0
  58. {datachain-0.26.3 → datachain-0.27.0}/docs/commands/job/ls.md +0 -0
  59. {datachain-0.26.3 → datachain-0.27.0}/docs/contributing.md +0 -0
  60. {datachain-0.26.3 → datachain-0.27.0}/docs/css/github-permalink-style.css +0 -0
  61. {datachain-0.26.3 → datachain-0.27.0}/docs/guide/db_migrations.md +0 -0
  62. {datachain-0.26.3 → datachain-0.27.0}/docs/guide/delta.md +0 -0
  63. {datachain-0.26.3 → datachain-0.27.0}/docs/guide/env.md +0 -0
  64. {datachain-0.26.3 → datachain-0.27.0}/docs/guide/index.md +0 -0
  65. {datachain-0.26.3 → datachain-0.27.0}/docs/guide/namespaces.md +0 -0
  66. {datachain-0.26.3 → datachain-0.27.0}/docs/guide/processing.md +0 -0
  67. {datachain-0.26.3 → datachain-0.27.0}/docs/guide/remotes.md +0 -0
  68. {datachain-0.26.3 → datachain-0.27.0}/docs/guide/retry.md +0 -0
  69. {datachain-0.26.3 → datachain-0.27.0}/docs/index.md +0 -0
  70. {datachain-0.26.3 → datachain-0.27.0}/docs/overrides/main.html +0 -0
  71. {datachain-0.26.3 → datachain-0.27.0}/docs/quick-start.md +0 -0
  72. {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/arrowrow.md +0 -0
  73. {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/bbox.md +0 -0
  74. {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/file.md +0 -0
  75. {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/imagefile.md +0 -0
  76. {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/index.md +0 -0
  77. {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/pose.md +0 -0
  78. {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/segment.md +0 -0
  79. {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/tarvfile.md +0 -0
  80. {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/textfile.md +0 -0
  81. {datachain-0.26.3 → datachain-0.27.0}/docs/references/data-types/videofile.md +0 -0
  82. {datachain-0.26.3 → datachain-0.27.0}/docs/references/datachain.md +0 -0
  83. {datachain-0.26.3 → datachain-0.27.0}/docs/references/func.md +0 -0
  84. {datachain-0.26.3 → datachain-0.27.0}/docs/references/index.md +0 -0
  85. {datachain-0.26.3 → datachain-0.27.0}/docs/references/toolkit.md +0 -0
  86. {datachain-0.26.3 → datachain-0.27.0}/docs/references/torch.md +0 -0
  87. {datachain-0.26.3 → datachain-0.27.0}/docs/references/udf.md +0 -0
  88. {datachain-0.26.3 → datachain-0.27.0}/docs/tutorials.md +0 -0
  89. {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  90. {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  91. {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/openimage-detect.py +0 -0
  92. {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  93. {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  94. {datachain-0.26.3 → datachain-0.27.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  95. {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/common_sql_functions.py +0 -0
  96. {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/json-csv-reader.py +0 -0
  97. {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/torch-loader.py +0 -0
  98. {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/udfs/parallel.py +0 -0
  99. {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/udfs/simple.py +0 -0
  100. {datachain-0.26.3 → datachain-0.27.0}/examples/get_started/udfs/stateful.py +0 -0
  101. {datachain-0.26.3 → datachain-0.27.0}/examples/incremental_processing/delta.py +0 -0
  102. {datachain-0.26.3 → datachain-0.27.0}/examples/incremental_processing/retry.py +0 -0
  103. {datachain-0.26.3 → datachain-0.27.0}/examples/incremental_processing/utils.py +0 -0
  104. {datachain-0.26.3 → datachain-0.27.0}/examples/llm_and_nlp/claude-query.py +0 -0
  105. {datachain-0.26.3 → datachain-0.27.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  106. {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/audio-to-text.py +0 -0
  107. {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/clip_inference.py +0 -0
  108. {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/hf_pipeline.py +0 -0
  109. {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  110. {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/wds.py +0 -0
  111. {datachain-0.26.3 → datachain-0.27.0}/examples/multimodal/wds_filtered.py +0 -0
  112. {datachain-0.26.3 → datachain-0.27.0}/mkdocs.yml +0 -0
  113. {datachain-0.26.3 → datachain-0.27.0}/noxfile.py +0 -0
  114. {datachain-0.26.3 → datachain-0.27.0}/setup.cfg +0 -0
  115. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/__init__.py +0 -0
  116. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/__main__.py +0 -0
  117. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/asyn.py +0 -0
  118. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cache.py +0 -0
  119. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/catalog/__init__.py +0 -0
  120. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/catalog/catalog.py +0 -0
  121. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/catalog/datasource.py +0 -0
  122. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/catalog/loader.py +0 -0
  123. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/__init__.py +0 -0
  124. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/__init__.py +0 -0
  125. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/datasets.py +0 -0
  126. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/du.py +0 -0
  127. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/index.py +0 -0
  128. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/ls.py +0 -0
  129. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/misc.py +0 -0
  130. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/query.py +0 -0
  131. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/commands/show.py +0 -0
  132. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/parser/__init__.py +0 -0
  133. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/parser/studio.py +0 -0
  134. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/parser/utils.py +0 -0
  135. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/cli/utils.py +0 -0
  136. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/__init__.py +0 -0
  137. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/azure.py +0 -0
  138. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/fileslice.py +0 -0
  139. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/fsspec.py +0 -0
  140. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/gcs.py +0 -0
  141. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/hf.py +0 -0
  142. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/local.py +0 -0
  143. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/client/s3.py +0 -0
  144. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/config.py +0 -0
  145. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/__init__.py +0 -0
  146. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/db_engine.py +0 -0
  147. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/job.py +0 -0
  148. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/metastore.py +0 -0
  149. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/schema.py +0 -0
  150. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/serializer.py +0 -0
  151. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/sqlite.py +0 -0
  152. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/data_storage/warehouse.py +0 -0
  153. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/dataset.py +0 -0
  154. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/delta.py +0 -0
  155. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/diff/__init__.py +0 -0
  156. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/error.py +0 -0
  157. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/fs/__init__.py +0 -0
  158. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/fs/reference.py +0 -0
  159. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/fs/utils.py +0 -0
  160. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/__init__.py +0 -0
  161. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/aggregate.py +0 -0
  162. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/array.py +0 -0
  163. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/base.py +0 -0
  164. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/conditional.py +0 -0
  165. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/func.py +0 -0
  166. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/numeric.py +0 -0
  167. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/path.py +0 -0
  168. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/random.py +0 -0
  169. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/string.py +0 -0
  170. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/func/window.py +0 -0
  171. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/job.py +0 -0
  172. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/__init__.py +0 -0
  173. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/clip.py +0 -0
  174. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/__init__.py +0 -0
  175. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/flatten.py +0 -0
  176. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  177. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  178. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/unflatten.py +0 -0
  179. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  180. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dataset_info.py +0 -0
  181. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/__init__.py +0 -0
  182. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/csv.py +0 -0
  183. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/database.py +0 -0
  184. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/datasets.py +0 -0
  185. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/json.py +0 -0
  186. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/listings.py +0 -0
  187. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/pandas.py +0 -0
  188. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/parquet.py +0 -0
  189. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/records.py +0 -0
  190. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/utils.py +0 -0
  191. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/dc/values.py +0 -0
  192. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/image.py +0 -0
  193. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/listing.py +0 -0
  194. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/listing_info.py +0 -0
  195. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/meta_formats.py +0 -0
  196. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/model_store.py +0 -0
  197. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/namespaces.py +0 -0
  198. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/projects.py +0 -0
  199. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/pytorch.py +0 -0
  200. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/settings.py +0 -0
  201. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/signal_schema.py +0 -0
  202. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/tar.py +0 -0
  203. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/text.py +0 -0
  204. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/udf.py +0 -0
  205. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/udf_signature.py +0 -0
  206. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/utils.py +0 -0
  207. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/webdataset.py +0 -0
  208. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/lib/webdataset_laion.py +0 -0
  209. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/listing.py +0 -0
  210. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/__init__.py +0 -0
  211. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/bbox.py +0 -0
  212. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/pose.py +0 -0
  213. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/segment.py +0 -0
  214. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  215. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  216. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/ultralytics/pose.py +0 -0
  217. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/ultralytics/segment.py +0 -0
  218. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/model/utils.py +0 -0
  219. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/namespace.py +0 -0
  220. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/node.py +0 -0
  221. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/nodes_fetcher.py +0 -0
  222. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/nodes_thread_pool.py +0 -0
  223. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/progress.py +0 -0
  224. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/project.py +0 -0
  225. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/py.typed +0 -0
  226. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/__init__.py +0 -0
  227. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/batch.py +0 -0
  228. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/dataset.py +0 -0
  229. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/dispatch.py +0 -0
  230. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/metrics.py +0 -0
  231. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/params.py +0 -0
  232. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/queue.py +0 -0
  233. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/schema.py +0 -0
  234. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/session.py +0 -0
  235. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/udf.py +0 -0
  236. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/query/utils.py +0 -0
  237. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/remote/__init__.py +0 -0
  238. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/script_meta.py +0 -0
  239. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/semver.py +0 -0
  240. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/__init__.py +0 -0
  241. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/default/__init__.py +0 -0
  242. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/default/base.py +0 -0
  243. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/__init__.py +0 -0
  244. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/aggregate.py +0 -0
  245. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/array.py +0 -0
  246. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/conditional.py +0 -0
  247. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/numeric.py +0 -0
  248. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/path.py +0 -0
  249. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/random.py +0 -0
  250. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/functions/string.py +0 -0
  251. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/selectable.py +0 -0
  252. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  253. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/sqlite/base.py +0 -0
  254. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/sqlite/types.py +0 -0
  255. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/sqlite/vector.py +0 -0
  256. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/types.py +0 -0
  257. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/sql/utils.py +0 -0
  258. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/telemetry.py +0 -0
  259. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/toolkit/__init__.py +0 -0
  260. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/toolkit/split.py +0 -0
  261. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/torch/__init__.py +0 -0
  262. {datachain-0.26.3 → datachain-0.27.0}/src/datachain/utils.py +0 -0
  263. {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  264. {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/entry_points.txt +0 -0
  265. {datachain-0.26.3 → datachain-0.27.0}/src/datachain.egg-info/top_level.txt +0 -0
  266. {datachain-0.26.3 → datachain-0.27.0}/tests/__init__.py +0 -0
  267. {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/__init__.py +0 -0
  268. {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/conftest.py +0 -0
  269. {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  270. {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  271. {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/datasets/.gitignore +0 -0
  272. {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  273. {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/test_datachain.py +0 -0
  274. {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/test_ls.py +0 -0
  275. {datachain-0.26.3 → datachain-0.27.0}/tests/benchmarks/test_version.py +0 -0
  276. {datachain-0.26.3 → datachain-0.27.0}/tests/conftest.py +0 -0
  277. {datachain-0.26.3 → datachain-0.27.0}/tests/data.py +0 -0
  278. {datachain-0.26.3 → datachain-0.27.0}/tests/examples/__init__.py +0 -0
  279. {datachain-0.26.3 → datachain-0.27.0}/tests/examples/test_examples.py +0 -0
  280. {datachain-0.26.3 → datachain-0.27.0}/tests/examples/test_wds_e2e.py +0 -0
  281. {datachain-0.26.3 → datachain-0.27.0}/tests/examples/wds_data.py +0 -0
  282. {datachain-0.26.3 → datachain-0.27.0}/tests/func/__init__.py +0 -0
  283. {datachain-0.26.3 → datachain-0.27.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  284. {datachain-0.26.3 → datachain-0.27.0}/tests/func/data/lena.jpg +0 -0
  285. {datachain-0.26.3 → datachain-0.27.0}/tests/func/fake-service-account-credentials.json +0 -0
  286. {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/__init__.py +0 -0
  287. {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_aggregate.py +0 -0
  288. {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_array.py +0 -0
  289. {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_conditional.py +0 -0
  290. {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_numeric.py +0 -0
  291. {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_path.py +0 -0
  292. {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_random.py +0 -0
  293. {datachain-0.26.3 → datachain-0.27.0}/tests/func/functions/test_string.py +0 -0
  294. {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/__init__.py +0 -0
  295. {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/data/running-mask0.png +0 -0
  296. {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/data/running-mask1.png +0 -0
  297. {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/data/running.jpg +0 -0
  298. {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/data/ships.jpg +0 -0
  299. {datachain-0.26.3 → datachain-0.27.0}/tests/func/model/test_yolo.py +0 -0
  300. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_batching.py +0 -0
  301. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_catalog.py +0 -0
  302. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_client.py +0 -0
  303. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_cloud_transfer.py +0 -0
  304. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_data_storage.py +0 -0
  305. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_datachain.py +0 -0
  306. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_datachain_merge.py +0 -0
  307. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_dataset_query.py +0 -0
  308. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_datasets.py +0 -0
  309. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_delta.py +0 -0
  310. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_feature_pickling.py +0 -0
  311. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_file.py +0 -0
  312. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_hidden_field.py +0 -0
  313. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_listing.py +0 -0
  314. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_ls.py +0 -0
  315. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_meta_formats.py +0 -0
  316. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_metastore.py +0 -0
  317. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_metrics.py +0 -0
  318. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_pull.py +0 -0
  319. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_pytorch.py +0 -0
  320. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_query.py +0 -0
  321. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_read_database.py +0 -0
  322. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_read_dataset_remote.py +0 -0
  323. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  324. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_retry.py +0 -0
  325. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_session.py +0 -0
  326. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_toolkit.py +0 -0
  327. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_video.py +0 -0
  328. {datachain-0.26.3 → datachain-0.27.0}/tests/func/test_warehouse.py +0 -0
  329. {datachain-0.26.3 → datachain-0.27.0}/tests/scripts/feature_class.py +0 -0
  330. {datachain-0.26.3 → datachain-0.27.0}/tests/scripts/feature_class_exception.py +0 -0
  331. {datachain-0.26.3 → datachain-0.27.0}/tests/scripts/feature_class_parallel.py +0 -0
  332. {datachain-0.26.3 → datachain-0.27.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  333. {datachain-0.26.3 → datachain-0.27.0}/tests/scripts/name_len_slow.py +0 -0
  334. {datachain-0.26.3 → datachain-0.27.0}/tests/test_atomicity.py +0 -0
  335. {datachain-0.26.3 → datachain-0.27.0}/tests/test_cli_e2e.py +0 -0
  336. {datachain-0.26.3 → datachain-0.27.0}/tests/test_import_time.py +0 -0
  337. {datachain-0.26.3 → datachain-0.27.0}/tests/test_query_e2e.py +0 -0
  338. {datachain-0.26.3 → datachain-0.27.0}/tests/test_telemetry.py +0 -0
  339. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/__init__.py +0 -0
  340. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/__init__.py +0 -0
  341. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/conftest.py +0 -0
  342. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_arrow.py +0 -0
  343. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_clip.py +0 -0
  344. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  345. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  346. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_diff.py +0 -0
  347. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_feature.py +0 -0
  348. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_feature_utils.py +0 -0
  349. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_file.py +0 -0
  350. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_image.py +0 -0
  351. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_listing_info.py +0 -0
  352. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_namespace.py +0 -0
  353. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_partition_by.py +0 -0
  354. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_project.py +0 -0
  355. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  356. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_schema.py +0 -0
  357. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_signal_schema.py +0 -0
  358. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  359. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_text.py +0 -0
  360. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_udf.py +0 -0
  361. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_udf_signature.py +0 -0
  362. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_utils.py +0 -0
  363. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/lib/test_webdataset.py +0 -0
  364. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/model/__init__.py +0 -0
  365. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/model/test_bbox.py +0 -0
  366. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/model/test_pose.py +0 -0
  367. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/model/test_segment.py +0 -0
  368. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/model/test_utils.py +0 -0
  369. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/__init__.py +0 -0
  370. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  371. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  372. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  373. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_array.py +0 -0
  374. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_conditional.py +0 -0
  375. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_path.py +0 -0
  376. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_random.py +0 -0
  377. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_selectable.py +0 -0
  378. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/sql/test_string.py +0 -0
  379. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_asyn.py +0 -0
  380. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_cache.py +0 -0
  381. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_catalog.py +0 -0
  382. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_catalog_loader.py +0 -0
  383. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_cli_parsing.py +0 -0
  384. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_client.py +0 -0
  385. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_client_gcs.py +0 -0
  386. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_client_s3.py +0 -0
  387. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_config.py +0 -0
  388. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_data_storage.py +0 -0
  389. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_database_engine.py +0 -0
  390. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_dataset.py +0 -0
  391. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_dispatch.py +0 -0
  392. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_fileslice.py +0 -0
  393. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_func.py +0 -0
  394. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_listing.py +0 -0
  395. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_metastore.py +0 -0
  396. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_module_exports.py +0 -0
  397. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_pytorch.py +0 -0
  398. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_query.py +0 -0
  399. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_query_metrics.py +0 -0
  400. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_query_params.py +0 -0
  401. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_script_meta.py +0 -0
  402. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_semver.py +0 -0
  403. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_serializer.py +0 -0
  404. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_session.py +0 -0
  405. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_utils.py +0 -0
  406. {datachain-0.26.3 → datachain-0.27.0}/tests/unit/test_warehouse.py +0 -0
  407. {datachain-0.26.3 → datachain-0.27.0}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.12.3'
27
+ rev: 'v0.12.4'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.26.3
3
+ Version: 0.27.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -26,6 +26,7 @@ Requires-Dist: packaging
26
26
  Requires-Dist: pyarrow
27
27
  Requires-Dist: typing-extensions
28
28
  Requires-Dist: python-dateutil>=2
29
+ Requires-Dist: dateparser>=1.0.0
29
30
  Requires-Dist: attrs>=21.3.0
30
31
  Requires-Dist: fsspec>=2024.2.0
31
32
  Requires-Dist: s3fs>=2024.2.0
@@ -100,6 +101,7 @@ Provides-Extra: dev
100
101
  Requires-Dist: datachain[docs,tests]; extra == "dev"
101
102
  Requires-Dist: mypy==1.17.0; extra == "dev"
102
103
  Requires-Dist: types-python-dateutil; extra == "dev"
104
+ Requires-Dist: types-dateparser; extra == "dev"
103
105
  Requires-Dist: types-pytz; extra == "dev"
104
106
  Requires-Dist: types-PyYAML; extra == "dev"
105
107
  Requires-Dist: types-requests; extra == "dev"
@@ -118,7 +120,7 @@ Dynamic: license-file
118
120
  |logo| DataChain
119
121
  ================
120
122
 
121
- |PyPI| |Python Version| |Codecov| |Tests|
123
+ |PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
122
124
 
123
125
  .. |logo| image:: docs/assets/datachain.svg
124
126
  :height: 24
@@ -134,6 +136,9 @@ Dynamic: license-file
134
136
  .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
135
137
  :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
136
138
  :alt: Tests
139
+ .. |DeepWiki| image:: https://deepwiki.com/badge.svg
140
+ :target: https://deepwiki.com/iterative/datachain
141
+ :alt: DeepWiki
137
142
 
138
143
  DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
139
144
  data like images, audio, videos, text and PDFs. It integrates with external storage
@@ -2,7 +2,7 @@
2
2
  |logo| DataChain
3
3
  ================
4
4
 
5
- |PyPI| |Python Version| |Codecov| |Tests|
5
+ |PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
6
6
 
7
7
  .. |logo| image:: docs/assets/datachain.svg
8
8
  :height: 24
@@ -18,6 +18,9 @@
18
18
  .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
19
19
  :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
20
20
  :alt: Tests
21
+ .. |DeepWiki| image:: https://deepwiki.com/badge.svg
22
+ :target: https://deepwiki.com/iterative/datachain
23
+ :alt: DeepWiki
21
24
 
22
25
  DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
23
26
  data like images, audio, videos, text and PDFs. It integrates with external storage
@@ -5,15 +5,21 @@ Run a job in Studio.
5
5
  ## Synopsis
6
6
 
7
7
  ```usage
8
- usage: datachain job run [-h] [-v] [-q] [--team TEAM] [--env-file ENV_FILE] [--env ENV [ENV ...]]
9
- [--workers WORKERS] [--files FILES [FILES ...]] [--python-version PYTHON_VERSION]
8
+ usage: datachain job run [-h] [-v] [-q] [--team TEAM] [--env-file ENV_FILE]
9
+ [--env ENV [ENV ...]]
10
+ [--cluster CLUSTER] [--workers WORKERS]
11
+ [--files FILES [FILES ...]]
12
+ [--python-version PYTHON_VERSION]
13
+ [--repository REPOSITORY]
10
14
  [--req-file REQ_FILE] [--req REQ [REQ ...]]
15
+ [--priority PRIORITY]
16
+ [--start-time START_TIME] [--cron CRON]
11
17
  file
12
18
  ```
13
19
 
14
20
  ## Description
15
21
 
16
- This command runs a job in Studio using the specified query file. You can configure various aspects of the job including environment variables, Python version, dependencies, and more.
22
+ This command runs a job in Studio using the specified query file. You can configure various aspects of the job including environment variables, Python version, dependencies, and more. When using `--start-time` or `--cron`, the job is scheduled as a task (visible in the Tasks tab in the UI) and does not start immediately.
17
23
 
18
24
  ## Arguments
19
25
 
@@ -28,10 +34,12 @@ This command runs a job in Studio using the specified query file. You can config
28
34
  * `--workers WORKERS` - Number of workers for the job
29
35
  * `--files FILES` - Additional files to include in the job
30
36
  * `--python-version PYTHON_VERSION` - Python version for the job (e.g., 3.9, 3.10, 3.11)
37
+ * `--repository REPOSITORY` - Repository URL to clone before running the job
31
38
  * `--req-file REQ_FILE` - Python requirements file
32
39
  * `--req REQ` - Python package requirements
33
40
  * `--priority PRIORITY` - Priority for the job in range 0-5. Lower value is higher priority (default: 5)
34
- * `--repository URL` - Repository URL to clone before running the job.
41
+ * `--start-time START_TIME` - Time to schedule the task in YYYY-MM-DDTHH:mm format or natural language.
42
+ * `--cron CRON` - Cron expression for the cron task.
35
43
  * `-h`, `--help` - Show the help message and exit.
36
44
  * `-v`, `--verbose` - Be verbose.
37
45
  * `-q`, `--quiet` - Be quiet.
@@ -66,17 +74,11 @@ datachain job run --env API_KEY=123 --req pandas numpy query.py
66
74
  6. Run a job with a repository (will be cloned in the job working directory):
67
75
  ```bash
68
76
  datachain job run --repository https://github.com/iterative/datachain query.py
69
- ```
70
-
71
- To specify a branch / revision:
72
77
 
73
- ```bash
78
+ # To specify a branch / revision:
74
79
  datachain job run --repository https://github.com/iterative/datachain@main query.py
75
- ```
76
-
77
- Git URLs are also supported:
78
80
 
79
- ```bash
81
+ # Git URLs are also supported:
80
82
  datachain job run --repository git@github.com:iterative/datachain.git@main query.py
81
83
  ```
82
84
 
@@ -90,7 +92,43 @@ datachain job run --priority 2 query.py
90
92
  # Get the cluster id using following command
91
93
  datachain job clusters
92
94
  # Use the id of an active cluster from above
93
- datachain job run --cluster-id 1 query.py
95
+ datachain job run --cluster 1 query.py
96
+ ```
97
+
98
+ 9. Schedule a job to run once at a specific time
99
+ ```bash
100
+ # Run job tomorrow at 3pm
101
+ datachain job run --start-time "tomorrow 3pm" query.py
102
+
103
+ # Run job in 2 hours
104
+ datachain job run --start-time "in 2 hours" query.py
105
+
106
+ # Run job on Monday at 9am
107
+ datachain job run --start-time "monday 9am" query.py
108
+
109
+ # Run job at a specific date and time
110
+ datachain job run --start-time "2024-01-15 14:30:00" query.py
111
+ ```
112
+
113
+ 10. Schedule a recurring job using cron expression
114
+ ```bash
115
+ # Run job daily at midnight
116
+ datachain job run --cron "0 0 * * *" query.py
117
+
118
+ # Run job every Monday at 9am
119
+ datachain job run --cron "0 9 * * 1" query.py
120
+
121
+ # Run job every hour
122
+ datachain job run --cron "0 * * * *" query.py
123
+
124
+ # Run job every month
125
+ datachain job run --cron "@monthly" query.py
126
+ ```
127
+
128
+ 11. Schedule a recurring job with a start time
129
+ ```bash
130
+ # Start the cron job after tomorrow 3pm
131
+ datachain job run --start-time "tomorrow 3pm" --cron "0 0 * * *" query.py
94
132
  ```
95
133
 
96
134
  ## Notes
@@ -99,3 +137,14 @@ datachain job run --cluster-id 1 query.py
99
137
  * To cancel a running job, use the `datachain job cancel` command
100
138
  * The job will continue running in Studio even after you stop viewing the logs
101
139
  * You can get the list of compute clusters using `datachain job clusters` command.
140
+ * When using `--start-time` or `--cron` options, the job is scheduled as a task and will not show logs immediately. The job will be executed according to the schedule.
141
+ * The `--start-time` option supports natural language parsing using the [dateparser](https://dateparser.readthedocs.io/en/latest/) library, allowing flexible time expressions like "tomorrow 3pm", "in 2 hours", "monday 9am", etc.
142
+ * Cron expressions follow the standard format: minute hour day-of-month month day-of-week (e.g., "0 0 * * *" for daily at midnight) or Vixie cron-style “@” keyword expressions.
143
+ * The following Vixie cron-style keyword expressions are supported:
144
+ * @midnight
145
+ * @hourly
146
+ * @daily
147
+ * @weekly
148
+ * @monthly
149
+ * @yearly
150
+ * @annually
@@ -10,55 +10,45 @@ title: Examples
10
10
 
11
11
  Datachain is built by composing wrangling operations.
12
12
 
13
- For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The below code takes images from the cloud, and applies PaliGemma model to caption the first five of them and put the results in the column scene”:
13
+ For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The code below takes images from the cloud, applies the BLIP Large model to caption the first five of them, and puts the results in the column "scene":
14
14
 
15
15
  ```python
16
16
  import datachain as dc # (1)!
17
- from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)!
17
+ from transformers import Pipeline, pipeline
18
+ from datachain import File
18
19
 
19
- images = dc.read_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
20
-
21
- model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-mix-224")
22
- processor = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
23
-
24
- def process(file: File) -> str:
25
- image=file.read().convert("RGB")
26
- inputs = processor(text="caption", images=image, return_tensors="pt")
27
- generate_ids = model.generate(**inputs, max_new_tokens=100)
28
- return processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
20
+ def process(file: File, pipeline: Pipeline) -> str:
21
+ image = file.read().convert("RGB")
22
+ return pipeline(image)[0]["generated_text"]
29
23
 
30
24
  chain = (
31
- images.limit(5)
32
- .settings(cache=True)
33
- .map(scene=lambda file: process(file), output = str)
34
- .save()
25
+ dc.read_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image", anon=True)
26
+ .limit(5)
27
+ .settings(cache=True)
28
+ .setup(pipeline=lambda: pipeline("image-to-text", model="Salesforce/blip-image-captioning-large"))
29
+ .map(scene=process)
30
+ .persist()
35
31
  )
36
32
  ```
37
33
 
38
- 1. `pip install datachain`
39
- 2. `pip install transformers`
34
+ 1. `pip install datachain[hf]`
40
35
 
41
36
  Here is how we can view the results in a plot:
42
37
 
43
38
  ```python
44
39
  import matplotlib.pyplot as plt
45
- import re
46
40
  from textwrap import wrap
47
41
 
48
- def trim_text(text):
49
- match = re.search(r'[A-Z][^.]*\.', text)
50
- return match.group(0) if match else ''
51
-
52
- images = chain.collect("file")
53
- captions = chain.collect("scene")
54
- _ , axes = plt.subplots(1, len(captions), figsize=(15, 5))
42
+ count = chain.count()
43
+ _, axes = plt.subplots(1, count, figsize=(15, 5))
55
44
 
56
- for ax, img, caption in zip(axes, images, captions):
57
- ax.imshow(img.read(),cmap='gray')
58
- ax.axis('off')
59
- wrapped_caption = "\n".join(wrap(trim_text(caption), 30))
60
- ax.set_title(wrapped_caption, fontsize=6)
45
+ for ax, (img_file, caption) in zip(axes, chain.to_iter("file", "scene")):
46
+ ax.imshow(img_file.read(), cmap="gray")
47
+ ax.axis("off")
48
+ wrapped_caption = "\n".join(wrap(caption.strip(), 40))
49
+ ax.set_title(wrapped_caption, fontsize=10, pad=20)
61
50
 
51
+ plt.tight_layout()
62
52
  plt.show()
63
53
  ```
64
54
 
@@ -30,6 +30,7 @@ dependencies = [
30
30
  "pyarrow",
31
31
  "typing-extensions",
32
32
  "python-dateutil>=2",
33
+ "dateparser>=1.0.0",
33
34
  "attrs>=21.3.0",
34
35
  "fsspec>=2024.2.0",
35
36
  "s3fs>=2024.2.0",
@@ -116,6 +117,7 @@ dev = [
116
117
  "datachain[docs,tests]",
117
118
  "mypy==1.17.0",
118
119
  "types-python-dateutil",
120
+ "types-dateparser",
119
121
  "types-pytz",
120
122
  "types-PyYAML",
121
123
  "types-requests",
@@ -17,7 +17,12 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
17
17
  )
18
18
 
19
19
  studio_run_help = "Run a job in Studio"
20
- studio_run_description = "Run a job in Studio."
20
+ studio_run_description = "Run a job in Studio. \n"
21
+ studio_run_description += (
22
+ "When using --start-time or --cron,"
23
+ " the job is scheduled to run but won't start immediately"
24
+ " (can be seen in the Tasks tab in UI)"
25
+ )
21
26
 
22
27
  studio_run_parser = jobs_subparser.add_parser(
23
28
  "run",
@@ -96,6 +101,14 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
96
101
  help="Priority for the job in range 0-5. "
97
102
  "Lower value is higher priority (default: 5)",
98
103
  )
104
+ studio_run_parser.add_argument(
105
+ "--start-time",
106
+ action="store",
107
+ help="Time to schedule a task in YYYY-MM-DDTHH:mm format or natural language.",
108
+ )
109
+ studio_run_parser.add_argument(
110
+ "--cron", action="store", help="Cron expression for the cron task."
111
+ )
99
112
 
100
113
  studio_ls_help = "List jobs in Studio"
101
114
  studio_ls_description = "List jobs in Studio."
@@ -245,7 +245,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
245
245
  if field.nullable and not ModelStore.is_pydantic(dtype):
246
246
  dtype = Optional[dtype] # type: ignore[assignment]
247
247
  type_dict[field.name] = dtype
248
- return dict_to_data_model(column, type_dict)
248
+ return dict_to_data_model(f"ArrowDataModel_{column}", type_dict)
249
249
  if pa.types.is_map(col_type):
250
250
  return dict
251
251
  if isinstance(col_type, pa.lib.DictionaryType):
@@ -0,0 +1,244 @@
1
+ import posixpath
2
+ from typing import TYPE_CHECKING, Optional, Union
3
+
4
+ from datachain.lib.file import FileError
5
+
6
+ if TYPE_CHECKING:
7
+ from numpy import ndarray
8
+
9
+ from datachain.lib.file import Audio, AudioFile, File
10
+
11
+ try:
12
+ import torchaudio
13
+ except ImportError as exc:
14
+ raise ImportError(
15
+ "Missing dependencies for processing audio.\n"
16
+ "To install run:\n\n"
17
+ " pip install 'datachain[audio]'\n"
18
+ ) from exc
19
+
20
+
21
+ def audio_info(file: "Union[File, AudioFile]") -> "Audio":
22
+ """Extract metadata like sample rate, channels, duration, and format."""
23
+ from datachain.lib.file import Audio
24
+
25
+ file = file.as_audio_file()
26
+
27
+ try:
28
+ with file.open() as f:
29
+ info = torchaudio.info(f)
30
+
31
+ sample_rate = int(info.sample_rate)
32
+ channels = int(info.num_channels)
33
+ frames = int(info.num_frames)
34
+ duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
35
+
36
+ codec_name = getattr(info, "encoding", "")
37
+ file_ext = file.get_file_ext().lower()
38
+ format_name = _encoding_to_format(codec_name, file_ext)
39
+
40
+ bits_per_sample = getattr(info, "bits_per_sample", 0)
41
+ bit_rate = (
42
+ bits_per_sample * sample_rate * channels if bits_per_sample > 0 else -1
43
+ )
44
+
45
+ except Exception as exc:
46
+ raise FileError(
47
+ "unable to extract metadata from audio file", file.source, file.path
48
+ ) from exc
49
+
50
+ return Audio(
51
+ sample_rate=sample_rate,
52
+ channels=channels,
53
+ duration=duration,
54
+ samples=frames,
55
+ format=format_name,
56
+ codec=codec_name,
57
+ bit_rate=bit_rate,
58
+ )
59
+
60
+
61
+ def _encoding_to_format(encoding: str, file_ext: str) -> str:
62
+ """
63
+ Map torchaudio encoding to a format name.
64
+
65
+ Args:
66
+ encoding: The encoding string from torchaudio.info()
67
+ file_ext: The file extension as a fallback
68
+
69
+ Returns:
70
+ Format name as a string
71
+ """
72
+ # Direct mapping for formats that match exactly
73
+ encoding_map = {
74
+ "FLAC": "flac",
75
+ "MP3": "mp3",
76
+ "VORBIS": "ogg",
77
+ "AMR_WB": "amr",
78
+ "AMR_NB": "amr",
79
+ "OPUS": "opus",
80
+ "GSM": "gsm",
81
+ }
82
+
83
+ if encoding in encoding_map:
84
+ return encoding_map[encoding]
85
+
86
+ # For PCM variants, use file extension to determine format
87
+ if encoding.startswith("PCM_"):
88
+ # Common PCM formats by extension
89
+ pcm_formats = {
90
+ "wav": "wav",
91
+ "aiff": "aiff",
92
+ "au": "au",
93
+ "raw": "raw",
94
+ }
95
+ return pcm_formats.get(file_ext, "wav") # Default to wav for PCM
96
+
97
+ # Fallback to file extension if encoding is unknown
98
+ return file_ext if file_ext else "unknown"
99
+
100
+
101
+ def audio_to_np(
102
+ audio: "AudioFile", start: float = 0, duration: Optional[float] = None
103
+ ) -> "tuple[ndarray, int]":
104
+ """Load audio fragment as numpy array.
105
+ Multi-channel audio is transposed to (samples, channels)."""
106
+ if start < 0:
107
+ raise ValueError("start must be a non-negative float")
108
+
109
+ if duration is not None and duration <= 0:
110
+ raise ValueError("duration must be a positive float")
111
+
112
+ if hasattr(audio, "as_audio_file"):
113
+ audio = audio.as_audio_file()
114
+
115
+ try:
116
+ with audio.open() as f:
117
+ info = torchaudio.info(f)
118
+ sample_rate = info.sample_rate
119
+
120
+ frame_offset = int(start * sample_rate)
121
+ num_frames = int(duration * sample_rate) if duration is not None else -1
122
+
123
+ # Reset file pointer to the beginning
124
+ # This is important to ensure we read from the correct position later
125
+ f.seek(0)
126
+
127
+ waveform, sr = torchaudio.load(
128
+ f, frame_offset=frame_offset, num_frames=num_frames
129
+ )
130
+
131
+ audio_np = waveform.numpy()
132
+
133
+ if audio_np.shape[0] > 1:
134
+ audio_np = audio_np.T
135
+ else:
136
+ audio_np = audio_np.squeeze()
137
+
138
+ return audio_np, int(sr)
139
+ except Exception as exc:
140
+ raise FileError(
141
+ "unable to read audio fragment", audio.source, audio.path
142
+ ) from exc
143
+
144
+
145
+ def audio_to_bytes(
146
+ audio: "AudioFile",
147
+ format: str = "wav",
148
+ start: float = 0,
149
+ duration: Optional[float] = None,
150
+ ) -> bytes:
151
+ """Convert audio to bytes using soundfile.
152
+
153
+ If duration is None, converts from start to end of file.
154
+ If start is 0 and duration is None, converts entire file."""
155
+ y, sr = audio_to_np(audio, start, duration)
156
+
157
+ import io
158
+
159
+ import soundfile as sf
160
+
161
+ buffer = io.BytesIO()
162
+ sf.write(buffer, y, sr, format=format)
163
+ return buffer.getvalue()
164
+
165
+
166
+ def save_audio(
167
+ audio: "AudioFile",
168
+ output: str,
169
+ format: Optional[str] = None,
170
+ start: float = 0,
171
+ end: Optional[float] = None,
172
+ ) -> "AudioFile":
173
+ """Save audio file or extract fragment to specified format.
174
+
175
+ Args:
176
+ audio: Source AudioFile object
177
+ output: Output directory path
178
+ format: Output format ('wav', 'mp3', etc). Defaults to source format
179
+ start: Start time in seconds (>= 0). Defaults to 0
180
+ end: End time in seconds. If None, extracts to end of file
181
+
182
+ Returns:
183
+ AudioFile: New audio file with format conversion/extraction applied
184
+
185
+ Examples:
186
+ save_audio(audio, "/path", "mp3") # Entire file to MP3
187
+ save_audio(audio, "s3://bucket/path", "wav", start=2.5) # From 2.5s to end
188
+ save_audio(audio, "/path", "flac", start=1, end=3) # Extract 1-3s fragment
189
+ """
190
+ if format is None:
191
+ format = audio.get_file_ext()
192
+
193
+ # Validate start time
194
+ if start < 0:
195
+ raise ValueError(
196
+ f"Can't save audio for '{audio.path}', "
197
+ f"start time must be non-negative: {start:.3f}"
198
+ )
199
+
200
+ # Handle full file conversion when end is None and start is 0
201
+ if end is None and start == 0:
202
+ output_file = posixpath.join(output, f"{audio.get_file_stem()}.{format}")
203
+ try:
204
+ audio_bytes = audio_to_bytes(audio, format, start=0, duration=None)
205
+ except Exception as exc:
206
+ raise FileError(
207
+ "unable to convert audio file", audio.source, audio.path
208
+ ) from exc
209
+ elif end is None:
210
+ # Extract from start to end of file
211
+ output_file = posixpath.join(
212
+ output, f"{audio.get_file_stem()}_{int(start * 1000):06d}_end.{format}"
213
+ )
214
+ try:
215
+ audio_bytes = audio_to_bytes(audio, format, start=start, duration=None)
216
+ except Exception as exc:
217
+ raise FileError(
218
+ "unable to save audio fragment", audio.source, audio.path
219
+ ) from exc
220
+ else:
221
+ # Fragment extraction mode with specific end time
222
+ if end < 0 or start >= end:
223
+ raise ValueError(
224
+ f"Can't save audio for '{audio.path}', "
225
+ f"invalid time range: ({start:.3f}, {end:.3f})"
226
+ )
227
+
228
+ duration = end - start
229
+ start_ms = int(start * 1000)
230
+ end_ms = int(end * 1000)
231
+ output_file = posixpath.join(
232
+ output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
233
+ )
234
+
235
+ try:
236
+ audio_bytes = audio_to_bytes(audio, format, start, duration)
237
+ except Exception as exc:
238
+ raise FileError(
239
+ "unable to save audio fragment", audio.source, audio.path
240
+ ) from exc
241
+
242
+ from datachain.lib.file import AudioFile
243
+
244
+ return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
@@ -1,3 +1,5 @@
1
+ import inspect
2
+ import uuid
1
3
  from collections.abc import Sequence
2
4
  from datetime import datetime
3
5
  from typing import ClassVar, Optional, Union, get_args, get_origin
@@ -80,7 +82,9 @@ def dict_to_data_model(
80
82
 
81
83
  fields = {
82
84
  name: (
83
- anno,
85
+ anno
86
+ if inspect.isclass(anno) and issubclass(anno, BaseModel)
87
+ else Optional[anno],
84
88
  Field(
85
89
  validation_alias=AliasChoices(name, original_names[idx] or name),
86
90
  default=None,
@@ -101,6 +105,10 @@ def dict_to_data_model(
101
105
  field_info[str(alias)] = (_name, field)
102
106
  return field_info
103
107
 
108
+ # Generate random unique name if not provided
109
+ if not name:
110
+ name = f"DataModel_{uuid.uuid4().hex[:8]}"
111
+
104
112
  return create_model(
105
113
  name,
106
114
  __base__=_DataModelStrict,
@@ -2388,7 +2388,7 @@ class DataChain:
2388
2388
  placement: FileExportPlacement = "fullpath",
2389
2389
  link_type: Literal["copy", "symlink"] = "copy",
2390
2390
  num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
2391
- anon: bool = False,
2391
+ anon: Optional[bool] = None,
2392
2392
  client_config: Optional[dict] = None,
2393
2393
  ) -> None:
2394
2394
  """Export files from a specified signal to a directory. Files can be
@@ -2403,7 +2403,11 @@ class DataChain:
2403
2403
  Falls back to `'copy'` if symlinking fails.
2404
2404
  num_threads : number of threads to use for exporting files.
2405
2405
  By default it uses 5 threads.
2406
- anon: If true, we will treat cloud bucket as public one
2406
+ anon: If True, we will treat cloud bucket as public one. Default behavior
2407
+ depends on the previous session configuration (e.g. happens in the
2408
+ initial `read_storage`) and particular cloud storage client
2409
+ implementation (e.g. S3 falls back to anonymous access if no credentials
2410
+ were found).
2407
2411
  client_config: Optional configuration for the destination storage client
2408
2412
 
2409
2413
  Example:
@@ -2421,8 +2425,8 @@ class DataChain:
2421
2425
  ):
2422
2426
  raise ValueError("Files with the same name found")
2423
2427
 
2424
- if anon:
2425
- client_config = (client_config or {}) | {"anon": True}
2428
+ if anon is not None:
2429
+ client_config = (client_config or {}) | {"anon": anon}
2426
2430
 
2427
2431
  progress_bar = tqdm(
2428
2432
  desc=f"Exporting files to {output}: ",
@@ -25,19 +25,23 @@ def read_hf(
25
25
  settings: Optional[dict] = None,
26
26
  column: str = "",
27
27
  model_name: str = "",
28
+ limit: int = 0,
28
29
  **kwargs,
29
30
  ) -> "DataChain":
30
- """Generate chain from huggingface hub dataset.
31
+ """Generate chain from Hugging Face Hub dataset.
31
32
 
32
33
  Parameters:
33
34
  dataset : Path or name of the dataset to read from Hugging Face Hub,
34
35
  or an instance of `datasets.Dataset`-like object.
35
- args : Additional positional arguments to pass to datasets.load_dataset.
36
+ args : Additional positional arguments to pass to `datasets.load_dataset`.
36
37
  session : Session to use for the chain.
37
38
  settings : Settings to use for the chain.
38
39
  column : Generated object column name.
39
40
  model_name : Generated model name.
40
- kwargs : Parameters to pass to datasets.load_dataset.
41
+ limit : Limit the number of items to read from the HF dataset.
42
+ Adds `take(limit)` to the `datasets.load_dataset`.
43
+ Defaults to 0 (no limit).
44
+ kwargs : Parameters to pass to `datasets.load_dataset`.
41
45
 
42
46
  Example:
43
47
  Load from Hugging Face Hub:
@@ -53,6 +57,18 @@ def read_hf(
53
57
  import datachain as dc
54
58
  chain = dc.read_hf(ds)
55
59
  ```
60
+
61
+ Streaming with limit, for large datasets:
62
+ ```py
63
+ import datachain as dc
64
+ ds = dc.read_hf("beans", split="train", streaming=True, limit=10)
65
+ ```
66
+
67
+ or use HF split syntax (not supported if streaming is enabled):
68
+ ```py
69
+ import datachain as dc
70
+ ds = dc.read_hf("beans", split="train[:10%]")
71
+ ```
56
72
  """
57
73
  from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
58
74
 
@@ -72,4 +88,4 @@ def read_hf(
72
88
  output = {column: model}
73
89
 
74
90
  chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
75
- return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
91
+ return chain.gen(HFGenerator(dataset, model, limit, *args, **kwargs), output=output)