datachain 0.26.4__tar.gz → 0.28.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (407)
  1. {datachain-0.26.4 → datachain-0.28.0}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.26.4 → datachain-0.28.0}/PKG-INFO +6 -3
  3. {datachain-0.26.4 → datachain-0.28.0}/README.rst +4 -1
  4. {datachain-0.26.4 → datachain-0.28.0}/docs/commands/job/run.md +20 -6
  5. {datachain-0.26.4 → datachain-0.28.0}/docs/examples.md +21 -31
  6. {datachain-0.26.4 → datachain-0.28.0}/mkdocs.yml +1 -1
  7. {datachain-0.26.4 → datachain-0.28.0}/pyproject.toml +1 -1
  8. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/parser/job.py +8 -3
  9. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/job.py +2 -1
  10. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/arrow.py +1 -1
  11. datachain-0.28.0/src/datachain/lib/audio.py +244 -0
  12. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/data_model.py +9 -1
  13. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/hf.py +20 -4
  14. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/file.py +43 -8
  15. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/hf.py +17 -7
  16. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/video.py +4 -1
  17. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/studio.py +42 -27
  18. {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/PKG-INFO +6 -3
  19. {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/requires.txt +1 -1
  20. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_audio.py +3 -2
  21. datachain-0.28.0/tests/func/test_hf.py +142 -0
  22. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_studio_datetime_parsing.py +1 -1
  23. {datachain-0.26.4 → datachain-0.28.0}/tests/test_cli_studio.py +1 -1
  24. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_audio.py +153 -34
  25. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_datachain.py +0 -18
  26. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_hf.py +3 -1
  27. datachain-0.26.4/src/datachain/lib/audio.py +0 -151
  28. datachain-0.26.4/tests/func/test_hf.py +0 -67
  29. {datachain-0.26.4 → datachain-0.28.0}/.cruft.json +0 -0
  30. {datachain-0.26.4 → datachain-0.28.0}/.gitattributes +0 -0
  31. {datachain-0.26.4 → datachain-0.28.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  32. {datachain-0.26.4 → datachain-0.28.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  33. {datachain-0.26.4 → datachain-0.28.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  34. {datachain-0.26.4 → datachain-0.28.0}/.github/codecov.yaml +0 -0
  35. {datachain-0.26.4 → datachain-0.28.0}/.github/dependabot.yml +0 -0
  36. {datachain-0.26.4 → datachain-0.28.0}/.github/workflows/benchmarks.yml +0 -0
  37. {datachain-0.26.4 → datachain-0.28.0}/.github/workflows/release.yml +0 -0
  38. {datachain-0.26.4 → datachain-0.28.0}/.github/workflows/tests-studio.yml +0 -0
  39. {datachain-0.26.4 → datachain-0.28.0}/.github/workflows/tests.yml +0 -0
  40. {datachain-0.26.4 → datachain-0.28.0}/.github/workflows/update-template.yaml +0 -0
  41. {datachain-0.26.4 → datachain-0.28.0}/.gitignore +0 -0
  42. {datachain-0.26.4 → datachain-0.28.0}/CODE_OF_CONDUCT.rst +0 -0
  43. {datachain-0.26.4 → datachain-0.28.0}/LICENSE +0 -0
  44. {datachain-0.26.4 → datachain-0.28.0}/docs/assets/captioned_cartoons.png +0 -0
  45. {datachain-0.26.4 → datachain-0.28.0}/docs/assets/datachain-white.svg +0 -0
  46. {datachain-0.26.4 → datachain-0.28.0}/docs/assets/datachain.svg +0 -0
  47. {datachain-0.26.4 → datachain-0.28.0}/docs/commands/auth/login.md +0 -0
  48. {datachain-0.26.4 → datachain-0.28.0}/docs/commands/auth/logout.md +0 -0
  49. {datachain-0.26.4 → datachain-0.28.0}/docs/commands/auth/team.md +0 -0
  50. {datachain-0.26.4 → datachain-0.28.0}/docs/commands/auth/token.md +0 -0
  51. {datachain-0.26.4 → datachain-0.28.0}/docs/commands/index.md +0 -0
  52. {datachain-0.26.4 → datachain-0.28.0}/docs/commands/job/cancel.md +0 -0
  53. {datachain-0.26.4 → datachain-0.28.0}/docs/commands/job/clusters.md +0 -0
  54. {datachain-0.26.4 → datachain-0.28.0}/docs/commands/job/logs.md +0 -0
  55. {datachain-0.26.4 → datachain-0.28.0}/docs/commands/job/ls.md +0 -0
  56. {datachain-0.26.4 → datachain-0.28.0}/docs/contributing.md +0 -0
  57. {datachain-0.26.4 → datachain-0.28.0}/docs/css/github-permalink-style.css +0 -0
  58. {datachain-0.26.4 → datachain-0.28.0}/docs/guide/db_migrations.md +0 -0
  59. {datachain-0.26.4 → datachain-0.28.0}/docs/guide/delta.md +0 -0
  60. {datachain-0.26.4 → datachain-0.28.0}/docs/guide/env.md +0 -0
  61. {datachain-0.26.4 → datachain-0.28.0}/docs/guide/index.md +0 -0
  62. {datachain-0.26.4 → datachain-0.28.0}/docs/guide/namespaces.md +0 -0
  63. {datachain-0.26.4 → datachain-0.28.0}/docs/guide/processing.md +0 -0
  64. {datachain-0.26.4 → datachain-0.28.0}/docs/guide/remotes.md +0 -0
  65. {datachain-0.26.4 → datachain-0.28.0}/docs/guide/retry.md +0 -0
  66. {datachain-0.26.4 → datachain-0.28.0}/docs/index.md +0 -0
  67. {datachain-0.26.4 → datachain-0.28.0}/docs/overrides/main.html +0 -0
  68. {datachain-0.26.4 → datachain-0.28.0}/docs/quick-start.md +0 -0
  69. {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/arrowrow.md +0 -0
  70. {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/bbox.md +0 -0
  71. {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/file.md +0 -0
  72. {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/imagefile.md +0 -0
  73. {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/index.md +0 -0
  74. {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/pose.md +0 -0
  75. {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/segment.md +0 -0
  76. {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/tarvfile.md +0 -0
  77. {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/textfile.md +0 -0
  78. {datachain-0.26.4 → datachain-0.28.0}/docs/references/data-types/videofile.md +0 -0
  79. {datachain-0.26.4 → datachain-0.28.0}/docs/references/datachain.md +0 -0
  80. {datachain-0.26.4 → datachain-0.28.0}/docs/references/func.md +0 -0
  81. {datachain-0.26.4 → datachain-0.28.0}/docs/references/index.md +0 -0
  82. {datachain-0.26.4 → datachain-0.28.0}/docs/references/toolkit.md +0 -0
  83. {datachain-0.26.4 → datachain-0.28.0}/docs/references/torch.md +0 -0
  84. {datachain-0.26.4 → datachain-0.28.0}/docs/references/udf.md +0 -0
  85. {datachain-0.26.4 → datachain-0.28.0}/docs/tutorials.md +0 -0
  86. {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  87. {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  88. {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/openimage-detect.py +0 -0
  89. {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  90. {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  91. {datachain-0.26.4 → datachain-0.28.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  92. {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/common_sql_functions.py +0 -0
  93. {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/json-csv-reader.py +0 -0
  94. {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/torch-loader.py +0 -0
  95. {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/udfs/parallel.py +0 -0
  96. {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/udfs/simple.py +0 -0
  97. {datachain-0.26.4 → datachain-0.28.0}/examples/get_started/udfs/stateful.py +0 -0
  98. {datachain-0.26.4 → datachain-0.28.0}/examples/incremental_processing/delta.py +0 -0
  99. {datachain-0.26.4 → datachain-0.28.0}/examples/incremental_processing/retry.py +0 -0
  100. {datachain-0.26.4 → datachain-0.28.0}/examples/incremental_processing/utils.py +0 -0
  101. {datachain-0.26.4 → datachain-0.28.0}/examples/llm_and_nlp/claude-query.py +0 -0
  102. {datachain-0.26.4 → datachain-0.28.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  103. {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/audio-to-text.py +0 -0
  104. {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/clip_inference.py +0 -0
  105. {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/hf_pipeline.py +0 -0
  106. {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  107. {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/wds.py +0 -0
  108. {datachain-0.26.4 → datachain-0.28.0}/examples/multimodal/wds_filtered.py +0 -0
  109. {datachain-0.26.4 → datachain-0.28.0}/noxfile.py +0 -0
  110. {datachain-0.26.4 → datachain-0.28.0}/setup.cfg +0 -0
  111. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/__init__.py +0 -0
  112. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/__main__.py +0 -0
  113. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/asyn.py +0 -0
  114. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cache.py +0 -0
  115. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/catalog/__init__.py +0 -0
  116. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/catalog/catalog.py +0 -0
  117. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/catalog/datasource.py +0 -0
  118. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/catalog/loader.py +0 -0
  119. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/__init__.py +0 -0
  120. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/__init__.py +0 -0
  121. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/datasets.py +0 -0
  122. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/du.py +0 -0
  123. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/index.py +0 -0
  124. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/ls.py +0 -0
  125. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/misc.py +0 -0
  126. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/query.py +0 -0
  127. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/commands/show.py +0 -0
  128. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/parser/__init__.py +0 -0
  129. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/parser/studio.py +0 -0
  130. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/parser/utils.py +0 -0
  131. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/cli/utils.py +0 -0
  132. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/__init__.py +0 -0
  133. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/azure.py +0 -0
  134. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/fileslice.py +0 -0
  135. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/fsspec.py +0 -0
  136. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/gcs.py +0 -0
  137. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/hf.py +0 -0
  138. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/local.py +0 -0
  139. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/client/s3.py +0 -0
  140. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/config.py +0 -0
  141. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/__init__.py +0 -0
  142. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/db_engine.py +0 -0
  143. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/metastore.py +0 -0
  144. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/schema.py +0 -0
  145. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/serializer.py +0 -0
  146. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/sqlite.py +0 -0
  147. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/data_storage/warehouse.py +0 -0
  148. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/dataset.py +0 -0
  149. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/delta.py +0 -0
  150. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/diff/__init__.py +0 -0
  151. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/error.py +0 -0
  152. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/fs/__init__.py +0 -0
  153. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/fs/reference.py +0 -0
  154. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/fs/utils.py +0 -0
  155. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/__init__.py +0 -0
  156. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/aggregate.py +0 -0
  157. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/array.py +0 -0
  158. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/base.py +0 -0
  159. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/conditional.py +0 -0
  160. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/func.py +0 -0
  161. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/numeric.py +0 -0
  162. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/path.py +0 -0
  163. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/random.py +0 -0
  164. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/string.py +0 -0
  165. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/func/window.py +0 -0
  166. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/job.py +0 -0
  167. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/__init__.py +0 -0
  168. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/clip.py +0 -0
  169. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/__init__.py +0 -0
  170. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/flatten.py +0 -0
  171. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  172. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  173. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/unflatten.py +0 -0
  174. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  175. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dataset_info.py +0 -0
  176. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/__init__.py +0 -0
  177. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/csv.py +0 -0
  178. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/database.py +0 -0
  179. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/datachain.py +0 -0
  180. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/datasets.py +0 -0
  181. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/json.py +0 -0
  182. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/listings.py +0 -0
  183. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/pandas.py +0 -0
  184. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/parquet.py +0 -0
  185. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/records.py +0 -0
  186. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/storage.py +0 -0
  187. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/utils.py +0 -0
  188. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/dc/values.py +0 -0
  189. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/image.py +0 -0
  190. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/listing.py +0 -0
  191. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/listing_info.py +0 -0
  192. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/meta_formats.py +0 -0
  193. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/model_store.py +0 -0
  194. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/namespaces.py +0 -0
  195. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/projects.py +0 -0
  196. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/pytorch.py +0 -0
  197. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/settings.py +0 -0
  198. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/signal_schema.py +0 -0
  199. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/tar.py +0 -0
  200. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/text.py +0 -0
  201. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/udf.py +0 -0
  202. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/udf_signature.py +0 -0
  203. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/utils.py +0 -0
  204. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/webdataset.py +0 -0
  205. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/lib/webdataset_laion.py +0 -0
  206. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/listing.py +0 -0
  207. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/__init__.py +0 -0
  208. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/bbox.py +0 -0
  209. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/pose.py +0 -0
  210. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/segment.py +0 -0
  211. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  212. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  213. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/ultralytics/pose.py +0 -0
  214. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/ultralytics/segment.py +0 -0
  215. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/model/utils.py +0 -0
  216. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/namespace.py +0 -0
  217. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/node.py +0 -0
  218. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/nodes_fetcher.py +0 -0
  219. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/nodes_thread_pool.py +0 -0
  220. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/progress.py +0 -0
  221. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/project.py +0 -0
  222. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/py.typed +0 -0
  223. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/__init__.py +0 -0
  224. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/batch.py +0 -0
  225. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/dataset.py +0 -0
  226. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/dispatch.py +0 -0
  227. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/metrics.py +0 -0
  228. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/params.py +0 -0
  229. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/queue.py +0 -0
  230. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/schema.py +0 -0
  231. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/session.py +0 -0
  232. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/udf.py +0 -0
  233. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/query/utils.py +0 -0
  234. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/remote/__init__.py +0 -0
  235. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/remote/studio.py +0 -0
  236. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/script_meta.py +0 -0
  237. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/semver.py +0 -0
  238. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/__init__.py +0 -0
  239. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/default/__init__.py +0 -0
  240. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/default/base.py +0 -0
  241. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/__init__.py +0 -0
  242. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/aggregate.py +0 -0
  243. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/array.py +0 -0
  244. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/conditional.py +0 -0
  245. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/numeric.py +0 -0
  246. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/path.py +0 -0
  247. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/random.py +0 -0
  248. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/functions/string.py +0 -0
  249. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/selectable.py +0 -0
  250. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  251. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/sqlite/base.py +0 -0
  252. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/sqlite/types.py +0 -0
  253. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/sqlite/vector.py +0 -0
  254. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/types.py +0 -0
  255. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/sql/utils.py +0 -0
  256. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/telemetry.py +0 -0
  257. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/toolkit/__init__.py +0 -0
  258. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/toolkit/split.py +0 -0
  259. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/torch/__init__.py +0 -0
  260. {datachain-0.26.4 → datachain-0.28.0}/src/datachain/utils.py +0 -0
  261. {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/SOURCES.txt +0 -0
  262. {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  263. {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/entry_points.txt +0 -0
  264. {datachain-0.26.4 → datachain-0.28.0}/src/datachain.egg-info/top_level.txt +0 -0
  265. {datachain-0.26.4 → datachain-0.28.0}/tests/__init__.py +0 -0
  266. {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/__init__.py +0 -0
  267. {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/conftest.py +0 -0
  268. {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  269. {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  270. {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/datasets/.gitignore +0 -0
  271. {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  272. {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/test_datachain.py +0 -0
  273. {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/test_ls.py +0 -0
  274. {datachain-0.26.4 → datachain-0.28.0}/tests/benchmarks/test_version.py +0 -0
  275. {datachain-0.26.4 → datachain-0.28.0}/tests/conftest.py +0 -0
  276. {datachain-0.26.4 → datachain-0.28.0}/tests/data.py +0 -0
  277. {datachain-0.26.4 → datachain-0.28.0}/tests/examples/__init__.py +0 -0
  278. {datachain-0.26.4 → datachain-0.28.0}/tests/examples/test_examples.py +0 -0
  279. {datachain-0.26.4 → datachain-0.28.0}/tests/examples/test_wds_e2e.py +0 -0
  280. {datachain-0.26.4 → datachain-0.28.0}/tests/examples/wds_data.py +0 -0
  281. {datachain-0.26.4 → datachain-0.28.0}/tests/func/__init__.py +0 -0
  282. {datachain-0.26.4 → datachain-0.28.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  283. {datachain-0.26.4 → datachain-0.28.0}/tests/func/data/lena.jpg +0 -0
  284. {datachain-0.26.4 → datachain-0.28.0}/tests/func/fake-service-account-credentials.json +0 -0
  285. {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/__init__.py +0 -0
  286. {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_aggregate.py +0 -0
  287. {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_array.py +0 -0
  288. {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_conditional.py +0 -0
  289. {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_numeric.py +0 -0
  290. {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_path.py +0 -0
  291. {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_random.py +0 -0
  292. {datachain-0.26.4 → datachain-0.28.0}/tests/func/functions/test_string.py +0 -0
  293. {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/__init__.py +0 -0
  294. {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/data/running-mask0.png +0 -0
  295. {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/data/running-mask1.png +0 -0
  296. {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/data/running.jpg +0 -0
  297. {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/data/ships.jpg +0 -0
  298. {datachain-0.26.4 → datachain-0.28.0}/tests/func/model/test_yolo.py +0 -0
  299. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_batching.py +0 -0
  300. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_catalog.py +0 -0
  301. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_client.py +0 -0
  302. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_cloud_transfer.py +0 -0
  303. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_data_storage.py +0 -0
  304. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_datachain.py +0 -0
  305. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_datachain_merge.py +0 -0
  306. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_dataset_query.py +0 -0
  307. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_datasets.py +0 -0
  308. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_delta.py +0 -0
  309. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_feature_pickling.py +0 -0
  310. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_file.py +0 -0
  311. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_hidden_field.py +0 -0
  312. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_image.py +0 -0
  313. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_listing.py +0 -0
  314. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_ls.py +0 -0
  315. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_meta_formats.py +0 -0
  316. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_metastore.py +0 -0
  317. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_metrics.py +0 -0
  318. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_pull.py +0 -0
  319. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_pytorch.py +0 -0
  320. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_query.py +0 -0
  321. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_read_database.py +0 -0
  322. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_read_dataset_remote.py +0 -0
  323. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  324. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_retry.py +0 -0
  325. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_session.py +0 -0
  326. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_toolkit.py +0 -0
  327. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_video.py +0 -0
  328. {datachain-0.26.4 → datachain-0.28.0}/tests/func/test_warehouse.py +0 -0
  329. {datachain-0.26.4 → datachain-0.28.0}/tests/scripts/feature_class.py +0 -0
  330. {datachain-0.26.4 → datachain-0.28.0}/tests/scripts/feature_class_exception.py +0 -0
  331. {datachain-0.26.4 → datachain-0.28.0}/tests/scripts/feature_class_parallel.py +0 -0
  332. {datachain-0.26.4 → datachain-0.28.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  333. {datachain-0.26.4 → datachain-0.28.0}/tests/scripts/name_len_slow.py +0 -0
  334. {datachain-0.26.4 → datachain-0.28.0}/tests/test_atomicity.py +0 -0
  335. {datachain-0.26.4 → datachain-0.28.0}/tests/test_cli_e2e.py +0 -0
  336. {datachain-0.26.4 → datachain-0.28.0}/tests/test_import_time.py +0 -0
  337. {datachain-0.26.4 → datachain-0.28.0}/tests/test_query_e2e.py +0 -0
  338. {datachain-0.26.4 → datachain-0.28.0}/tests/test_telemetry.py +0 -0
  339. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/__init__.py +0 -0
  340. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/__init__.py +0 -0
  341. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/conftest.py +0 -0
  342. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_arrow.py +0 -0
  343. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_clip.py +0 -0
  344. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  345. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  346. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_diff.py +0 -0
  347. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_feature.py +0 -0
  348. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_feature_utils.py +0 -0
  349. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_file.py +0 -0
  350. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_image.py +0 -0
  351. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_listing_info.py +0 -0
  352. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_namespace.py +0 -0
  353. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_partition_by.py +0 -0
  354. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_project.py +0 -0
  355. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  356. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_schema.py +0 -0
  357. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_signal_schema.py +0 -0
  358. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  359. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_text.py +0 -0
  360. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_udf.py +0 -0
  361. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_udf_signature.py +0 -0
  362. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_utils.py +0 -0
  363. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/lib/test_webdataset.py +0 -0
  364. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/model/__init__.py +0 -0
  365. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/model/test_bbox.py +0 -0
  366. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/model/test_pose.py +0 -0
  367. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/model/test_segment.py +0 -0
  368. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/model/test_utils.py +0 -0
  369. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/__init__.py +0 -0
  370. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  371. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  372. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  373. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_array.py +0 -0
  374. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_conditional.py +0 -0
  375. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_path.py +0 -0
  376. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_random.py +0 -0
  377. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_selectable.py +0 -0
  378. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/sql/test_string.py +0 -0
  379. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_asyn.py +0 -0
  380. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_cache.py +0 -0
  381. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_catalog.py +0 -0
  382. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_catalog_loader.py +0 -0
  383. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_cli_parsing.py +0 -0
  384. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_client.py +0 -0
  385. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_client_gcs.py +0 -0
  386. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_client_s3.py +0 -0
  387. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_config.py +0 -0
  388. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_data_storage.py +0 -0
  389. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_database_engine.py +0 -0
  390. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_dataset.py +0 -0
  391. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_dispatch.py +0 -0
  392. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_fileslice.py +0 -0
  393. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_func.py +0 -0
  394. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_listing.py +0 -0
  395. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_metastore.py +0 -0
  396. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_module_exports.py +0 -0
  397. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_pytorch.py +0 -0
  398. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_query.py +0 -0
  399. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_query_metrics.py +0 -0
  400. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_query_params.py +0 -0
  401. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_script_meta.py +0 -0
  402. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_semver.py +0 -0
  403. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_serializer.py +0 -0
  404. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_session.py +0 -0
  405. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_utils.py +0 -0
  406. {datachain-0.26.4 → datachain-0.28.0}/tests/unit/test_warehouse.py +0 -0
  407. {datachain-0.26.4 → datachain-0.28.0}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.12.3'
27
+ rev: 'v0.12.4'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.26.4
3
+ Version: 0.28.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -45,7 +45,7 @@ Requires-Dist: datamodel-code-generator>=0.25
45
45
  Requires-Dist: Pillow<12,>=10.0.0
46
46
  Requires-Dist: msgpack<2,>=1.0.4
47
47
  Requires-Dist: psutil
48
- Requires-Dist: huggingface_hub
48
+ Requires-Dist: huggingface_hub<0.34.0
49
49
  Requires-Dist: iterative-telemetry>=0.0.10
50
50
  Requires-Dist: platformdirs
51
51
  Requires-Dist: dvc-studio-client<1,>=0.21
@@ -120,7 +120,7 @@ Dynamic: license-file
120
120
  |logo| DataChain
121
121
  ================
122
122
 
123
- |PyPI| |Python Version| |Codecov| |Tests|
123
+ |PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
124
124
 
125
125
  .. |logo| image:: docs/assets/datachain.svg
126
126
  :height: 24
@@ -136,6 +136,9 @@ Dynamic: license-file
136
136
  .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
137
137
  :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
138
138
  :alt: Tests
139
+ .. |DeepWiki| image:: https://deepwiki.com/badge.svg
140
+ :target: https://deepwiki.com/iterative/datachain
141
+ :alt: DeepWiki
139
142
 
140
143
  DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
141
144
  data like images, audio, videos, text and PDFs. It integrates with external storage
@@ -2,7 +2,7 @@
2
2
  |logo| DataChain
3
3
  ================
4
4
 
5
- |PyPI| |Python Version| |Codecov| |Tests|
5
+ |PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
6
6
 
7
7
  .. |logo| image:: docs/assets/datachain.svg
8
8
  :height: 24
@@ -18,6 +18,9 @@
18
18
  .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
19
19
  :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
20
20
  :alt: Tests
21
+ .. |DeepWiki| image:: https://deepwiki.com/badge.svg
22
+ :target: https://deepwiki.com/iterative/datachain
23
+ :alt: DeepWiki
21
24
 
22
25
  DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
23
26
  data like images, audio, videos, text and PDFs. It integrates with external storage
@@ -5,15 +5,22 @@ Run a job in Studio.
5
5
  ## Synopsis
6
6
 
7
7
  ```usage
8
- usage: datachain job run [-h] [-v] [-q] [--team TEAM] [--env-file ENV_FILE] [--env ENV [ENV ...]] [--cluster CLUSTER] [--workers WORKERS]
9
- [--files FILES [FILES ...]] [--python-version PYTHON_VERSION] [--repository REPOSITORY] [--req-file REQ_FILE] [--req REQ [REQ ...]]
10
- [--priority PRIORITY] [--start-time START_TIME] [--cron CRON]
8
+ usage: datachain job run [-h] [-v] [-q] [--team TEAM] [--env-file ENV_FILE]
9
+ [--env ENV [ENV ...]]
10
+ [--cluster CLUSTER] [--workers WORKERS]
11
+ [--files FILES [FILES ...]]
12
+ [--python-version PYTHON_VERSION]
13
+ [--repository REPOSITORY]
14
+ [--req-file REQ_FILE] [--req REQ [REQ ...]]
15
+ [--priority PRIORITY]
16
+ [--start-time START_TIME] [--cron CRON]
17
+ [--no-wait]
11
18
  file
12
19
  ```
13
20
 
14
21
  ## Description
15
22
 
16
- This command runs a job in Studio using the specified query file. You can configure various aspects of the job including environment variables, Python version, dependencies, and more. When using --start-time or --cron, the job is scheduled as a task and will not show logs immediately. The job will be executed according to the schedule.
23
+ This command runs a job in Studio using the specified query file. You can configure various aspects of the job including environment variables, Python version, dependencies, and more. When using --start-time or --cron, the job is scheduled as a task and won't start immediately; scheduled tasks can be seen in the Tasks tab in the UI.
17
24
 
18
25
  ## Arguments
19
26
 
@@ -32,8 +39,9 @@ This command runs a job in Studio using the specified query file. You can config
32
39
  * `--req-file REQ_FILE` - Python requirements file
33
40
  * `--req REQ` - Python package requirements
34
41
  * `--priority PRIORITY` - Priority for the job in range 0-5. Lower value is higher priority (default: 5)
35
- * `--start-time START_TIME` - Start time in ISO format or natural language for the cron task.
42
+ * `--start-time START_TIME` - Time to schedule the task in YYYY-MM-DDTHH:mm format or natural language.
36
43
  * `--cron CRON` - Cron expression for the cron task.
44
+ * `--no-wait` - Do not wait for the job to finish.
37
45
  * `-h`, `--help` - Show the help message and exit.
38
46
  * `-v`, `--verbose` - Be verbose.
39
47
  * `-q`, `--quiet` - Be quiet.
@@ -125,6 +133,12 @@ datachain job run --cron "@monthly" query.py
125
133
  datachain job run --start-time "tomorrow 3pm" --cron "0 0 * * *" query.py
126
134
  ```
127
135
 
136
+ 12. Start the job and do not wait for the job to complete
137
+ ```bash
138
+ # Do not follow or tail the logs from Studio.
139
+ datachain job run query.py --no-wait
140
+ ```
141
+
128
142
  ## Notes
129
143
 
130
144
  * Closing the logs command (e.g., with Ctrl+C) will only stop displaying the logs but will not cancel the job execution
@@ -132,7 +146,7 @@ datachain job run --start-time "tomorrow 3pm" --cron "0 0 * * *" query.py
132
146
  * The job will continue running in Studio even after you stop viewing the logs
133
147
  * You can get the list of compute clusters using `datachain job clusters` command.
134
148
  * When using `--start-time` or `--cron` options, the job is scheduled as a task and will not show logs immediately. The job will be executed according to the schedule.
135
- * The `--start-time` option supports natural language parsing using the dateparser library, allowing flexible time expressions like "tomorrow 3pm", "in 2 hours", "monday 9am", etc.
149
+ * The `--start-time` option supports natural language parsing using the [dateparser](https://dateparser.readthedocs.io/en/latest/) library, allowing flexible time expressions like "tomorrow 3pm", "in 2 hours", "monday 9am", etc.
136
150
  * Cron expressions follow the standard format: minute hour day-of-month month day-of-week (e.g., "0 0 * * *" for daily at midnight) or Vixie cron-style “@” keyword expressions.
137
151
  * Following options for Vixie cron-style expressions are supported:
138
152
  * @midnight
@@ -10,55 +10,45 @@ title: Examples
10
10
 
11
11
  Datachain is built by composing wrangling operations.
12
12
 
13
- For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The below code takes images from the cloud, and applies PaliGemma model to caption the first five of them and put the results in the column scene”:
13
+ For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The code below takes images from the cloud and applies the BLIP Large model to caption the first five of them, putting the results in the column "scene":
14
14
 
15
15
  ```python
16
16
  import datachain as dc # (1)!
17
- from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)!
17
+ from transformers import Pipeline, pipeline
18
+ from datachain import File
18
19
 
19
- images = dc.read_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
20
-
21
- model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-mix-224")
22
- processor = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
23
-
24
- def process(file: File) -> str:
25
- image=file.read().convert("RGB")
26
- inputs = processor(text="caption", images=image, return_tensors="pt")
27
- generate_ids = model.generate(**inputs, max_new_tokens=100)
28
- return processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
20
+ def process(file: File, pipeline: Pipeline) -> str:
21
+ image = file.read().convert("RGB")
22
+ return pipeline(image)[0]["generated_text"]
29
23
 
30
24
  chain = (
31
- images.limit(5)
32
- .settings(cache=True)
33
- .map(scene=lambda file: process(file), output = str)
34
- .save()
25
+ dc.read_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image", anon=True)
26
+ .limit(5)
27
+ .settings(cache=True)
28
+ .setup(pipeline=lambda: pipeline("image-to-text", model="Salesforce/blip-image-captioning-large"))
29
+ .map(scene=process)
30
+ .persist()
35
31
  )
36
32
  ```
37
33
 
38
- 1. `pip install datachain`
39
- 2. `pip install transformers`
34
+ 1. `pip install datachain[hf]`
40
35
 
41
36
  Here is how we can view the results in a plot:
42
37
 
43
38
  ```python
44
39
  import matplotlib.pyplot as plt
45
- import re
46
40
  from textwrap import wrap
47
41
 
48
- def trim_text(text):
49
- match = re.search(r'[A-Z][^.]*\.', text)
50
- return match.group(0) if match else ''
51
-
52
- images = chain.collect("file")
53
- captions = chain.collect("scene")
54
- _ , axes = plt.subplots(1, len(captions), figsize=(15, 5))
42
+ count = chain.count()
43
+ _, axes = plt.subplots(1, count, figsize=(15, 5))
55
44
 
56
- for ax, img, caption in zip(axes, images, captions):
57
- ax.imshow(img.read(),cmap='gray')
58
- ax.axis('off')
59
- wrapped_caption = "\n".join(wrap(trim_text(caption), 30))
60
- ax.set_title(wrapped_caption, fontsize=6)
45
+ for ax, (img_file, caption) in zip(axes, chain.to_iter("file", "scene")):
46
+ ax.imshow(img_file.read(), cmap="gray")
47
+ ax.axis("off")
48
+ wrapped_caption = "\n".join(wrap(caption.strip(), 40))
49
+ ax.set_title(wrapped_caption, fontsize=10, pad=20)
61
50
 
51
+ plt.tight_layout()
62
52
  plt.show()
63
53
  ```
64
54
 
@@ -177,7 +177,7 @@ plugins:
177
177
  - https://numpy.org/doc/stable/objects.inv
178
178
  - https://pandas.pydata.org/docs/objects.inv
179
179
  - https://arrow.apache.org/docs/objects.inv
180
- - https://docs.sqlalchemy.org/objects.inv
180
+ # - https://docs.sqlalchemy.org/objects.inv # SSL certificate issue
181
181
  - https://docs.pydantic.dev/latest/objects.inv
182
182
 
183
183
  watch:
@@ -49,7 +49,7 @@ dependencies = [
49
49
  "Pillow>=10.0.0,<12",
50
50
  "msgpack>=1.0.4,<2",
51
51
  "psutil",
52
- "huggingface_hub",
52
+ "huggingface_hub<0.34.0",
53
53
  "iterative-telemetry>=0.0.10",
54
54
  "platformdirs",
55
55
  "dvc-studio-client>=0.21,<1",
@@ -20,8 +20,8 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
20
20
  studio_run_description = "Run a job in Studio. \n"
21
21
  studio_run_description += (
22
22
  "When using --start-time or --cron,"
23
- " the job is scheduled as a task and will not show logs immediately."
24
- " The job will be executed according to the schedule."
23
+ " the job is scheduled to run but won't start immediately"
24
+ " (can be seen in the Tasks tab in UI)"
25
25
  )
26
26
 
27
27
  studio_run_parser = jobs_subparser.add_parser(
@@ -104,11 +104,16 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
104
104
  studio_run_parser.add_argument(
105
105
  "--start-time",
106
106
  action="store",
107
- help="Start time in ISO format or natural language for the cron task.",
107
+ help="Time to schedule a task in YYYY-MM-DDTHH:mm format or natural language.",
108
108
  )
109
109
  studio_run_parser.add_argument(
110
110
  "--cron", action="store", help="Cron expression for the cron task."
111
111
  )
112
+ studio_run_parser.add_argument(
113
+ "--no-wait",
114
+ action="store_true",
115
+ help="Do not wait for the job to finish",
116
+ )
112
117
 
113
118
  studio_ls_help = "List jobs in Studio"
114
119
  studio_ls_description = "List jobs in Studio."
@@ -12,10 +12,11 @@ class JobStatus(int, Enum):
12
12
  CANCELING = 7
13
13
  CANCELED = 8
14
14
  CANCELING_SCHEDULED = 9
15
+ TASK = 11
15
16
 
16
17
  @classmethod
17
18
  def finished(cls) -> tuple[int, ...]:
18
- return cls.COMPLETE, cls.FAILED, cls.CANCELED
19
+ return cls.COMPLETE, cls.FAILED, cls.CANCELED, cls.TASK
19
20
 
20
21
 
21
22
  class JobQueryType(int, Enum):
@@ -245,7 +245,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
245
245
  if field.nullable and not ModelStore.is_pydantic(dtype):
246
246
  dtype = Optional[dtype] # type: ignore[assignment]
247
247
  type_dict[field.name] = dtype
248
- return dict_to_data_model(column, type_dict)
248
+ return dict_to_data_model(f"ArrowDataModel_{column}", type_dict)
249
249
  if pa.types.is_map(col_type):
250
250
  return dict
251
251
  if isinstance(col_type, pa.lib.DictionaryType):
@@ -0,0 +1,244 @@
1
+ import posixpath
2
+ from typing import TYPE_CHECKING, Optional, Union
3
+
4
+ from datachain.lib.file import FileError
5
+
6
+ if TYPE_CHECKING:
7
+ from numpy import ndarray
8
+
9
+ from datachain.lib.file import Audio, AudioFile, File
10
+
11
+ try:
12
+ import torchaudio
13
+ except ImportError as exc:
14
+ raise ImportError(
15
+ "Missing dependencies for processing audio.\n"
16
+ "To install run:\n\n"
17
+ " pip install 'datachain[audio]'\n"
18
+ ) from exc
19
+
20
+
21
+ def audio_info(file: "Union[File, AudioFile]") -> "Audio":
22
+ """Extract metadata like sample rate, channels, duration, and format."""
23
+ from datachain.lib.file import Audio
24
+
25
+ file = file.as_audio_file()
26
+
27
+ try:
28
+ with file.open() as f:
29
+ info = torchaudio.info(f)
30
+
31
+ sample_rate = int(info.sample_rate)
32
+ channels = int(info.num_channels)
33
+ frames = int(info.num_frames)
34
+ duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
35
+
36
+ codec_name = getattr(info, "encoding", "")
37
+ file_ext = file.get_file_ext().lower()
38
+ format_name = _encoding_to_format(codec_name, file_ext)
39
+
40
+ bits_per_sample = getattr(info, "bits_per_sample", 0)
41
+ bit_rate = (
42
+ bits_per_sample * sample_rate * channels if bits_per_sample > 0 else -1
43
+ )
44
+
45
+ except Exception as exc:
46
+ raise FileError(
47
+ "unable to extract metadata from audio file", file.source, file.path
48
+ ) from exc
49
+
50
+ return Audio(
51
+ sample_rate=sample_rate,
52
+ channels=channels,
53
+ duration=duration,
54
+ samples=frames,
55
+ format=format_name,
56
+ codec=codec_name,
57
+ bit_rate=bit_rate,
58
+ )
59
+
60
+
61
+ def _encoding_to_format(encoding: str, file_ext: str) -> str:
62
+ """
63
+ Map torchaudio encoding to a format name.
64
+
65
+ Args:
66
+ encoding: The encoding string from torchaudio.info()
67
+ file_ext: The file extension as a fallback
68
+
69
+ Returns:
70
+ Format name as a string
71
+ """
72
+ # Direct mapping for formats that match exactly
73
+ encoding_map = {
74
+ "FLAC": "flac",
75
+ "MP3": "mp3",
76
+ "VORBIS": "ogg",
77
+ "AMR_WB": "amr",
78
+ "AMR_NB": "amr",
79
+ "OPUS": "opus",
80
+ "GSM": "gsm",
81
+ }
82
+
83
+ if encoding in encoding_map:
84
+ return encoding_map[encoding]
85
+
86
+ # For PCM variants, use file extension to determine format
87
+ if encoding.startswith("PCM_"):
88
+ # Common PCM formats by extension
89
+ pcm_formats = {
90
+ "wav": "wav",
91
+ "aiff": "aiff",
92
+ "au": "au",
93
+ "raw": "raw",
94
+ }
95
+ return pcm_formats.get(file_ext, "wav") # Default to wav for PCM
96
+
97
+ # Fallback to file extension if encoding is unknown
98
+ return file_ext if file_ext else "unknown"
99
+
100
+
101
+ def audio_to_np(
102
+ audio: "AudioFile", start: float = 0, duration: Optional[float] = None
103
+ ) -> "tuple[ndarray, int]":
104
+ """Load audio fragment as numpy array.
105
+ Multi-channel audio is transposed to (samples, channels)."""
106
+ if start < 0:
107
+ raise ValueError("start must be a non-negative float")
108
+
109
+ if duration is not None and duration <= 0:
110
+ raise ValueError("duration must be a positive float")
111
+
112
+ if hasattr(audio, "as_audio_file"):
113
+ audio = audio.as_audio_file()
114
+
115
+ try:
116
+ with audio.open() as f:
117
+ info = torchaudio.info(f)
118
+ sample_rate = info.sample_rate
119
+
120
+ frame_offset = int(start * sample_rate)
121
+ num_frames = int(duration * sample_rate) if duration is not None else -1
122
+
123
+ # Reset file pointer to the beginning
124
+ # This is important to ensure we read from the correct position later
125
+ f.seek(0)
126
+
127
+ waveform, sr = torchaudio.load(
128
+ f, frame_offset=frame_offset, num_frames=num_frames
129
+ )
130
+
131
+ audio_np = waveform.numpy()
132
+
133
+ if audio_np.shape[0] > 1:
134
+ audio_np = audio_np.T
135
+ else:
136
+ audio_np = audio_np.squeeze()
137
+
138
+ return audio_np, int(sr)
139
+ except Exception as exc:
140
+ raise FileError(
141
+ "unable to read audio fragment", audio.source, audio.path
142
+ ) from exc
143
+
144
+
145
+ def audio_to_bytes(
146
+ audio: "AudioFile",
147
+ format: str = "wav",
148
+ start: float = 0,
149
+ duration: Optional[float] = None,
150
+ ) -> bytes:
151
+ """Convert audio to bytes using soundfile.
152
+
153
+ If duration is None, converts from start to end of file.
154
+ If start is 0 and duration is None, converts entire file."""
155
+ y, sr = audio_to_np(audio, start, duration)
156
+
157
+ import io
158
+
159
+ import soundfile as sf
160
+
161
+ buffer = io.BytesIO()
162
+ sf.write(buffer, y, sr, format=format)
163
+ return buffer.getvalue()
164
+
165
+
166
+ def save_audio(
167
+ audio: "AudioFile",
168
+ output: str,
169
+ format: Optional[str] = None,
170
+ start: float = 0,
171
+ end: Optional[float] = None,
172
+ ) -> "AudioFile":
173
+ """Save audio file or extract fragment to specified format.
174
+
175
+ Args:
176
+ audio: Source AudioFile object
177
+ output: Output directory path
178
+ format: Output format ('wav', 'mp3', etc). Defaults to source format
179
+ start: Start time in seconds (>= 0). Defaults to 0
180
+ end: End time in seconds. If None, extracts to end of file
181
+
182
+ Returns:
183
+ AudioFile: New audio file with format conversion/extraction applied
184
+
185
+ Examples:
186
+ save_audio(audio, "/path", "mp3") # Entire file to MP3
187
+ save_audio(audio, "s3://bucket/path", "wav", start=2.5) # From 2.5s to end
188
+ save_audio(audio, "/path", "flac", start=1, end=3) # Extract 1-3s fragment
189
+ """
190
+ if format is None:
191
+ format = audio.get_file_ext()
192
+
193
+ # Validate start time
194
+ if start < 0:
195
+ raise ValueError(
196
+ f"Can't save audio for '{audio.path}', "
197
+ f"start time must be non-negative: {start:.3f}"
198
+ )
199
+
200
+ # Handle full file conversion when end is None and start is 0
201
+ if end is None and start == 0:
202
+ output_file = posixpath.join(output, f"{audio.get_file_stem()}.{format}")
203
+ try:
204
+ audio_bytes = audio_to_bytes(audio, format, start=0, duration=None)
205
+ except Exception as exc:
206
+ raise FileError(
207
+ "unable to convert audio file", audio.source, audio.path
208
+ ) from exc
209
+ elif end is None:
210
+ # Extract from start to end of file
211
+ output_file = posixpath.join(
212
+ output, f"{audio.get_file_stem()}_{int(start * 1000):06d}_end.{format}"
213
+ )
214
+ try:
215
+ audio_bytes = audio_to_bytes(audio, format, start=start, duration=None)
216
+ except Exception as exc:
217
+ raise FileError(
218
+ "unable to save audio fragment", audio.source, audio.path
219
+ ) from exc
220
+ else:
221
+ # Fragment extraction mode with specific end time
222
+ if end < 0 or start >= end:
223
+ raise ValueError(
224
+ f"Can't save audio for '{audio.path}', "
225
+ f"invalid time range: ({start:.3f}, {end:.3f})"
226
+ )
227
+
228
+ duration = end - start
229
+ start_ms = int(start * 1000)
230
+ end_ms = int(end * 1000)
231
+ output_file = posixpath.join(
232
+ output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
233
+ )
234
+
235
+ try:
236
+ audio_bytes = audio_to_bytes(audio, format, start, duration)
237
+ except Exception as exc:
238
+ raise FileError(
239
+ "unable to save audio fragment", audio.source, audio.path
240
+ ) from exc
241
+
242
+ from datachain.lib.file import AudioFile
243
+
244
+ return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
@@ -1,3 +1,5 @@
1
+ import inspect
2
+ import uuid
1
3
  from collections.abc import Sequence
2
4
  from datetime import datetime
3
5
  from typing import ClassVar, Optional, Union, get_args, get_origin
@@ -80,7 +82,9 @@ def dict_to_data_model(
80
82
 
81
83
  fields = {
82
84
  name: (
83
- anno,
85
+ anno
86
+ if inspect.isclass(anno) and issubclass(anno, BaseModel)
87
+ else Optional[anno],
84
88
  Field(
85
89
  validation_alias=AliasChoices(name, original_names[idx] or name),
86
90
  default=None,
@@ -101,6 +105,10 @@ def dict_to_data_model(
101
105
  field_info[str(alias)] = (_name, field)
102
106
  return field_info
103
107
 
108
+ # Generate random unique name if not provided
109
+ if not name:
110
+ name = f"DataModel_{uuid.uuid4().hex[:8]}"
111
+
104
112
  return create_model(
105
113
  name,
106
114
  __base__=_DataModelStrict,
@@ -25,19 +25,23 @@ def read_hf(
25
25
  settings: Optional[dict] = None,
26
26
  column: str = "",
27
27
  model_name: str = "",
28
+ limit: int = 0,
28
29
  **kwargs,
29
30
  ) -> "DataChain":
30
- """Generate chain from huggingface hub dataset.
31
+ """Generate chain from Hugging Face Hub dataset.
31
32
 
32
33
  Parameters:
33
34
  dataset : Path or name of the dataset to read from Hugging Face Hub,
34
35
  or an instance of `datasets.Dataset`-like object.
35
- args : Additional positional arguments to pass to datasets.load_dataset.
36
+ args : Additional positional arguments to pass to `datasets.load_dataset`.
36
37
  session : Session to use for the chain.
37
38
  settings : Settings to use for the chain.
38
39
  column : Generated object column name.
39
40
  model_name : Generated model name.
40
- kwargs : Parameters to pass to datasets.load_dataset.
41
+ limit : Limit the number of items to read from the HF dataset.
42
+ Adds `take(limit)` to the `datasets.load_dataset`.
43
+ Defaults to 0 (no limit).
44
+ kwargs : Parameters to pass to `datasets.load_dataset`.
41
45
 
42
46
  Example:
43
47
  Load from Hugging Face Hub:
@@ -53,6 +57,18 @@ def read_hf(
53
57
  import datachain as dc
54
58
  chain = dc.read_hf(ds)
55
59
  ```
60
+
61
+ Streaming with limit, for large datasets:
62
+ ```py
63
+ import datachain as dc
64
+ ds = dc.read_hf("beans", split="train", streaming=True, limit=10)
65
+ ```
66
+
67
+ or use HF split syntax (not supported if streaming is enabled):
68
+ ```py
69
+ import datachain as dc
70
+ ds = dc.read_hf("beans", split="train[:10%]")
71
+ ```
56
72
  """
57
73
  from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
58
74
 
@@ -72,4 +88,4 @@ def read_hf(
72
88
  output = {column: model}
73
89
 
74
90
  chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
75
- return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
91
+ return chain.gen(HFGenerator(dataset, model, limit, *args, **kwargs), output=output)