datachain 0.25.2__tar.gz → 0.26.1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (410)
  1. {datachain-0.25.2 → datachain-0.26.1}/.gitignore +2 -0
  2. {datachain-0.25.2 → datachain-0.26.1}/.pre-commit-config.yaml +1 -1
  3. {datachain-0.25.2 → datachain-0.26.1}/PKG-INFO +6 -2
  4. {datachain-0.25.2 → datachain-0.26.1}/docs/commands/job/run.md +13 -0
  5. {datachain-0.25.2 → datachain-0.26.1}/examples/computer_vision/iptc_exif_xmp_lib.py +19 -23
  6. datachain-0.26.1/examples/computer_vision/llava2_image_desc_lib.py +71 -0
  7. {datachain-0.25.2 → datachain-0.26.1}/examples/computer_vision/openimage-detect.py +14 -12
  8. {datachain-0.25.2 → datachain-0.26.1}/examples/computer_vision/ultralytics-bbox.py +1 -1
  9. {datachain-0.25.2 → datachain-0.26.1}/examples/computer_vision/ultralytics-pose.py +1 -1
  10. {datachain-0.25.2 → datachain-0.26.1}/examples/computer_vision/ultralytics-segment.py +1 -1
  11. {datachain-0.25.2 → datachain-0.26.1}/examples/get_started/common_sql_functions.py +14 -18
  12. {datachain-0.25.2 → datachain-0.26.1}/examples/get_started/json-csv-reader.py +9 -12
  13. {datachain-0.25.2 → datachain-0.26.1}/examples/get_started/torch-loader.py +2 -2
  14. {datachain-0.25.2 → datachain-0.26.1}/examples/get_started/udfs/parallel.py +9 -9
  15. datachain-0.26.1/examples/get_started/udfs/simple.py +21 -0
  16. {datachain-0.25.2 → datachain-0.26.1}/examples/get_started/udfs/stateful.py +15 -19
  17. datachain-0.26.1/examples/llm_and_nlp/claude-query.py +65 -0
  18. {datachain-0.25.2 → datachain-0.26.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +6 -6
  19. datachain-0.26.1/examples/multimodal/audio-to-text.py +62 -0
  20. {datachain-0.25.2 → datachain-0.26.1}/examples/multimodal/clip_inference.py +21 -7
  21. datachain-0.26.1/examples/multimodal/hf_pipeline.py +119 -0
  22. datachain-0.26.1/examples/multimodal/openai_image_desc_lib.py +58 -0
  23. {datachain-0.25.2 → datachain-0.26.1}/examples/multimodal/wds.py +18 -4
  24. {datachain-0.25.2 → datachain-0.26.1}/examples/multimodal/wds_filtered.py +19 -5
  25. {datachain-0.25.2 → datachain-0.26.1}/pyproject.toml +7 -2
  26. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/__init__.py +6 -0
  27. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/catalog/loader.py +4 -0
  28. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/func/__init__.py +2 -1
  29. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/func/conditional.py +34 -0
  30. datachain-0.26.1/src/datachain/lib/audio.py +151 -0
  31. datachain-0.26.1/src/datachain/lib/convert/sql_to_python.py +22 -0
  32. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/datachain.py +227 -67
  33. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/file.py +190 -1
  34. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/model_store.py +8 -0
  35. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/pytorch.py +4 -1
  36. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/signal_schema.py +56 -11
  37. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/udf.py +17 -5
  38. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/query/dataset.py +37 -9
  39. {datachain-0.25.2 → datachain-0.26.1}/src/datachain.egg-info/PKG-INFO +6 -2
  40. {datachain-0.25.2 → datachain-0.26.1}/src/datachain.egg-info/SOURCES.txt +5 -0
  41. {datachain-0.25.2 → datachain-0.26.1}/src/datachain.egg-info/requires.txt +6 -1
  42. {datachain-0.25.2 → datachain-0.26.1}/tests/func/functions/test_conditional.py +4 -3
  43. datachain-0.26.1/tests/func/test_audio.py +115 -0
  44. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_dataset_query.py +1 -1
  45. datachain-0.26.1/tests/unit/lib/test_audio.py +265 -0
  46. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_datachain.py +716 -1
  47. datachain-0.26.1/tests/unit/lib/test_partition_by.py +590 -0
  48. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_signal_schema.py +258 -13
  49. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_sql_to_python.py +3 -1
  50. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/sql/test_conditional.py +15 -0
  51. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_func.py +13 -0
  52. datachain-0.25.2/examples/computer_vision/llava2_image_desc_lib.py +0 -86
  53. datachain-0.25.2/examples/get_started/udfs/simple.py +0 -19
  54. datachain-0.25.2/examples/llm_and_nlp/claude-query.py +0 -78
  55. datachain-0.25.2/examples/multimodal/hf_pipeline.py +0 -139
  56. datachain-0.25.2/examples/multimodal/openai_image_desc_lib.py +0 -93
  57. datachain-0.25.2/src/datachain/lib/convert/sql_to_python.py +0 -14
  58. {datachain-0.25.2 → datachain-0.26.1}/.cruft.json +0 -0
  59. {datachain-0.25.2 → datachain-0.26.1}/.gitattributes +0 -0
  60. {datachain-0.25.2 → datachain-0.26.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  61. {datachain-0.25.2 → datachain-0.26.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  62. {datachain-0.25.2 → datachain-0.26.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  63. {datachain-0.25.2 → datachain-0.26.1}/.github/codecov.yaml +0 -0
  64. {datachain-0.25.2 → datachain-0.26.1}/.github/dependabot.yml +0 -0
  65. {datachain-0.25.2 → datachain-0.26.1}/.github/workflows/benchmarks.yml +0 -0
  66. {datachain-0.25.2 → datachain-0.26.1}/.github/workflows/release.yml +0 -0
  67. {datachain-0.25.2 → datachain-0.26.1}/.github/workflows/tests-studio.yml +0 -0
  68. {datachain-0.25.2 → datachain-0.26.1}/.github/workflows/tests.yml +0 -0
  69. {datachain-0.25.2 → datachain-0.26.1}/.github/workflows/update-template.yaml +0 -0
  70. {datachain-0.25.2 → datachain-0.26.1}/CODE_OF_CONDUCT.rst +0 -0
  71. {datachain-0.25.2 → datachain-0.26.1}/LICENSE +0 -0
  72. {datachain-0.25.2 → datachain-0.26.1}/README.rst +0 -0
  73. {datachain-0.25.2 → datachain-0.26.1}/docs/assets/captioned_cartoons.png +0 -0
  74. {datachain-0.25.2 → datachain-0.26.1}/docs/assets/datachain-white.svg +0 -0
  75. {datachain-0.25.2 → datachain-0.26.1}/docs/assets/datachain.svg +0 -0
  76. {datachain-0.25.2 → datachain-0.26.1}/docs/commands/auth/login.md +0 -0
  77. {datachain-0.25.2 → datachain-0.26.1}/docs/commands/auth/logout.md +0 -0
  78. {datachain-0.25.2 → datachain-0.26.1}/docs/commands/auth/team.md +0 -0
  79. {datachain-0.25.2 → datachain-0.26.1}/docs/commands/auth/token.md +0 -0
  80. {datachain-0.25.2 → datachain-0.26.1}/docs/commands/index.md +0 -0
  81. {datachain-0.25.2 → datachain-0.26.1}/docs/commands/job/cancel.md +0 -0
  82. {datachain-0.25.2 → datachain-0.26.1}/docs/commands/job/clusters.md +0 -0
  83. {datachain-0.25.2 → datachain-0.26.1}/docs/commands/job/logs.md +0 -0
  84. {datachain-0.25.2 → datachain-0.26.1}/docs/commands/job/ls.md +0 -0
  85. {datachain-0.25.2 → datachain-0.26.1}/docs/contributing.md +0 -0
  86. {datachain-0.25.2 → datachain-0.26.1}/docs/css/github-permalink-style.css +0 -0
  87. {datachain-0.25.2 → datachain-0.26.1}/docs/examples.md +0 -0
  88. {datachain-0.25.2 → datachain-0.26.1}/docs/guide/db_migrations.md +0 -0
  89. {datachain-0.25.2 → datachain-0.26.1}/docs/guide/delta.md +0 -0
  90. {datachain-0.25.2 → datachain-0.26.1}/docs/guide/env.md +0 -0
  91. {datachain-0.25.2 → datachain-0.26.1}/docs/guide/index.md +0 -0
  92. {datachain-0.25.2 → datachain-0.26.1}/docs/guide/namespaces.md +0 -0
  93. {datachain-0.25.2 → datachain-0.26.1}/docs/guide/processing.md +0 -0
  94. {datachain-0.25.2 → datachain-0.26.1}/docs/guide/remotes.md +0 -0
  95. {datachain-0.25.2 → datachain-0.26.1}/docs/guide/retry.md +0 -0
  96. {datachain-0.25.2 → datachain-0.26.1}/docs/index.md +0 -0
  97. {datachain-0.25.2 → datachain-0.26.1}/docs/overrides/main.html +0 -0
  98. {datachain-0.25.2 → datachain-0.26.1}/docs/quick-start.md +0 -0
  99. {datachain-0.25.2 → datachain-0.26.1}/docs/references/data-types/arrowrow.md +0 -0
  100. {datachain-0.25.2 → datachain-0.26.1}/docs/references/data-types/bbox.md +0 -0
  101. {datachain-0.25.2 → datachain-0.26.1}/docs/references/data-types/file.md +0 -0
  102. {datachain-0.25.2 → datachain-0.26.1}/docs/references/data-types/imagefile.md +0 -0
  103. {datachain-0.25.2 → datachain-0.26.1}/docs/references/data-types/index.md +0 -0
  104. {datachain-0.25.2 → datachain-0.26.1}/docs/references/data-types/pose.md +0 -0
  105. {datachain-0.25.2 → datachain-0.26.1}/docs/references/data-types/segment.md +0 -0
  106. {datachain-0.25.2 → datachain-0.26.1}/docs/references/data-types/tarvfile.md +0 -0
  107. {datachain-0.25.2 → datachain-0.26.1}/docs/references/data-types/textfile.md +0 -0
  108. {datachain-0.25.2 → datachain-0.26.1}/docs/references/data-types/videofile.md +0 -0
  109. {datachain-0.25.2 → datachain-0.26.1}/docs/references/datachain.md +0 -0
  110. {datachain-0.25.2 → datachain-0.26.1}/docs/references/func.md +0 -0
  111. {datachain-0.25.2 → datachain-0.26.1}/docs/references/index.md +0 -0
  112. {datachain-0.25.2 → datachain-0.26.1}/docs/references/toolkit.md +0 -0
  113. {datachain-0.25.2 → datachain-0.26.1}/docs/references/torch.md +0 -0
  114. {datachain-0.25.2 → datachain-0.26.1}/docs/references/udf.md +0 -0
  115. {datachain-0.25.2 → datachain-0.26.1}/docs/tutorials.md +0 -0
  116. {datachain-0.25.2 → datachain-0.26.1}/examples/incremental_processing/delta.py +0 -0
  117. {datachain-0.25.2 → datachain-0.26.1}/examples/incremental_processing/retry.py +0 -0
  118. {datachain-0.25.2 → datachain-0.26.1}/examples/incremental_processing/utils.py +0 -0
  119. {datachain-0.25.2 → datachain-0.26.1}/mkdocs.yml +0 -0
  120. {datachain-0.25.2 → datachain-0.26.1}/noxfile.py +0 -0
  121. {datachain-0.25.2 → datachain-0.26.1}/setup.cfg +0 -0
  122. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/__main__.py +0 -0
  123. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/asyn.py +0 -0
  124. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cache.py +0 -0
  125. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/catalog/__init__.py +0 -0
  126. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/catalog/catalog.py +0 -0
  127. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/catalog/datasource.py +0 -0
  128. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/__init__.py +0 -0
  129. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/commands/__init__.py +0 -0
  130. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/commands/datasets.py +0 -0
  131. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/commands/du.py +0 -0
  132. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/commands/index.py +0 -0
  133. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/commands/ls.py +0 -0
  134. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/commands/misc.py +0 -0
  135. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/commands/query.py +0 -0
  136. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/commands/show.py +0 -0
  137. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/parser/__init__.py +0 -0
  138. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/parser/job.py +0 -0
  139. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/parser/studio.py +0 -0
  140. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/parser/utils.py +0 -0
  141. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/cli/utils.py +0 -0
  142. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/client/__init__.py +0 -0
  143. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/client/azure.py +0 -0
  144. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/client/fileslice.py +0 -0
  145. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/client/fsspec.py +0 -0
  146. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/client/gcs.py +0 -0
  147. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/client/hf.py +0 -0
  148. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/client/local.py +0 -0
  149. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/client/s3.py +0 -0
  150. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/config.py +0 -0
  151. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/data_storage/__init__.py +0 -0
  152. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/data_storage/db_engine.py +0 -0
  153. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/data_storage/job.py +0 -0
  154. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/data_storage/metastore.py +0 -0
  155. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/data_storage/schema.py +0 -0
  156. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/data_storage/serializer.py +0 -0
  157. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/data_storage/sqlite.py +0 -0
  158. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/data_storage/warehouse.py +0 -0
  159. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/dataset.py +0 -0
  160. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/delta.py +0 -0
  161. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/diff/__init__.py +0 -0
  162. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/error.py +0 -0
  163. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/fs/__init__.py +0 -0
  164. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/fs/reference.py +0 -0
  165. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/fs/utils.py +0 -0
  166. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/func/aggregate.py +0 -0
  167. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/func/array.py +0 -0
  168. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/func/base.py +0 -0
  169. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/func/func.py +0 -0
  170. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/func/numeric.py +0 -0
  171. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/func/path.py +0 -0
  172. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/func/random.py +0 -0
  173. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/func/string.py +0 -0
  174. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/func/window.py +0 -0
  175. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/job.py +0 -0
  176. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/__init__.py +0 -0
  177. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/arrow.py +0 -0
  178. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/clip.py +0 -0
  179. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/convert/__init__.py +0 -0
  180. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/convert/flatten.py +0 -0
  181. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  182. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/convert/unflatten.py +0 -0
  183. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  184. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/data_model.py +0 -0
  185. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dataset_info.py +0 -0
  186. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/__init__.py +0 -0
  187. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/csv.py +0 -0
  188. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/database.py +0 -0
  189. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/datasets.py +0 -0
  190. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/hf.py +0 -0
  191. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/json.py +0 -0
  192. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/listings.py +0 -0
  193. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/pandas.py +0 -0
  194. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/parquet.py +0 -0
  195. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/records.py +0 -0
  196. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/storage.py +0 -0
  197. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/utils.py +0 -0
  198. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/dc/values.py +0 -0
  199. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/hf.py +0 -0
  200. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/image.py +0 -0
  201. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/listing.py +0 -0
  202. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/listing_info.py +0 -0
  203. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/meta_formats.py +0 -0
  204. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/namespaces.py +0 -0
  205. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/projects.py +0 -0
  206. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/settings.py +0 -0
  207. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/tar.py +0 -0
  208. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/text.py +0 -0
  209. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/udf_signature.py +0 -0
  210. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/utils.py +0 -0
  211. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/video.py +0 -0
  212. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/webdataset.py +0 -0
  213. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/lib/webdataset_laion.py +0 -0
  214. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/listing.py +0 -0
  215. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/model/__init__.py +0 -0
  216. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/model/bbox.py +0 -0
  217. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/model/pose.py +0 -0
  218. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/model/segment.py +0 -0
  219. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/model/ultralytics/__init__.py +0 -0
  220. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/model/ultralytics/bbox.py +0 -0
  221. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/model/ultralytics/pose.py +0 -0
  222. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/model/ultralytics/segment.py +0 -0
  223. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/model/utils.py +0 -0
  224. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/namespace.py +0 -0
  225. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/node.py +0 -0
  226. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/nodes_fetcher.py +0 -0
  227. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/nodes_thread_pool.py +0 -0
  228. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/progress.py +0 -0
  229. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/project.py +0 -0
  230. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/py.typed +0 -0
  231. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/query/__init__.py +0 -0
  232. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/query/batch.py +0 -0
  233. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/query/dispatch.py +0 -0
  234. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/query/metrics.py +0 -0
  235. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/query/params.py +0 -0
  236. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/query/queue.py +0 -0
  237. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/query/schema.py +0 -0
  238. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/query/session.py +0 -0
  239. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/query/udf.py +0 -0
  240. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/query/utils.py +0 -0
  241. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/remote/__init__.py +0 -0
  242. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/remote/studio.py +0 -0
  243. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/script_meta.py +0 -0
  244. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/semver.py +0 -0
  245. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/__init__.py +0 -0
  246. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/default/__init__.py +0 -0
  247. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/default/base.py +0 -0
  248. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/functions/__init__.py +0 -0
  249. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/functions/aggregate.py +0 -0
  250. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/functions/array.py +0 -0
  251. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/functions/conditional.py +0 -0
  252. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/functions/numeric.py +0 -0
  253. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/functions/path.py +0 -0
  254. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/functions/random.py +0 -0
  255. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/functions/string.py +0 -0
  256. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/selectable.py +0 -0
  257. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  258. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/sqlite/base.py +0 -0
  259. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/sqlite/types.py +0 -0
  260. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/sqlite/vector.py +0 -0
  261. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/types.py +0 -0
  262. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/sql/utils.py +0 -0
  263. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/studio.py +0 -0
  264. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/telemetry.py +0 -0
  265. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/toolkit/__init__.py +0 -0
  266. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/toolkit/split.py +0 -0
  267. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/torch/__init__.py +0 -0
  268. {datachain-0.25.2 → datachain-0.26.1}/src/datachain/utils.py +0 -0
  269. {datachain-0.25.2 → datachain-0.26.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  270. {datachain-0.25.2 → datachain-0.26.1}/src/datachain.egg-info/entry_points.txt +0 -0
  271. {datachain-0.25.2 → datachain-0.26.1}/src/datachain.egg-info/top_level.txt +0 -0
  272. {datachain-0.25.2 → datachain-0.26.1}/tests/__init__.py +0 -0
  273. {datachain-0.25.2 → datachain-0.26.1}/tests/benchmarks/__init__.py +0 -0
  274. {datachain-0.25.2 → datachain-0.26.1}/tests/benchmarks/conftest.py +0 -0
  275. {datachain-0.25.2 → datachain-0.26.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  276. {datachain-0.25.2 → datachain-0.26.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  277. {datachain-0.25.2 → datachain-0.26.1}/tests/benchmarks/datasets/.gitignore +0 -0
  278. {datachain-0.25.2 → datachain-0.26.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  279. {datachain-0.25.2 → datachain-0.26.1}/tests/benchmarks/test_datachain.py +0 -0
  280. {datachain-0.25.2 → datachain-0.26.1}/tests/benchmarks/test_ls.py +0 -0
  281. {datachain-0.25.2 → datachain-0.26.1}/tests/benchmarks/test_version.py +0 -0
  282. {datachain-0.25.2 → datachain-0.26.1}/tests/conftest.py +0 -0
  283. {datachain-0.25.2 → datachain-0.26.1}/tests/data.py +0 -0
  284. {datachain-0.25.2 → datachain-0.26.1}/tests/examples/__init__.py +0 -0
  285. {datachain-0.25.2 → datachain-0.26.1}/tests/examples/test_examples.py +0 -0
  286. {datachain-0.25.2 → datachain-0.26.1}/tests/examples/test_wds_e2e.py +0 -0
  287. {datachain-0.25.2 → datachain-0.26.1}/tests/examples/wds_data.py +0 -0
  288. {datachain-0.25.2 → datachain-0.26.1}/tests/func/__init__.py +0 -0
  289. {datachain-0.25.2 → datachain-0.26.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  290. {datachain-0.25.2 → datachain-0.26.1}/tests/func/data/lena.jpg +0 -0
  291. {datachain-0.25.2 → datachain-0.26.1}/tests/func/fake-service-account-credentials.json +0 -0
  292. {datachain-0.25.2 → datachain-0.26.1}/tests/func/functions/__init__.py +0 -0
  293. {datachain-0.25.2 → datachain-0.26.1}/tests/func/functions/test_aggregate.py +0 -0
  294. {datachain-0.25.2 → datachain-0.26.1}/tests/func/functions/test_array.py +0 -0
  295. {datachain-0.25.2 → datachain-0.26.1}/tests/func/functions/test_numeric.py +0 -0
  296. {datachain-0.25.2 → datachain-0.26.1}/tests/func/functions/test_path.py +0 -0
  297. {datachain-0.25.2 → datachain-0.26.1}/tests/func/functions/test_random.py +0 -0
  298. {datachain-0.25.2 → datachain-0.26.1}/tests/func/functions/test_string.py +0 -0
  299. {datachain-0.25.2 → datachain-0.26.1}/tests/func/model/__init__.py +0 -0
  300. {datachain-0.25.2 → datachain-0.26.1}/tests/func/model/data/running-mask0.png +0 -0
  301. {datachain-0.25.2 → datachain-0.26.1}/tests/func/model/data/running-mask1.png +0 -0
  302. {datachain-0.25.2 → datachain-0.26.1}/tests/func/model/data/running.jpg +0 -0
  303. {datachain-0.25.2 → datachain-0.26.1}/tests/func/model/data/ships.jpg +0 -0
  304. {datachain-0.25.2 → datachain-0.26.1}/tests/func/model/test_yolo.py +0 -0
  305. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_batching.py +0 -0
  306. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_catalog.py +0 -0
  307. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_client.py +0 -0
  308. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_cloud_transfer.py +0 -0
  309. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_data_storage.py +0 -0
  310. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_datachain.py +0 -0
  311. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_datachain_merge.py +0 -0
  312. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_datasets.py +0 -0
  313. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_delta.py +0 -0
  314. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_feature_pickling.py +0 -0
  315. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_file.py +0 -0
  316. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_hf.py +0 -0
  317. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_hidden_field.py +0 -0
  318. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_image.py +0 -0
  319. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_listing.py +0 -0
  320. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_ls.py +0 -0
  321. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_meta_formats.py +0 -0
  322. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_metastore.py +0 -0
  323. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_metrics.py +0 -0
  324. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_pull.py +0 -0
  325. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_pytorch.py +0 -0
  326. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_query.py +0 -0
  327. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_read_database.py +0 -0
  328. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_read_dataset_remote.py +0 -0
  329. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  330. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_retry.py +0 -0
  331. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_session.py +0 -0
  332. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_toolkit.py +0 -0
  333. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_video.py +0 -0
  334. {datachain-0.25.2 → datachain-0.26.1}/tests/func/test_warehouse.py +0 -0
  335. {datachain-0.25.2 → datachain-0.26.1}/tests/scripts/feature_class.py +0 -0
  336. {datachain-0.25.2 → datachain-0.26.1}/tests/scripts/feature_class_exception.py +0 -0
  337. {datachain-0.25.2 → datachain-0.26.1}/tests/scripts/feature_class_parallel.py +0 -0
  338. {datachain-0.25.2 → datachain-0.26.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  339. {datachain-0.25.2 → datachain-0.26.1}/tests/scripts/name_len_slow.py +0 -0
  340. {datachain-0.25.2 → datachain-0.26.1}/tests/test_atomicity.py +0 -0
  341. {datachain-0.25.2 → datachain-0.26.1}/tests/test_cli_e2e.py +0 -0
  342. {datachain-0.25.2 → datachain-0.26.1}/tests/test_cli_studio.py +0 -0
  343. {datachain-0.25.2 → datachain-0.26.1}/tests/test_import_time.py +0 -0
  344. {datachain-0.25.2 → datachain-0.26.1}/tests/test_query_e2e.py +0 -0
  345. {datachain-0.25.2 → datachain-0.26.1}/tests/test_telemetry.py +0 -0
  346. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/__init__.py +0 -0
  347. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/__init__.py +0 -0
  348. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/conftest.py +0 -0
  349. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_arrow.py +0 -0
  350. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_clip.py +0 -0
  351. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  352. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  353. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_diff.py +0 -0
  354. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_feature.py +0 -0
  355. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_feature_utils.py +0 -0
  356. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_file.py +0 -0
  357. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_hf.py +0 -0
  358. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_image.py +0 -0
  359. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_listing_info.py +0 -0
  360. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_namespace.py +0 -0
  361. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_project.py +0 -0
  362. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_python_to_sql.py +0 -0
  363. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_schema.py +0 -0
  364. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_text.py +0 -0
  365. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_udf.py +0 -0
  366. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_udf_signature.py +0 -0
  367. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_utils.py +0 -0
  368. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/lib/test_webdataset.py +0 -0
  369. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/model/__init__.py +0 -0
  370. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/model/test_bbox.py +0 -0
  371. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/model/test_pose.py +0 -0
  372. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/model/test_segment.py +0 -0
  373. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/model/test_utils.py +0 -0
  374. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/sql/__init__.py +0 -0
  375. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  376. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/sql/sqlite/test_types.py +0 -0
  377. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  378. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/sql/test_array.py +0 -0
  379. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/sql/test_path.py +0 -0
  380. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/sql/test_random.py +0 -0
  381. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/sql/test_selectable.py +0 -0
  382. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/sql/test_string.py +0 -0
  383. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_asyn.py +0 -0
  384. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_cache.py +0 -0
  385. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_catalog.py +0 -0
  386. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_catalog_loader.py +0 -0
  387. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_cli_parsing.py +0 -0
  388. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_client.py +0 -0
  389. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_client_gcs.py +0 -0
  390. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_client_s3.py +0 -0
  391. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_config.py +0 -0
  392. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_data_storage.py +0 -0
  393. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_database_engine.py +0 -0
  394. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_dataset.py +0 -0
  395. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_dispatch.py +0 -0
  396. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_fileslice.py +0 -0
  397. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_listing.py +0 -0
  398. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_metastore.py +0 -0
  399. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_module_exports.py +0 -0
  400. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_pytorch.py +0 -0
  401. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_query.py +0 -0
  402. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_query_metrics.py +0 -0
  403. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_query_params.py +0 -0
  404. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_script_meta.py +0 -0
  405. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_semver.py +0 -0
  406. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_serializer.py +0 -0
  407. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_session.py +0 -0
  408. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_utils.py +0 -0
  409. {datachain-0.25.2 → datachain-0.26.1}/tests/unit/test_warehouse.py +0 -0
  410. {datachain-0.25.2 → datachain-0.26.1}/tests/utils.py +0 -0
@@ -143,3 +143,5 @@ cython_debug/
143
143
 
144
144
  # pt files produced by ultralytics examples
145
145
  *.pt
146
+
147
+ .DS_Store/
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.12.2'
27
+ rev: 'v0.12.3'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.25.2
3
+ Version: 0.26.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -63,6 +63,9 @@ Provides-Extra: torch
63
63
  Requires-Dist: torch>=2.1.0; extra == "torch"
64
64
  Requires-Dist: torchvision; extra == "torch"
65
65
  Requires-Dist: transformers>=4.36.0; extra == "torch"
66
+ Provides-Extra: audio
67
+ Requires-Dist: torchaudio; extra == "audio"
68
+ Requires-Dist: soundfile; extra == "audio"
66
69
  Provides-Extra: remote
67
70
  Requires-Dist: lz4; extra == "remote"
68
71
  Requires-Dist: requests>=2.22.0; extra == "remote"
@@ -78,7 +81,7 @@ Requires-Dist: ffmpeg-python; extra == "video"
78
81
  Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
79
82
  Requires-Dist: opencv-python; extra == "video"
80
83
  Provides-Extra: tests
81
- Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
84
+ Requires-Dist: datachain[audio,hf,remote,torch,vector,video]; extra == "tests"
82
85
  Requires-Dist: pytest<9,>=8; extra == "tests"
83
86
  Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
84
87
  Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
@@ -108,6 +111,7 @@ Requires-Dist: accelerate; extra == "examples"
108
111
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
109
112
  Requires-Dist: ultralytics; extra == "examples"
110
113
  Requires-Dist: open_clip_torch; extra == "examples"
114
+ Requires-Dist: openai; extra == "examples"
111
115
  Dynamic: license-file
112
116
 
113
117
  ================
@@ -31,6 +31,7 @@ This command runs a job in Studio using the specified query file. You can config
31
31
  * `--req-file REQ_FILE` - Python requirements file
32
32
  * `--req REQ` - Python package requirements
33
33
  * `--priority PRIORITY` - Priority for the job in range 0-5. Lower value is higher priority (default: 5)
34
+ * `--repository URL` - Repository URL to clone before running the job.
34
35
  * `-h`, `--help` - Show the help message and exit.
35
36
  * `-v`, `--verbose` - Be verbose.
36
37
  * `-q`, `--quiet` - Be quiet.
@@ -67,6 +68,18 @@ datachain job run --env API_KEY=123 --req pandas numpy query.py
67
68
  datachain job run --repository https://github.com/iterative/datachain query.py
68
69
  ```
69
70
 
71
+ To specify a branch / revision:
72
+
73
+ ```bash
74
+ datachain job run --repository https://github.com/iterative/datachain@main query.py
75
+ ```
76
+
77
+ Git URLs are also supported:
78
+
79
+ ```bash
80
+ datachain job run --repository git@github.com:iterative/datachain.git@main query.py
81
+ ```
82
+
70
83
  7. Run a job with higher priority
71
84
  ```bash
72
85
  datachain job run --priority 2 query.py
@@ -5,8 +5,6 @@ To install the required dependencies:
5
5
 
6
6
  """
7
7
 
8
- import json
9
-
10
8
  from PIL import (
11
9
  ExifTags,
12
10
  IptcImagePlugin,
@@ -14,8 +12,7 @@ from PIL import (
14
12
  )
15
13
 
16
14
  import datachain as dc
17
-
18
- source = "gs://datachain-demo/open-images-v6/"
15
+ from datachain import C, DataModel, File
19
16
 
20
17
 
21
18
  def cast(v): # to JSON serializable types
@@ -34,16 +31,21 @@ def cast(v): # to JSON serializable types
34
31
  return v
35
32
 
36
33
 
37
- def image_description(file):
38
- (xmp, exif, iptc) = ({}, {}, {})
34
+ class ImageDescription(DataModel):
35
+ xmp: dict
36
+ exif: dict
37
+ iptc: dict
38
+
39
+
40
+ def image_description(file: File) -> tuple[ImageDescription, str]:
41
+ xmp, exif, iptc = {}, {}, {}
39
42
  try:
40
43
  img = file.read()
41
44
  xmp = img.getxmp()
42
45
  img_exif = img.getexif()
43
46
  img_iptc = IptcImagePlugin.getiptcinfo(img)
44
47
  except Exception as err: # noqa: BLE001
45
- error = str(err)
46
- return ({}, {}, {}, error)
48
+ return ImageDescription(xmp={}, exif={}, iptc={}), str(err)
47
49
 
48
50
  if img_iptc:
49
51
  for k, v in img_iptc.items():
@@ -57,26 +59,20 @@ def image_description(file):
57
59
  if k in ExifTags.GPSTAGS:
58
60
  exif[ExifTags.GPSTAGS[k]] = v
59
61
 
60
- return (
61
- json.dumps(xmp),
62
- json.dumps(exif),
63
- json.dumps(iptc),
64
- "",
65
- )
62
+ return (ImageDescription(xmp=xmp, exif=exif, iptc=iptc), "")
66
63
 
67
64
 
68
65
  if __name__ == "__main__":
69
66
  (
70
- dc.read_storage(source, type="image")
71
- .settings(parallel=-1)
72
- .filter(dc.C("file.path").glob("*.jpg"))
67
+ dc.read_storage("gs://datachain-demo/open-images-v6/", type="image", anon=True)
68
+ .filter(C("file.path").glob("*.jpg"))
73
69
  .limit(5000)
74
- .map(
75
- image_description,
76
- params=["file"],
77
- output={"xmp": dict, "exif": dict, "iptc": dict, "error": str},
70
+ .settings(parallel=True)
71
+ .map(image_description, output=("description", "error"))
72
+ .filter(
73
+ (C("description.xmp") != "{}")
74
+ | (C("description.exif") != "{}")
75
+ | (C("description.iptc") != "{}")
78
76
  )
79
- .select("file.path", "xmp", "exif", "iptc", "error")
80
- .filter((dc.C("xmp") != "{}") | (dc.C("exif") != "{}") | (dc.C("iptc") != "{}"))
81
77
  .show()
82
78
  )
@@ -0,0 +1,71 @@
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
8
+ import torch
9
+ from transformers import (
10
+ AutoProcessor,
11
+ LlavaForConditionalGeneration,
12
+ )
13
+
14
+ import datachain as dc
15
+ from datachain import C, File
16
+
17
+ model = "llava-hf/llava-1.5-7b-hf"
18
+
19
+ # HuggingFace supports the following base models:
20
+ #
21
+ # "llava-hf/llava-1.5-7b-hf"
22
+ # "llava-hf/llava-1.5-13b-hf"
23
+ # "llava-hf/bakLlava-v1-hf"
24
+ #
25
+ # https://huggingface.co/llava-hf
26
+
27
+
28
+ # This code could probably be written with an HF pipeline,
29
+ # but we keep it a bit more low-level for the sake of example.
30
+ class LLaVaProcessor:
31
+ def __init__(self, model_name, max_tokens=300):
32
+ if torch.cuda.is_available():
33
+ self.device = "cuda"
34
+ self.dtype = torch.float16
35
+ else:
36
+ self.device = "cpu"
37
+ self.dtype = torch.float32
38
+
39
+ self.model_name = model_name
40
+ self.max_tokens = max_tokens
41
+ self.prompt = "USER: <image>\nDescribe this picture\nASSISTANT:"
42
+
43
+ self.processor = AutoProcessor.from_pretrained(self.model_name)
44
+ self.model = LlavaForConditionalGeneration.from_pretrained(
45
+ self.model_name, torch_dtype=self.dtype, low_cpu_mem_usage=True
46
+ ).to(self.device)
47
+
48
+
49
+ def process(processor: LLaVaProcessor, file: File) -> tuple[str, str]:
50
+ inputs = processor.processor(
51
+ text=processor.prompt, images=file.read(), return_tensors="pt"
52
+ ).to(processor.device, processor.dtype)
53
+
54
+ generated_ids = processor.model.generate(
55
+ **inputs, max_new_tokens=processor.max_tokens
56
+ )
57
+ generated_text = processor.processor.batch_decode(
58
+ generated_ids, skip_special_tokens=True
59
+ )
60
+ desc = generated_text[0]
61
+ return desc.split("ASSISTANT:")[-1].strip(), ""
62
+
63
+
64
+ if __name__ == "__main__":
65
+ (
66
+ dc.read_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
67
+ .filter(C("file.path").glob("*/cat*.jpg"))
68
+ .setup(processor=lambda: LLaVaProcessor(model_name=model))
69
+ .map(process, output=("description", "error"))
70
+ .show(5)
71
+ )
@@ -1,19 +1,23 @@
1
1
  import json
2
+ from collections.abc import Iterator
2
3
 
3
4
  from PIL import Image
4
5
 
5
6
  import datachain as dc
6
- from datachain import File, model
7
+ from datachain import C, File, model
7
8
  from datachain.func import path
8
9
 
9
10
 
10
- def openimage_detect(args):
11
- if len(args) != 2:
11
+ # Example showing extraction of bounding boxes from the Open Images dataset
12
+ # that comes as pairs of JPG and JSON files.
13
+ def openimage_detect(file: list[File]) -> Iterator[tuple[File, model.BBox]]:
14
+ if len(file) != 2:
12
15
  raise ValueError("Group jpg-json mismatch")
13
16
 
14
- stream_jpg = args[0]
15
- stream_json = args[1]
16
- if args[0].get_file_ext() != "jpg":
17
+ stream_jpg = file[0]
18
+ stream_json = file[1]
19
+ source = stream_jpg.source
20
+ if stream_jpg.get_file_ext() != "jpg":
17
21
  stream_jpg, stream_json = stream_json, stream_jpg
18
22
 
19
23
  with stream_jpg.open() as fd:
@@ -38,16 +42,14 @@ def openimage_detect(args):
38
42
  yield fstream, bbox
39
43
 
40
44
 
41
- source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
42
-
43
45
  (
44
- dc.read_storage(source)
45
- .filter(dc.C("file.path").glob("*.jpg") | dc.C("file.path").glob("*.json"))
46
+ dc.read_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/", anon=True)
47
+ .filter(C("file.path").glob("*.jpg") | C("file.path").glob("*.json"))
48
+ .settings(cache=True, parallel=True)
46
49
  .agg(
47
50
  openimage_detect,
48
51
  partition_by=path.file_stem("file.path"),
49
- params=["file"],
50
- output={"file": File, "bbox": model.BBox},
52
+ output=("file", "bbox"),
51
53
  )
52
54
  .show()
53
55
  )
@@ -10,7 +10,7 @@ def process_bboxes(yolo: YOLO, file: dc.File) -> YoloBBoxes:
10
10
 
11
11
 
12
12
  (
13
- dc.read_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
13
+ dc.read_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/", anon=True)
14
14
  .filter(dc.C("file.path").glob("*.jpg"))
15
15
  .limit(20)
16
16
  .setup(yolo=lambda: YOLO("yolo11n.pt"))
@@ -10,7 +10,7 @@ def process_poses(yolo: YOLO, file: dc.File) -> YoloPoses:
10
10
 
11
11
 
12
12
  (
13
- dc.read_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
13
+ dc.read_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/", anon=True)
14
14
  .filter(dc.C("file.path").glob("*.jpg"))
15
15
  .limit(20)
16
16
  .setup(yolo=lambda: YOLO("yolo11n-pose.pt"))
@@ -10,7 +10,7 @@ def process_segments(yolo: YOLO, file: dc.File) -> YoloSegments:
10
10
 
11
11
 
12
12
  (
13
- dc.read_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
13
+ dc.read_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/", anon=True)
14
14
  .filter(dc.C("file.path").glob("*.jpg"))
15
15
  .limit(20)
16
16
  .setup(yolo=lambda: YOLO("yolo11n-seg.pt"))
@@ -1,23 +1,19 @@
1
+ """
2
+ Example demonstrating functions (manipulating strings, paths, arrays)
3
+ that are translated directly to SQL (vectorized). They don't require heavy compute,
4
+ fetching objects into a cluster, etc.
5
+ """
6
+
1
7
  import datachain as dc
8
+ from datachain import C
2
9
  from datachain.func import array, greatest, least, path, string
3
10
 
4
-
5
- def num_chars_udf(file):
6
- parts = file.name.split(".")
7
- if len(parts) > 1:
8
- return (list(parts[1]),)
9
- return ([],)
10
-
11
-
12
11
  chain = dc.read_storage("gs://datachain-demo/dogs-and-cats/", anon=True)
13
- chain.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
14
- "file.path", "num_chars"
15
- ).show(5)
16
12
 
17
13
  (
18
14
  chain.mutate(
19
- length=string.length(path.name(dc.C("file.path"))),
20
- parts=string.split(path.name(dc.C("file.path")), "."),
15
+ length=string.length(path.name(C("file.path"))),
16
+ parts=string.split(path.name(C("file.path")), "."),
21
17
  )
22
18
  .select("file.path", "length", "parts")
23
19
  .show(5)
@@ -25,14 +21,14 @@ chain.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).selec
25
21
 
26
22
  (
27
23
  chain.mutate(
28
- stem=path.file_stem(dc.C("file.path")),
29
- ext=path.file_ext(dc.C("file.path")),
24
+ stem=path.file_stem(C("file.path")),
25
+ ext=path.file_ext(C("file.path")),
30
26
  )
31
27
  .select("file.path", "stem", "ext")
32
28
  .show(5)
33
29
  )
34
30
 
35
- parts = string.split(path.name(dc.C("file.path")), ".")
31
+ parts = string.split(path.name(C("file.path")), ".")
36
32
  chain = chain.mutate(
37
33
  isdog=array.contains(parts, "dog"),
38
34
  iscat=array.contains(parts, "cat"),
@@ -46,8 +42,8 @@ chain = chain.mutate(
46
42
 
47
43
  (
48
44
  chain.mutate(
49
- greatest=greatest(chain.column("a"), dc.C("b")),
50
- least=least(chain.column("a"), dc.C("b")),
45
+ greatest=greatest(chain.column("a"), C("b")),
46
+ least=least(chain.column("a"), C("b")),
51
47
  )
52
48
  .select("a", "b", "greatest", "least")
53
49
  .show(10)
@@ -1,33 +1,25 @@
1
1
  from typing import Optional
2
2
 
3
- from pydantic import BaseModel
4
-
5
3
  import datachain as dc
6
- from datachain.lib.data_model import ModelStore
4
+ from datachain import DataModel
7
5
  from datachain.lib.meta_formats import gen_datamodel_code
8
6
 
9
7
 
10
8
  # Sample model for static JSON model
11
- class LicenseModel(BaseModel):
9
+ class LicenseModel(DataModel):
12
10
  url: str
13
11
  id: int
14
12
  name: str
15
13
 
16
14
 
17
- LicenseFeature = ModelStore.register(LicenseModel)
18
-
19
-
20
15
  # Sample model for static CSV model
21
- class ChatDialog(BaseModel):
16
+ class ChatDialog(DataModel):
22
17
  id: Optional[int] = None
23
18
  count: Optional[int] = None
24
19
  sender: Optional[str] = None
25
20
  text: Optional[str] = None
26
21
 
27
22
 
28
- ChatFeature = ModelStore.register(ChatDialog)
29
-
30
-
31
23
  def main():
32
24
  # Dynamic JSONl schema from 2 objects
33
25
  uri = "gs://datachain-demo/jsonl/object.jsonl"
@@ -53,7 +45,7 @@ def main():
53
45
 
54
46
  # Static JSON schema test parsing 3/7 objects
55
47
  static_json_ds = dc.read_json(
56
- uri, jmespath="licenses", spec=LicenseFeature, nrows=3, anon="True"
48
+ uri, jmespath="licenses", spec=LicenseModel, nrows=3, anon="True"
57
49
  )
58
50
  static_json_ds.show()
59
51
 
@@ -73,6 +65,11 @@ def main():
73
65
  dynamic_csv_ds.print_schema()
74
66
  dynamic_csv_ds.show()
75
67
 
68
+ print(
69
+ "Note: script might hang at the end due to https://github.com/apache/arrow/issues/43497"
70
+ )
71
+ print("Just press Ctrl+C to exit.")
72
+
76
73
 
77
74
  if __name__ == "__main__":
78
75
  main()
@@ -55,8 +55,8 @@ class CNN(nn.Module):
55
55
 
56
56
  if __name__ == "__main__":
57
57
  ds = (
58
- dc.read_storage(STORAGE, type="image")
59
- .settings(prefetch=25)
58
+ dc.read_storage(STORAGE, type="image", anon=True)
59
+ .settings(prefetch=25, cache=True)
60
60
  .filter(dc.C("file.path").glob("*.jpg"))
61
61
  .map(
62
62
  label=lambda path: label_to_int(basename(path)[:3], CLASSES),
@@ -21,19 +21,19 @@ def fibonacci(n):
21
21
 
22
22
 
23
23
  # Define the UDF:
24
- def path_len_benchmark(path):
24
+ def path_len_benchmark(path: str) -> int:
25
25
  # Run the fibonacci benchmark as an example of a single-threaded CPU-bound UDF
26
26
  fibonacci(35)
27
27
  if path.endswith(".json"):
28
- return (-1,)
28
+ return -1
29
29
  return len(path)
30
30
 
31
31
 
32
32
  # Run in chain
33
- dc.read_storage(
34
- "gs://datachain-demo/dogs-and-cats/",
35
- ).settings(parallel=-1).map(
36
- path_len_benchmark,
37
- params=["file.path"],
38
- output={"path_len": int},
39
- ).show()
33
+ (
34
+ dc.read_storage("gs://datachain-demo/dogs-and-cats/", anon=True)
35
+ # Try disabling the parallel setting to see the difference in performance
36
+ .settings(parallel=-1)
37
+ .map(path_len=path_len_benchmark, params=["file.path"])
38
+ .show()
39
+ )
@@ -0,0 +1,21 @@
1
+ import datachain as dc
2
+
3
+
4
+ # Define the UDF:
5
+ # DataChain figures out input and output types automatically
6
+ # based on the function signature and the data provided.
7
+ def path_len(path: str) -> int:
8
+ if path.endswith(".json"):
9
+ return -1
10
+ return len(path)
11
+
12
+
13
+ if __name__ == "__main__":
14
+ # Process all the files in the storage bucket, using the UDF
15
+ # `read_storage` reads files from the specified path
16
+ # and returns a DataChain object that has `File` objects
17
+ (
18
+ dc.read_storage("gs://datachain-demo/dogs-and-cats/", anon=True)
19
+ .map(path_len=path_len, params=["file.path"])
20
+ .show()
21
+ )
@@ -7,41 +7,37 @@ To install the required dependencies:
7
7
 
8
8
  import os
9
9
 
10
- os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
11
-
12
10
  import open_clip
13
11
 
14
12
  import datachain as dc
13
+ from datachain import C, File
14
+
15
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
15
16
 
16
17
 
17
- class ImageEncoder(dc.Mapper):
18
+ class ClipImageEncoder:
18
19
  def __init__(self, model_name: str, pretrained: str):
19
20
  self.model_name = model_name
20
21
  self.pretrained = pretrained
21
-
22
- def setup(self):
23
22
  self.model, _, self.preprocess = open_clip.create_model_and_transforms(
24
23
  self.model_name, self.pretrained
25
24
  )
26
25
 
27
- def process(self, file) -> list[float]:
28
- img = file.read()
29
- img = self.preprocess(img).unsqueeze(0)
30
- emb = self.model.encode_image(img)
31
- return emb[0].tolist()
26
+
27
+ def embeddings(file: File, encoder: ClipImageEncoder) -> list[float]:
28
+ img = file.read()
29
+ img = encoder.preprocess(img).unsqueeze(0)
30
+ emb = encoder.model.encode_image(img)
31
+ return emb[0].tolist()
32
32
 
33
33
 
34
34
  if __name__ == "__main__":
35
- # Run in chain
36
35
  (
37
- dc.read_storage("gs://datachain-demo/dogs-and-cats/", type="image")
38
- .filter(dc.C("file.path").glob("*cat*.jpg"))
39
- .settings(parallel=2)
36
+ dc.read_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
37
+ .filter(C("file.path").glob("*cat*.jpg"))
40
38
  .limit(5)
41
- .map(
42
- ImageEncoder("ViT-B-32", "laion2b_s34b_b79k"),
43
- params=["file"],
44
- output={"emb": list[float]},
45
- )
39
+ .settings(parallel=True)
40
+ .setup(encoder=lambda: ClipImageEncoder("ViT-B-32", "laion2b_s34b_b79k"))
41
+ .map(emb=embeddings)
46
42
  .show()
47
43
  )
@@ -0,0 +1,65 @@
1
+ import os
2
+ import sys
3
+
4
+ import anthropic
5
+ from pydantic import BaseModel
6
+
7
+ import datachain as dc
8
+ from datachain import C, File
9
+
10
+ DATA = "gs://datachain-demo/chatbot-KiT"
11
+ MODEL = "claude-3-5-haiku-latest"
12
+ TEMPERATURE = 0.9
13
+ DEFAULT_OUTPUT_TOKENS = 1024
14
+
15
+ PROMPT = """Consider the dialogue between the 'user' and the 'bot'. The 'user' is a
16
+ human trying to find the best mobile plan. The 'bot' is a chatbot designed to query
17
+ the user and offer the best solution. The dialog is successful if the 'bot' is able to
18
+ gather the information and offer a plan, or inform the user that such plan does not
19
+ exist. The dialog is not successful if the conversation ends early or the 'user'
20
+ requests additional functions the 'bot' cannot perform. Read the dialogue below and
21
+ rate it 'Success' if it is successful, and 'Failure' if not. After that, provide
22
+ one-sentence explanation of the reasons for this rating. Use only JSON object as output
23
+ with the keys 'status', and 'explanation'.
24
+ """
25
+
26
+ API_KEY = os.environ.get("ANTHROPIC_API_KEY")
27
+
28
+ if not API_KEY:
29
+ print("This example requires an Anthropic API key")
30
+ print("Add your key using the ANTHROPIC_API_KEY environment variable.")
31
+ sys.exit(0)
32
+
33
+
34
+ class Rating(BaseModel):
35
+ status: str = ""
36
+ explanation: str = ""
37
+
38
+
39
+ def rate(client: anthropic.Anthropic, file: File) -> Rating:
40
+ content = file.read()
41
+ response = client.messages.create(
42
+ model=MODEL,
43
+ max_tokens=DEFAULT_OUTPUT_TOKENS,
44
+ system=PROMPT,
45
+ temperature=TEMPERATURE,
46
+ messages=[
47
+ {"role": "user", "content": f"{content}"},
48
+ ],
49
+ )
50
+
51
+ first_block = response.content[0]
52
+ if first_block.type == "text":
53
+ return Rating.model_validate_json(first_block.text)
54
+ raise ValueError(f"Unexpected content block type: {first_block.type}")
55
+
56
+
57
+ (
58
+ dc.read_storage(DATA, type="text", anon=True)
59
+ .filter(C("file.path").glob("*.txt"))
60
+ .limit(4)
61
+ .settings(parallel=2, cache=True)
62
+ .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
63
+ .map(rating=rate)
64
+ .show()
65
+ )
@@ -25,20 +25,20 @@ def eval_dialog(
25
25
  ) -> DialogEval:
26
26
  try:
27
27
  completion = client.chat_completion(
28
- model="meta-llama/Llama-3.3-70B-Instruct",
28
+ model="HuggingFaceTB/SmolLM3-3B",
29
29
  messages=[
30
30
  {
31
31
  "role": "user",
32
32
  "content": f"{PROMPT}\n\nUser: {user_input}\nBot: {bot_response}",
33
33
  },
34
34
  ],
35
- response_format={"type": "json", "value": DialogEval.model_json_schema()},
35
+ response_format={
36
+ "type": "json_schema",
37
+ "json_schema": {"schema": DialogEval.model_json_schema()},
38
+ },
36
39
  )
37
40
  except HTTPError as e:
38
- return DialogEval(
39
- result="Error",
40
- reason=f"Error while interacting with the Hugging Face API. {e}",
41
- )
41
+ return DialogEval(result="Error", reason=str(e))
42
42
 
43
43
  message = completion.choices[0].message
44
44
  try: