datachain 0.11.11__tar.gz → 0.13.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (345)
  1. {datachain-0.11.11 → datachain-0.13.0}/.github/workflows/tests.yml +5 -12
  2. {datachain-0.11.11 → datachain-0.13.0}/.pre-commit-config.yaml +1 -1
  3. {datachain-0.11.11 → datachain-0.13.0}/PKG-INFO +4 -2
  4. {datachain-0.11.11 → datachain-0.13.0}/docs/examples.md +4 -6
  5. {datachain-0.11.11 → datachain-0.13.0}/docs/quick-start.md +1 -1
  6. {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/openimage-detect.py +3 -7
  7. {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/ultralytics-bbox.py +1 -9
  8. {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/ultralytics-pose.py +1 -9
  9. {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/ultralytics-segment.py +1 -9
  10. {datachain-0.11.11 → datachain-0.13.0}/noxfile.py +14 -0
  11. {datachain-0.11.11 → datachain-0.13.0}/pyproject.toml +5 -3
  12. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/catalog/catalog.py +39 -7
  13. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/catalog/loader.py +19 -13
  14. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/__init__.py +2 -1
  15. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/ls.py +8 -6
  16. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/show.py +7 -0
  17. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/parser/studio.py +13 -1
  18. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/fsspec.py +12 -16
  19. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/gcs.py +1 -1
  20. datachain-0.13.0/src/datachain/client/hf.py +60 -0
  21. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/local.py +1 -4
  22. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/s3.py +1 -1
  23. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/metastore.py +6 -0
  24. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/warehouse.py +3 -8
  25. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/dataset.py +8 -0
  26. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/error.py +0 -12
  27. datachain-0.13.0/src/datachain/fs/utils.py +30 -0
  28. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/__init__.py +5 -0
  29. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/func.py +2 -1
  30. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/dc.py +59 -15
  31. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/file.py +63 -18
  32. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/image.py +30 -6
  33. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/listing.py +21 -39
  34. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/meta_formats.py +2 -2
  35. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/signal_schema.py +65 -18
  36. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/udf.py +3 -0
  37. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/udf_signature.py +17 -9
  38. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/video.py +7 -5
  39. datachain-0.13.0/src/datachain/model/bbox.py +253 -0
  40. datachain-0.13.0/src/datachain/model/pose.py +100 -0
  41. datachain-0.13.0/src/datachain/model/segment.py +51 -0
  42. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/model/ultralytics/bbox.py +9 -9
  43. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/model/ultralytics/pose.py +7 -7
  44. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/model/ultralytics/segment.py +7 -7
  45. datachain-0.13.0/src/datachain/model/utils.py +191 -0
  46. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/dataset.py +8 -2
  47. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/sqlite/base.py +2 -2
  48. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/studio.py +8 -6
  49. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/utils.py +0 -16
  50. {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/PKG-INFO +4 -2
  51. {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/SOURCES.txt +18 -3
  52. {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/requires.txt +3 -1
  53. {datachain-0.11.11 → datachain-0.13.0}/tests/conftest.py +49 -3
  54. datachain-0.13.0/tests/func/data/lena.jpg +0 -0
  55. datachain-0.13.0/tests/func/model/data/running-mask0.png +0 -0
  56. datachain-0.13.0/tests/func/model/data/running-mask1.png +0 -0
  57. datachain-0.13.0/tests/func/model/data/running.jpg +0 -0
  58. datachain-0.13.0/tests/func/model/data/ships.jpg +0 -0
  59. datachain-0.13.0/tests/func/model/test_yolo.py +2427 -0
  60. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_client.py +0 -19
  61. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_datachain.py +35 -4
  62. datachain-0.13.0/tests/func/test_image.py +68 -0
  63. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_ls.py +0 -9
  64. {datachain-0.11.11/tests/unit/lib → datachain-0.13.0/tests/func}/test_video.py +35 -21
  65. datachain-0.13.0/tests/test_import_time.py +84 -0
  66. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_datachain.py +69 -0
  67. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_datachain_bootstrap.py +2 -2
  68. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_file.py +14 -0
  69. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_image.py +1 -4
  70. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_signal_schema.py +209 -26
  71. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_udf_signature.py +17 -7
  72. datachain-0.13.0/tests/unit/model/test_bbox.py +259 -0
  73. datachain-0.11.11/tests/unit/lib/test_models.py → datachain-0.13.0/tests/unit/model/test_pose.py +72 -51
  74. datachain-0.13.0/tests/unit/model/test_segment.py +53 -0
  75. datachain-0.13.0/tests/unit/model/test_utils.py +92 -0
  76. datachain-0.13.0/tests/unit/sql/__init__.py +0 -0
  77. datachain-0.13.0/tests/unit/sql/sqlite/__init__.py +0 -0
  78. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_array.py +7 -2
  79. {datachain-0.11.11 → datachain-0.13.0}/tests/utils.py +0 -8
  80. datachain-0.11.11/src/datachain/client/hf.py +0 -38
  81. datachain-0.11.11/src/datachain/model/bbox.py +0 -102
  82. datachain-0.11.11/src/datachain/model/pose.py +0 -88
  83. datachain-0.11.11/src/datachain/model/segment.py +0 -47
  84. {datachain-0.11.11 → datachain-0.13.0}/.cruft.json +0 -0
  85. {datachain-0.11.11 → datachain-0.13.0}/.gitattributes +0 -0
  86. {datachain-0.11.11 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  87. {datachain-0.11.11 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  88. {datachain-0.11.11 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  89. {datachain-0.11.11 → datachain-0.13.0}/.github/codecov.yaml +0 -0
  90. {datachain-0.11.11 → datachain-0.13.0}/.github/dependabot.yml +0 -0
  91. {datachain-0.11.11 → datachain-0.13.0}/.github/workflows/benchmarks.yml +0 -0
  92. {datachain-0.11.11 → datachain-0.13.0}/.github/workflows/release.yml +0 -0
  93. {datachain-0.11.11 → datachain-0.13.0}/.github/workflows/tests-studio.yml +0 -0
  94. {datachain-0.11.11 → datachain-0.13.0}/.github/workflows/update-template.yaml +0 -0
  95. {datachain-0.11.11 → datachain-0.13.0}/.gitignore +0 -0
  96. {datachain-0.11.11 → datachain-0.13.0}/CODE_OF_CONDUCT.rst +0 -0
  97. {datachain-0.11.11 → datachain-0.13.0}/LICENSE +0 -0
  98. {datachain-0.11.11 → datachain-0.13.0}/README.rst +0 -0
  99. {datachain-0.11.11 → datachain-0.13.0}/docs/assets/captioned_cartoons.png +0 -0
  100. {datachain-0.11.11 → datachain-0.13.0}/docs/assets/datachain-white.svg +0 -0
  101. {datachain-0.11.11 → datachain-0.13.0}/docs/assets/datachain.svg +0 -0
  102. {datachain-0.11.11 → datachain-0.13.0}/docs/contributing.md +0 -0
  103. {datachain-0.11.11 → datachain-0.13.0}/docs/css/github-permalink-style.css +0 -0
  104. {datachain-0.11.11 → datachain-0.13.0}/docs/index.md +0 -0
  105. {datachain-0.11.11 → datachain-0.13.0}/docs/overrides/main.html +0 -0
  106. {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/arrowrow.md +0 -0
  107. {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/bbox.md +0 -0
  108. {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/file.md +0 -0
  109. {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/imagefile.md +0 -0
  110. {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/index.md +0 -0
  111. {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/pose.md +0 -0
  112. {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/segment.md +0 -0
  113. {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/tarvfile.md +0 -0
  114. {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/textfile.md +0 -0
  115. {datachain-0.11.11 → datachain-0.13.0}/docs/references/data-types/videofile.md +0 -0
  116. {datachain-0.11.11 → datachain-0.13.0}/docs/references/datachain.md +0 -0
  117. {datachain-0.11.11 → datachain-0.13.0}/docs/references/func.md +0 -0
  118. {datachain-0.11.11 → datachain-0.13.0}/docs/references/index.md +0 -0
  119. {datachain-0.11.11 → datachain-0.13.0}/docs/references/remotes.md +0 -0
  120. {datachain-0.11.11 → datachain-0.13.0}/docs/references/toolkit.md +0 -0
  121. {datachain-0.11.11 → datachain-0.13.0}/docs/references/torch.md +0 -0
  122. {datachain-0.11.11 → datachain-0.13.0}/docs/references/udf.md +0 -0
  123. {datachain-0.11.11 → datachain-0.13.0}/docs/tutorials.md +0 -0
  124. {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  125. {datachain-0.11.11 → datachain-0.13.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  126. {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/common_sql_functions.py +0 -0
  127. {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/json-csv-reader.py +0 -0
  128. {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/torch-loader.py +0 -0
  129. {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/udfs/parallel.py +0 -0
  130. {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/udfs/simple.py +0 -0
  131. {datachain-0.11.11 → datachain-0.13.0}/examples/get_started/udfs/stateful.py +0 -0
  132. {datachain-0.11.11 → datachain-0.13.0}/examples/llm_and_nlp/claude-query.py +0 -0
  133. {datachain-0.11.11 → datachain-0.13.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  134. {datachain-0.11.11 → datachain-0.13.0}/examples/multimodal/clip_inference.py +0 -0
  135. {datachain-0.11.11 → datachain-0.13.0}/examples/multimodal/hf_pipeline.py +0 -0
  136. {datachain-0.11.11 → datachain-0.13.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  137. {datachain-0.11.11 → datachain-0.13.0}/examples/multimodal/wds.py +0 -0
  138. {datachain-0.11.11 → datachain-0.13.0}/examples/multimodal/wds_filtered.py +0 -0
  139. {datachain-0.11.11 → datachain-0.13.0}/mkdocs.yml +0 -0
  140. {datachain-0.11.11 → datachain-0.13.0}/setup.cfg +0 -0
  141. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/__init__.py +0 -0
  142. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/__main__.py +0 -0
  143. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/asyn.py +0 -0
  144. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cache.py +0 -0
  145. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/catalog/__init__.py +0 -0
  146. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/catalog/datasource.py +0 -0
  147. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/__init__.py +0 -0
  148. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/datasets.py +0 -0
  149. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/du.py +0 -0
  150. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/index.py +0 -0
  151. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/misc.py +0 -0
  152. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/commands/query.py +0 -0
  153. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/parser/__init__.py +0 -0
  154. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/parser/job.py +0 -0
  155. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/parser/utils.py +0 -0
  156. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/cli/utils.py +0 -0
  157. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/__init__.py +0 -0
  158. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/azure.py +0 -0
  159. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/client/fileslice.py +0 -0
  160. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/config.py +0 -0
  161. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/__init__.py +0 -0
  162. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/db_engine.py +0 -0
  163. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/job.py +0 -0
  164. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/schema.py +0 -0
  165. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/serializer.py +0 -0
  166. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/data_storage/sqlite.py +0 -0
  167. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/diff/__init__.py +0 -0
  168. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/fs/__init__.py +0 -0
  169. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/fs/reference.py +0 -0
  170. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/aggregate.py +0 -0
  171. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/array.py +0 -0
  172. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/base.py +0 -0
  173. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/conditional.py +0 -0
  174. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/numeric.py +0 -0
  175. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/path.py +0 -0
  176. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/random.py +0 -0
  177. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/string.py +0 -0
  178. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/func/window.py +0 -0
  179. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/job.py +0 -0
  180. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/__init__.py +0 -0
  181. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/arrow.py +0 -0
  182. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/clip.py +0 -0
  183. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/__init__.py +0 -0
  184. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/flatten.py +0 -0
  185. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  186. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  187. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/unflatten.py +0 -0
  188. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  189. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/data_model.py +0 -0
  190. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/dataset_info.py +0 -0
  191. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/hf.py +0 -0
  192. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/listing_info.py +0 -0
  193. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/model_store.py +0 -0
  194. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/pytorch.py +0 -0
  195. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/settings.py +0 -0
  196. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/tar.py +0 -0
  197. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/text.py +0 -0
  198. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/utils.py +0 -0
  199. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/webdataset.py +0 -0
  200. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/lib/webdataset_laion.py +0 -0
  201. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/listing.py +0 -0
  202. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/model/__init__.py +0 -0
  203. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  204. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/node.py +0 -0
  205. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/nodes_fetcher.py +0 -0
  206. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/nodes_thread_pool.py +0 -0
  207. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/progress.py +0 -0
  208. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/py.typed +0 -0
  209. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/__init__.py +0 -0
  210. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/batch.py +0 -0
  211. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/dispatch.py +0 -0
  212. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/metrics.py +0 -0
  213. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/params.py +0 -0
  214. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/queue.py +0 -0
  215. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/schema.py +0 -0
  216. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/session.py +0 -0
  217. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/udf.py +0 -0
  218. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/query/utils.py +0 -0
  219. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/remote/__init__.py +0 -0
  220. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/remote/studio.py +0 -0
  221. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/script_meta.py +0 -0
  222. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/__init__.py +0 -0
  223. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/default/__init__.py +0 -0
  224. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/default/base.py +0 -0
  225. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/__init__.py +0 -0
  226. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/aggregate.py +0 -0
  227. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/array.py +0 -0
  228. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/conditional.py +0 -0
  229. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/numeric.py +0 -0
  230. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/path.py +0 -0
  231. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/random.py +0 -0
  232. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/functions/string.py +0 -0
  233. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/selectable.py +0 -0
  234. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  235. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/sqlite/types.py +0 -0
  236. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/sqlite/vector.py +0 -0
  237. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/types.py +0 -0
  238. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/sql/utils.py +0 -0
  239. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/telemetry.py +0 -0
  240. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/toolkit/__init__.py +0 -0
  241. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/toolkit/split.py +0 -0
  242. {datachain-0.11.11 → datachain-0.13.0}/src/datachain/torch/__init__.py +0 -0
  243. {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  244. {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/entry_points.txt +0 -0
  245. {datachain-0.11.11 → datachain-0.13.0}/src/datachain.egg-info/top_level.txt +0 -0
  246. {datachain-0.11.11 → datachain-0.13.0}/tests/__init__.py +0 -0
  247. {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/__init__.py +0 -0
  248. {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/conftest.py +0 -0
  249. {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  250. {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  251. {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/datasets/.gitignore +0 -0
  252. {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  253. {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/test_datachain.py +0 -0
  254. {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/test_ls.py +0 -0
  255. {datachain-0.11.11 → datachain-0.13.0}/tests/benchmarks/test_version.py +0 -0
  256. {datachain-0.11.11 → datachain-0.13.0}/tests/data.py +0 -0
  257. {datachain-0.11.11 → datachain-0.13.0}/tests/examples/__init__.py +0 -0
  258. {datachain-0.11.11 → datachain-0.13.0}/tests/examples/test_examples.py +0 -0
  259. {datachain-0.11.11 → datachain-0.13.0}/tests/examples/test_wds_e2e.py +0 -0
  260. {datachain-0.11.11 → datachain-0.13.0}/tests/examples/wds_data.py +0 -0
  261. {datachain-0.11.11 → datachain-0.13.0}/tests/func/__init__.py +0 -0
  262. {datachain-0.11.11/tests/unit/lib → datachain-0.13.0/tests/func}/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  263. {datachain-0.11.11 → datachain-0.13.0}/tests/func/fake-service-account-credentials.json +0 -0
  264. {datachain-0.11.11/tests/unit → datachain-0.13.0/tests/func/model}/__init__.py +0 -0
  265. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_catalog.py +0 -0
  266. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_cloud_transfer.py +0 -0
  267. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_data_storage.py +0 -0
  268. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_datachain_merge.py +0 -0
  269. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_dataset_query.py +0 -0
  270. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_datasets.py +0 -0
  271. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_feature_pickling.py +0 -0
  272. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_file.py +0 -0
  273. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_hf.py +0 -0
  274. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_hidden_field.py +0 -0
  275. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_listing.py +0 -0
  276. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_meta_formats.py +0 -0
  277. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_metrics.py +0 -0
  278. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_pull.py +0 -0
  279. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_pytorch.py +0 -0
  280. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_query.py +0 -0
  281. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_session.py +0 -0
  282. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_toolkit.py +0 -0
  283. {datachain-0.11.11 → datachain-0.13.0}/tests/func/test_warehouse.py +0 -0
  284. {datachain-0.11.11 → datachain-0.13.0}/tests/scripts/feature_class.py +0 -0
  285. {datachain-0.11.11 → datachain-0.13.0}/tests/scripts/feature_class_exception.py +0 -0
  286. {datachain-0.11.11 → datachain-0.13.0}/tests/scripts/feature_class_parallel.py +0 -0
  287. {datachain-0.11.11 → datachain-0.13.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  288. {datachain-0.11.11 → datachain-0.13.0}/tests/scripts/name_len_slow.py +0 -0
  289. {datachain-0.11.11 → datachain-0.13.0}/tests/test_atomicity.py +0 -0
  290. {datachain-0.11.11 → datachain-0.13.0}/tests/test_cli_e2e.py +0 -0
  291. {datachain-0.11.11 → datachain-0.13.0}/tests/test_cli_studio.py +0 -0
  292. {datachain-0.11.11 → datachain-0.13.0}/tests/test_query_e2e.py +0 -0
  293. {datachain-0.11.11 → datachain-0.13.0}/tests/test_telemetry.py +0 -0
  294. {datachain-0.11.11/tests/unit/lib → datachain-0.13.0/tests/unit}/__init__.py +0 -0
  295. {datachain-0.11.11/tests/unit/sql → datachain-0.13.0/tests/unit/lib}/__init__.py +0 -0
  296. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/conftest.py +0 -0
  297. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_arrow.py +0 -0
  298. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_clip.py +0 -0
  299. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  300. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_diff.py +0 -0
  301. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_feature.py +0 -0
  302. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_feature_utils.py +0 -0
  303. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_hf.py +0 -0
  304. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_listing_info.py +0 -0
  305. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  306. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_schema.py +0 -0
  307. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  308. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_text.py +0 -0
  309. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_utils.py +0 -0
  310. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/lib/test_webdataset.py +0 -0
  311. {datachain-0.11.11/tests/unit/sql/sqlite → datachain-0.13.0/tests/unit/model}/__init__.py +0 -0
  312. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  313. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  314. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_conditional.py +0 -0
  315. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_path.py +0 -0
  316. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_random.py +0 -0
  317. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_selectable.py +0 -0
  318. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/sql/test_string.py +0 -0
  319. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_asyn.py +0 -0
  320. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_cache.py +0 -0
  321. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_catalog.py +0 -0
  322. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_catalog_loader.py +0 -0
  323. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_cli_parsing.py +0 -0
  324. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_client.py +0 -0
  325. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_client_gcs.py +0 -0
  326. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_client_s3.py +0 -0
  327. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_config.py +0 -0
  328. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_data_storage.py +0 -0
  329. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_database_engine.py +0 -0
  330. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_dataset.py +0 -0
  331. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_dispatch.py +0 -0
  332. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_fileslice.py +0 -0
  333. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_func.py +0 -0
  334. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_listing.py +0 -0
  335. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_metastore.py +0 -0
  336. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_module_exports.py +0 -0
  337. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_pytorch.py +0 -0
  338. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_query.py +0 -0
  339. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_query_metrics.py +0 -0
  340. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_query_params.py +0 -0
  341. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_script_meta.py +0 -0
  342. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_serializer.py +0 -0
  343. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_session.py +0 -0
  344. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_utils.py +0 -0
  345. {datachain-0.11.11 → datachain-0.13.0}/tests/unit/test_warehouse.py +0 -0
@@ -80,14 +80,6 @@ jobs:
80
80
 
81
81
  - name: Set up FFmpeg
82
82
  uses: AnimMouse/setup-ffmpeg@v1
83
- id: ffmpeg-install
84
- continue-on-error: ${{ runner.os == 'macOS' }}
85
-
86
- # https://github.com/AnimMouse/setup-ffmpeg/issues/5
87
- - if: steps.ffmpeg-install.outcome == 'failure' && runner.os == 'macOS'
88
- run: brew install ffmpeg
89
- env:
90
- HOMEBREW_NO_AUTO_UPDATE: "1"
91
83
 
92
84
  - name: Set up Python ${{ matrix.pyv }}
93
85
  uses: actions/setup-python@v5
@@ -117,7 +109,7 @@ jobs:
117
109
  shell: bash
118
110
 
119
111
  - name: Run E2E tests
120
- run: nox -s tests-${{ matrix.pyv }} -- -m "e2e" --cov-append $DISABLE_REMOTES_ARG
112
+ run: nox -s e2e-${{ matrix.pyv }}
121
113
  shell: bash
122
114
 
123
115
  - name: Upload coverage report
@@ -141,11 +133,13 @@ jobs:
141
133
  matrix:
142
134
  os: [ubuntu-latest, windows-latest]
143
135
  pyv: ['3.9', '3.13']
144
- group: ['get_started', 'computer_vision', 'llm_and_nlp', 'multimodal']
136
+ group: ['get_started', 'computer_vision', 'multimodal']
145
137
  exclude:
146
138
  - {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
147
139
  - {os: ubuntu-latest, pyv: '3.13', group: 'multimodal'}
148
140
  include:
141
+ # HF runs against actual API - thus run it only once
142
+ - {os: ubuntu-latest, pyv: "3.13", group: llm_and_nlp}
149
143
  - {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
150
144
  - {os: ubuntu-latest-4-cores, pyv: "3.13", group: multimodal}
151
145
 
@@ -169,9 +163,8 @@ jobs:
169
163
  - name: Install nox
170
164
  run: uv pip install nox --system
171
165
 
172
- # HF runs against actual API - thus run it only once
173
166
  - name: Set hf token
174
- if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.13'
167
+ if: matrix.group == 'llm_and_nlp'
175
168
  run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
176
169
 
177
170
  - name: Run examples
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.9.9'
27
+ rev: 'v0.11.0'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.11.11
3
+ Version: 0.13.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -71,6 +71,7 @@ Requires-Dist: usearch; extra == "vector"
71
71
  Provides-Extra: hf
72
72
  Requires-Dist: numba>=0.60.0; extra == "hf"
73
73
  Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
74
+ Requires-Dist: fsspec>=2024.12.0; extra == "hf"
74
75
  Provides-Extra: video
75
76
  Requires-Dist: ffmpeg-python; extra == "video"
76
77
  Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
@@ -90,6 +91,7 @@ Requires-Dist: hypothesis; extra == "tests"
90
91
  Requires-Dist: aiotools>=1.7.0; extra == "tests"
91
92
  Requires-Dist: requests-mock; extra == "tests"
92
93
  Requires-Dist: scipy; extra == "tests"
94
+ Requires-Dist: ultralytics; extra == "tests"
93
95
  Provides-Extra: dev
94
96
  Requires-Dist: datachain[docs,tests]; extra == "dev"
95
97
  Requires-Dist: mypy==1.15.0; extra == "dev"
@@ -103,7 +105,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
103
105
  Requires-Dist: defusedxml; extra == "examples"
104
106
  Requires-Dist: accelerate; extra == "examples"
105
107
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
106
- Requires-Dist: ultralytics==8.3.82; extra == "examples"
108
+ Requires-Dist: ultralytics; extra == "examples"
107
109
  Requires-Dist: open_clip_torch; extra == "examples"
108
110
 
109
111
  ================
@@ -13,7 +13,7 @@ title: Examples
13
13
  For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The code below takes images from the cloud, applies the PaliGemma model to caption the first five of them, and puts the results in the column “scene”:
14
14
 
15
15
  ```python
16
- from datachain.lib.dc import Column, DataChain, File # (1)!
16
+ from datachain import Column, DataChain, File # (1)!
17
17
  from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)!
18
18
 
19
19
  images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
@@ -80,12 +80,10 @@ In the below example, we are calling a Mixtral 8x22b model to judge the “servi
80
80
  # $ export MISTRAL_API_KEY='your key'
81
81
 
82
82
  import os
83
- from datachain.lib.feature import Feature
84
- from datachain.lib.dc import Column, DataChain
83
+ from datachain import Column, DataChain, DataModel, Feature
85
84
  from mistralai.client import MistralClient
86
85
  from mistralai.models.chat_completion import ChatMessage
87
86
  from mistralai.models.chat_completion import ChatCompletionResponse as MistralModel
88
- from datachain.lib.data_model import DataModel
89
87
 
90
88
  prompt = "Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
91
89
  api_key = os.environ["MISTRAL_API_KEY"]
@@ -189,7 +187,7 @@ DataChain library understands common annotation formats (JSON, CSV, webdataset a
189
187
  Here is an example of reading a simple CSV file where the schema is heuristically derived from the header:
190
188
 
191
189
  ```python
192
- from datachain.lib.dc import DataChain
190
+ from datachain import DataChain
193
191
 
194
192
  uri="gs://datachain-demo/chatbot-csv/"
195
193
  csv_dataset = DataChain.from_csv(uri)
@@ -234,7 +232,7 @@ However, Datachain can easily parse the entire COCO structure via several readin
234
232
 
235
233
  ```python
236
234
 
237
- from datachain.lib.dc import Column, DataChain
235
+ from datachain import Column, DataChain
238
236
 
239
237
  images_uri="gs://datachain-demo/coco2017/images/val/"
240
238
  captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json"
@@ -138,7 +138,7 @@ chain = (
138
138
  )
139
139
 
140
140
  successful_chain = chain.filter(Column("is_success") == True)
141
- successful_chain.export_files("./output_mistral")
141
+ successful_chain.to_storage("./output_mistral")
142
142
 
143
143
  print(f"{successful_chain.count()} files were exported")
144
144
  ```
@@ -22,13 +22,9 @@ def openimage_detect(args):
22
22
  detections = json.load(stream_json).get("detections", [])
23
23
 
24
24
  for i, detect in enumerate(detections):
25
- bbox = model.BBox.from_list(
26
- [
27
- detect["XMin"] * img.width,
28
- detect["XMax"] * img.width,
29
- detect["YMin"] * img.height,
30
- detect["YMax"] * img.height,
31
- ]
25
+ bbox = model.BBox.from_albumentations(
26
+ [detect[k] for k in ("XMin", "YMin", "XMax", "YMax")],
27
+ img_size=(img.width, img.height),
32
28
  )
33
29
 
34
30
  fstream = File(
@@ -1,11 +1,3 @@
1
- import os
2
-
3
- os.environ["YOLO_VERBOSE"] = "false"
4
-
5
-
6
- from io import BytesIO
7
-
8
- from PIL import Image
9
1
  from ultralytics import YOLO
10
2
 
11
3
  from datachain import C, DataChain, File
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloBBoxes
13
5
 
14
6
 
15
7
  def process_bboxes(yolo: YOLO, file: File) -> YoloBBoxes:
16
- results = yolo(Image.open(BytesIO(file.read())))
8
+ results = yolo(file.as_image_file().read(), verbose=False)
17
9
  return YoloBBoxes.from_results(results)
18
10
 
19
11
 
@@ -1,11 +1,3 @@
1
- import os
2
-
3
- os.environ["YOLO_VERBOSE"] = "false"
4
-
5
-
6
- from io import BytesIO
7
-
8
- from PIL import Image
9
1
  from ultralytics import YOLO
10
2
 
11
3
  from datachain import C, DataChain, File
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloPoses
13
5
 
14
6
 
15
7
  def process_poses(yolo: YOLO, file: File) -> YoloPoses:
16
- results = yolo(Image.open(BytesIO(file.read())))
8
+ results = yolo(file.as_image_file().read(), verbose=False)
17
9
  return YoloPoses.from_results(results)
18
10
 
19
11
 
@@ -1,11 +1,3 @@
1
- import os
2
-
3
- os.environ["YOLO_VERBOSE"] = "false"
4
-
5
-
6
- from io import BytesIO
7
-
8
- from PIL import Image
9
1
  from ultralytics import YOLO
10
2
 
11
3
  from datachain import C, DataChain, File
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloSegments
13
5
 
14
6
 
15
7
  def process_segments(yolo: YOLO, file: File) -> YoloSegments:
16
- results = yolo(Image.open(BytesIO(file.read())))
8
+ results = yolo(file.as_image_file().read(), verbose=False)
17
9
  return YoloSegments.from_results(results)
18
10
 
19
11
 
@@ -56,6 +56,20 @@ def tests(session: nox.Session) -> None:
56
56
  )
57
57
 
58
58
 
59
+ @nox.session(python=python_versions)
60
+ def e2e(session: nox.Session) -> None:
61
+ session.install(".[tests]")
62
+ session.run(
63
+ "pytest",
64
+ "--durations=0",
65
+ "--numprocesses=logical",
66
+ "--dist=loadgroup",
67
+ "-m",
68
+ "e2e",
69
+ *session.posargs,
70
+ )
71
+
72
+
59
73
  @nox.session
60
74
  def lint(session: nox.Session) -> None:
61
75
  session.install("pre-commit")
@@ -80,7 +80,8 @@ vector = [
80
80
  ]
81
81
  hf = [
82
82
  "numba>=0.60.0",
83
- "datasets[audio,vision]>=2.21.0"
83
+ "datasets[audio,vision]>=2.21.0",
84
+ "fsspec>=2024.12.0"
84
85
  ]
85
86
  video = [
86
87
  "ffmpeg-python",
@@ -101,7 +102,8 @@ tests = [
101
102
  "hypothesis",
102
103
  "aiotools>=1.7.0",
103
104
  "requests-mock",
104
- "scipy"
105
+ "scipy",
106
+ "ultralytics"
105
107
  ]
106
108
  dev = [
107
109
  "datachain[docs,tests]",
@@ -117,7 +119,7 @@ examples = [
117
119
  "defusedxml",
118
120
  "accelerate",
119
121
  "huggingface_hub[hf_transfer]",
120
- "ultralytics==8.3.82",
122
+ "ultralytics",
121
123
  "open_clip_torch"
122
124
  ]
123
125
 
@@ -25,7 +25,6 @@ from typing import (
25
25
  )
26
26
  from uuid import uuid4
27
27
 
28
- import requests
29
28
  import sqlalchemy as sa
30
29
  from sqlalchemy import Column
31
30
  from tqdm.auto import tqdm
@@ -54,7 +53,6 @@ from datachain.error import (
54
53
  from datachain.lib.listing import get_listing
55
54
  from datachain.node import DirType, Node, NodeWithPath
56
55
  from datachain.nodes_thread_pool import NodesThreadPool
57
- from datachain.remote.studio import StudioClient
58
56
  from datachain.sql.types import DateTime, SQLType
59
57
  from datachain.utils import DataChainDir
60
58
 
@@ -162,6 +160,8 @@ class DatasetRowsFetcher(NodesThreadPool):
162
160
  max_threads: int = PULL_DATASET_MAX_THREADS,
163
161
  progress_bar=None,
164
162
  ):
163
+ from datachain.remote.studio import StudioClient
164
+
165
165
  super().__init__(max_threads)
166
166
  self._check_dependencies()
167
167
  self.metastore = metastore
@@ -234,6 +234,8 @@ class DatasetRowsFetcher(NodesThreadPool):
234
234
  return df.drop("sys__id", axis=1)
235
235
 
236
236
  def get_parquet_content(self, url: str):
237
+ import requests
238
+
237
239
  while True:
238
240
  if self.should_check_for_status():
239
241
  self.check_for_status()
@@ -775,6 +777,8 @@ class Catalog:
775
777
  validate_version: Optional[bool] = True,
776
778
  listing: Optional[bool] = False,
777
779
  uuid: Optional[str] = None,
780
+ description: Optional[str] = None,
781
+ labels: Optional[list[str]] = None,
778
782
  ) -> "DatasetRecord":
779
783
  """
780
784
  Creates new dataset of a specific version.
@@ -801,6 +805,8 @@ class Catalog:
801
805
  query_script=query_script,
802
806
  schema=schema,
803
807
  ignore_if_exists=True,
808
+ description=description,
809
+ labels=labels,
804
810
  )
805
811
 
806
812
  version = version or default_version
@@ -1130,6 +1136,8 @@ class Catalog:
1130
1136
  raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")
1131
1137
 
1132
1138
  def get_remote_dataset(self, name: str) -> DatasetRecord:
1139
+ from datachain.remote.studio import StudioClient
1140
+
1133
1141
  studio_client = StudioClient()
1134
1142
 
1135
1143
  info_response = studio_client.dataset_info(name)
@@ -1164,8 +1172,27 @@ class Catalog:
1164
1172
 
1165
1173
  return direct_dependencies
1166
1174
 
1167
- def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetListRecord]:
1168
- datasets = self.metastore.list_datasets()
1175
+ def ls_datasets(
1176
+ self, include_listing: bool = False, studio: bool = False
1177
+ ) -> Iterator[DatasetListRecord]:
1178
+ from datachain.remote.studio import StudioClient
1179
+
1180
+ if studio:
1181
+ client = StudioClient()
1182
+ response = client.ls_datasets()
1183
+ if not response.ok:
1184
+ raise DataChainError(response.message)
1185
+ if not response.data:
1186
+ return
1187
+
1188
+ datasets: Iterator[DatasetListRecord] = (
1189
+ DatasetListRecord.from_dict(d)
1190
+ for d in response.data
1191
+ if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
1192
+ )
1193
+ else:
1194
+ datasets = self.metastore.list_datasets()
1195
+
1169
1196
  for d in datasets:
1170
1197
  if not d.is_bucket_listing or include_listing:
1171
1198
  yield d
@@ -1173,9 +1200,12 @@ class Catalog:
1173
1200
  def list_datasets_versions(
1174
1201
  self,
1175
1202
  include_listing: bool = False,
1203
+ studio: bool = False,
1176
1204
  ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
1177
1205
  """Iterate over all dataset versions with related jobs."""
1178
- datasets = list(self.ls_datasets(include_listing=include_listing))
1206
+ datasets = list(
1207
+ self.ls_datasets(include_listing=include_listing, studio=studio)
1208
+ )
1179
1209
 
1180
1210
  # preselect dataset versions jobs from db to avoid multiple queries
1181
1211
  jobs_ids: set[str] = {
@@ -1345,6 +1375,8 @@ class Catalog:
1345
1375
  if cp and not output:
1346
1376
  raise ValueError("Please provide output directory for instantiation")
1347
1377
 
1378
+ from datachain.remote.studio import StudioClient
1379
+
1348
1380
  studio_client = StudioClient()
1349
1381
 
1350
1382
  try:
@@ -1580,7 +1612,7 @@ class Catalog:
1580
1612
  except TerminationSignal as exc:
1581
1613
  signal.signal(signal.SIGTERM, orig_sigterm_handler)
1582
1614
  signal.signal(signal.SIGINT, orig_sigint_handler)
1583
- logging.info("Shutting down process %s, received %r", proc.pid, exc)
1615
+ logger.info("Shutting down process %s, received %r", proc.pid, exc)
1584
1616
  # Rather than forwarding the signal to the child, we try to shut it down
1585
1617
  # gracefully. This is because we consider the script to be interactive
1586
1618
  # and special, so we give it time to cleanup before exiting.
@@ -1595,7 +1627,7 @@ class Catalog:
1595
1627
  if thread:
1596
1628
  thread.join() # wait for the reader thread
1597
1629
 
1598
- logging.info("Process %s exited with return code %s", proc.pid, proc.returncode)
1630
+ logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
1599
1631
  if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
1600
1632
  raise QueryScriptCancelError(
1601
1633
  "Query script was canceled by user",
@@ -1,19 +1,13 @@
1
1
  import os
2
2
  from importlib import import_module
3
- from typing import Any, Optional
4
-
5
- from datachain.catalog import Catalog
6
- from datachain.data_storage import (
7
- AbstractMetastore,
8
- AbstractWarehouse,
9
- )
10
- from datachain.data_storage.serializer import deserialize
11
- from datachain.data_storage.sqlite import (
12
- SQLiteMetastore,
13
- SQLiteWarehouse,
14
- )
3
+ from typing import TYPE_CHECKING, Any, Optional
4
+
15
5
  from datachain.utils import get_envs_by_prefix
16
6
 
7
+ if TYPE_CHECKING:
8
+ from datachain.catalog import Catalog
9
+ from datachain.data_storage import AbstractMetastore, AbstractWarehouse
10
+
17
11
  METASTORE_SERIALIZED = "DATACHAIN__METASTORE"
18
12
  METASTORE_IMPORT_PATH = "DATACHAIN_METASTORE"
19
13
  METASTORE_ARG_PREFIX = "DATACHAIN_METASTORE_ARG_"
@@ -27,6 +21,9 @@ IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
27
21
 
28
22
 
29
23
  def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
24
+ from datachain.data_storage import AbstractMetastore
25
+ from datachain.data_storage.serializer import deserialize
26
+
30
27
  metastore_serialized = os.environ.get(METASTORE_SERIALIZED)
31
28
  if metastore_serialized:
32
29
  metastore_obj = deserialize(metastore_serialized)
@@ -45,6 +42,8 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
45
42
  }
46
43
 
47
44
  if not metastore_import_path:
45
+ from datachain.data_storage.sqlite import SQLiteMetastore
46
+
48
47
  metastore_args["in_memory"] = in_memory
49
48
  return SQLiteMetastore(**metastore_args)
50
49
  if in_memory:
@@ -62,6 +61,9 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
62
61
 
63
62
 
64
63
  def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
64
+ from datachain.data_storage import AbstractWarehouse
65
+ from datachain.data_storage.serializer import deserialize
66
+
65
67
  warehouse_serialized = os.environ.get(WAREHOUSE_SERIALIZED)
66
68
  if warehouse_serialized:
67
69
  warehouse_obj = deserialize(warehouse_serialized)
@@ -80,6 +82,8 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
80
82
  }
81
83
 
82
84
  if not warehouse_import_path:
85
+ from datachain.data_storage.sqlite import SQLiteWarehouse
86
+
83
87
  warehouse_args["in_memory"] = in_memory
84
88
  return SQLiteWarehouse(**warehouse_args)
85
89
  if in_memory:
@@ -121,7 +125,7 @@ def get_distributed_class(**kwargs):
121
125
 
122
126
  def get_catalog(
123
127
  client_config: Optional[dict[str, Any]] = None, in_memory: bool = False
124
- ) -> Catalog:
128
+ ) -> "Catalog":
125
129
  """
126
130
  Function that creates Catalog instance with appropriate metastore
127
131
  and warehouse classes. Metastore class can be provided with env variable
@@ -133,6 +137,8 @@ def get_catalog(
133
137
  and name of variable after, e.g. if it accepts team_id as kwargs
134
138
  we can provide DATACHAIN_METASTORE_ARG_TEAM_ID=12345 env variable.
135
139
  """
140
+ from datachain.catalog import Catalog
141
+
136
142
  return Catalog(
137
143
  metastore=get_metastore(in_memory=in_memory),
138
144
  warehouse=get_warehouse(in_memory=in_memory),
@@ -6,7 +6,6 @@ from multiprocessing import freeze_support
6
6
  from typing import Optional
7
7
 
8
8
  from datachain.cli.utils import get_logging_level
9
- from datachain.telemetry import telemetry
10
9
 
11
10
  from .commands import (
12
11
  clear_cache,
@@ -70,6 +69,8 @@ def main(argv: Optional[list[str]] = None) -> int:
70
69
  error, return_code = handle_general_exception(exc, args, logging_level)
71
70
  return return_code
72
71
  finally:
72
+ from datachain.telemetry import telemetry
73
+
73
74
  telemetry.send_cli_call(args.command, error=error)
74
75
 
75
76
 
@@ -38,11 +38,12 @@ def ls_local(
38
38
  ):
39
39
  from datachain import DataChain
40
40
 
41
- if catalog is None:
42
- from datachain.catalog import get_catalog
43
-
44
- catalog = get_catalog(client_config=client_config)
45
41
  if sources:
42
+ if catalog is None:
43
+ from datachain.catalog import get_catalog
44
+
45
+ catalog = get_catalog(client_config=client_config)
46
+
46
47
  actual_sources = list(ls_urls(sources, catalog=catalog, long=long, **kwargs))
47
48
  if len(actual_sources) == 1:
48
49
  for _, entries in actual_sources:
@@ -61,8 +62,9 @@ def ls_local(
61
62
  for entry in entries:
62
63
  print(format_ls_entry(entry))
63
64
  else:
64
- chain = DataChain.listings()
65
- for ls in chain.collect("listing"):
65
+ # Collect results in a list here to prevent interference from `tqdm` and `print`
66
+ listing = list(DataChain.listings().collect("listing"))
67
+ for ls in listing:
66
68
  print(format_ls_entry(f"{ls.uri}@v{ls.version}")) # type: ignore[union-attr]
67
69
 
68
70
 
@@ -40,6 +40,13 @@ def show(
40
40
  .offset(offset)
41
41
  )
42
42
  records = query.to_db_records()
43
+ print("Name: ", name)
44
+ if dataset.description:
45
+ print("Description: ", dataset.description)
46
+ if dataset.labels:
47
+ print("Labels: ", ",".join(dataset.labels))
48
+ print("\n")
49
+
43
50
  show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
44
51
 
45
52
  if schema and dataset_version.feature_schema:
@@ -63,19 +63,31 @@ def add_auth_parser(subparsers, parent_parser) -> None:
63
63
  default=False,
64
64
  help="Use code-based authentication without browser",
65
65
  )
66
+ login_parser.add_argument(
67
+ "--local",
68
+ action="store_true",
69
+ default=False,
70
+ help="Save the token in the local project config",
71
+ )
66
72
 
67
73
  auth_logout_help = "Log out from Studio"
68
74
  auth_logout_description = (
69
75
  "Remove the Studio authentication token from global config."
70
76
  )
71
77
 
72
- auth_subparser.add_parser(
78
+ logout_parser = auth_subparser.add_parser(
73
79
  "logout",
74
80
  parents=[parent_parser],
75
81
  description=auth_logout_description,
76
82
  help=auth_logout_help,
77
83
  formatter_class=CustomHelpFormatter,
78
84
  )
85
+ logout_parser.add_argument(
86
+ "--local",
87
+ action="store_true",
88
+ default=False,
89
+ help="Remove the token from the local project config",
90
+ )
79
91
 
80
92
  auth_team_help = "Set default team for Studio operations"
81
93
  auth_team_description = "Set the default team for Studio operations."
@@ -17,10 +17,10 @@ from typing import (
17
17
  ClassVar,
18
18
  NamedTuple,
19
19
  Optional,
20
+ Union,
20
21
  )
21
22
  from urllib.parse import urlparse
22
23
 
23
- from botocore.exceptions import ClientError
24
24
  from dvc_objects.fs.system import reflink
25
25
  from fsspec.asyn import get_loop, sync
26
26
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -28,7 +28,6 @@ from tqdm.auto import tqdm
28
28
 
29
29
  from datachain.cache import Cache
30
30
  from datachain.client.fileslice import FileWrapper
31
- from datachain.error import ClientError as DataChainClientError
32
31
  from datachain.nodes_fetcher import NodesFetcher
33
32
  from datachain.nodes_thread_pool import NodeChunk
34
33
 
@@ -83,19 +82,17 @@ class Client(ABC):
83
82
  self.uri = self.get_uri(self.name)
84
83
 
85
84
  @staticmethod
86
- def get_implementation(url: str) -> type["Client"]:
85
+ def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:
87
86
  from .azure import AzureClient
88
87
  from .gcs import GCSClient
89
88
  from .hf import HfClient
90
89
  from .local import FileClient
91
90
  from .s3 import ClientS3
92
91
 
93
- protocol = urlparse(url).scheme
92
+ protocol = urlparse(str(url)).scheme
94
93
 
95
- if not protocol or _is_win_local_path(url):
94
+ if not protocol or _is_win_local_path(str(url)):
96
95
  return FileClient
97
-
98
- protocol = protocol.lower()
99
96
  if protocol == ClientS3.protocol:
100
97
  return ClientS3
101
98
  if protocol == GCSClient.protocol:
@@ -121,9 +118,11 @@ class Client(ABC):
121
118
  return cls.get_uri(storage_name), rel_path
122
119
 
123
120
  @staticmethod
124
- def get_client(source: str, cache: Cache, **kwargs) -> "Client":
121
+ def get_client(
122
+ source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
123
+ ) -> "Client":
125
124
  cls = Client.get_implementation(source)
126
- storage_url, _ = cls.split_url(source)
125
+ storage_url, _ = cls.split_url(str(source))
127
126
  if os.name == "nt":
128
127
  storage_url = storage_url.removeprefix("/")
129
128
 
@@ -209,7 +208,7 @@ class Client(ABC):
209
208
 
210
209
  async def get_current_etag(self, file: "File") -> str:
211
210
  kwargs = {}
212
- if self.fs.version_aware:
211
+ if getattr(self.fs, "version_aware", False):
213
212
  kwargs["version_id"] = file.version
214
213
  info = await self.fs._info(
215
214
  self.get_full_path(file.path, file.version), **kwargs
@@ -286,11 +285,6 @@ class Client(ABC):
286
285
  worker.cancel()
287
286
  if excs:
288
287
  raise excs[0]
289
- except ClientError as exc:
290
- raise DataChainClientError(
291
- exc.response.get("Error", {}).get("Message") or exc,
292
- exc.response.get("Error", {}).get("Code"),
293
- ) from exc
294
288
  finally:
295
289
  # This ensures the progress bar is closed before any exceptions are raised
296
290
  progress_bar.close()
@@ -333,7 +327,9 @@ class Client(ABC):
333
327
  return not (key.startswith("/") or key.endswith("/") or "//" in key)
334
328
 
335
329
  async def ls_dir(self, path):
336
- return await self.fs._ls(path, detail=True, versions=True)
330
+ if getattr(self.fs, "version_aware", False):
331
+ kwargs = {"versions": True}
332
+ return await self.fs._ls(path, detail=True, **kwargs)
337
333
 
338
334
  def rel_path(self, path: str) -> str:
339
335
  return self.fs.split_path(path)[1]