datachain 0.11.11__tar.gz → 0.12.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (345)
  1. {datachain-0.11.11 → datachain-0.12.0}/.github/workflows/tests.yml +5 -12
  2. {datachain-0.11.11 → datachain-0.12.0}/.pre-commit-config.yaml +1 -1
  3. {datachain-0.11.11 → datachain-0.12.0}/PKG-INFO +4 -2
  4. {datachain-0.11.11 → datachain-0.12.0}/docs/examples.md +4 -6
  5. {datachain-0.11.11 → datachain-0.12.0}/docs/quick-start.md +1 -1
  6. {datachain-0.11.11 → datachain-0.12.0}/examples/computer_vision/openimage-detect.py +3 -7
  7. {datachain-0.11.11 → datachain-0.12.0}/examples/computer_vision/ultralytics-bbox.py +1 -9
  8. {datachain-0.11.11 → datachain-0.12.0}/examples/computer_vision/ultralytics-pose.py +1 -9
  9. {datachain-0.11.11 → datachain-0.12.0}/examples/computer_vision/ultralytics-segment.py +1 -9
  10. {datachain-0.11.11 → datachain-0.12.0}/noxfile.py +14 -0
  11. {datachain-0.11.11 → datachain-0.12.0}/pyproject.toml +5 -3
  12. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/catalog/catalog.py +33 -5
  13. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/catalog/loader.py +19 -13
  14. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/__init__.py +2 -1
  15. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/parser/studio.py +13 -1
  16. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/client/fsspec.py +12 -16
  17. datachain-0.12.0/src/datachain/client/hf.py +60 -0
  18. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/client/local.py +1 -4
  19. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/data_storage/warehouse.py +3 -8
  20. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/dataset.py +8 -0
  21. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/error.py +0 -12
  22. datachain-0.12.0/src/datachain/fs/utils.py +30 -0
  23. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/func/__init__.py +5 -0
  24. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/func/func.py +2 -1
  25. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/dc.py +23 -8
  26. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/file.py +55 -17
  27. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/image.py +30 -6
  28. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/listing.py +21 -39
  29. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/video.py +7 -5
  30. datachain-0.12.0/src/datachain/model/bbox.py +253 -0
  31. datachain-0.12.0/src/datachain/model/pose.py +100 -0
  32. datachain-0.12.0/src/datachain/model/segment.py +51 -0
  33. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/model/ultralytics/bbox.py +9 -9
  34. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/model/ultralytics/pose.py +7 -7
  35. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/model/ultralytics/segment.py +7 -7
  36. datachain-0.12.0/src/datachain/model/utils.py +191 -0
  37. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/query/dataset.py +4 -2
  38. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/studio.py +8 -6
  39. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/utils.py +0 -16
  40. {datachain-0.11.11 → datachain-0.12.0}/src/datachain.egg-info/PKG-INFO +4 -2
  41. {datachain-0.11.11 → datachain-0.12.0}/src/datachain.egg-info/SOURCES.txt +18 -3
  42. {datachain-0.11.11 → datachain-0.12.0}/src/datachain.egg-info/requires.txt +3 -1
  43. {datachain-0.11.11 → datachain-0.12.0}/tests/conftest.py +49 -3
  44. datachain-0.12.0/tests/func/data/lena.jpg +0 -0
  45. datachain-0.12.0/tests/func/model/data/running-mask0.png +0 -0
  46. datachain-0.12.0/tests/func/model/data/running-mask1.png +0 -0
  47. datachain-0.12.0/tests/func/model/data/running.jpg +0 -0
  48. datachain-0.12.0/tests/func/model/data/ships.jpg +0 -0
  49. datachain-0.12.0/tests/func/model/test_yolo.py +2427 -0
  50. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_client.py +0 -19
  51. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_datachain.py +19 -3
  52. datachain-0.12.0/tests/func/test_image.py +68 -0
  53. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_ls.py +0 -9
  54. {datachain-0.11.11/tests/unit/lib → datachain-0.12.0/tests/func}/test_video.py +35 -21
  55. datachain-0.12.0/tests/test_import_time.py +84 -0
  56. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_datachain.py +20 -0
  57. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_file.py +14 -0
  58. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_image.py +1 -4
  59. datachain-0.12.0/tests/unit/model/test_bbox.py +259 -0
  60. datachain-0.11.11/tests/unit/lib/test_models.py → datachain-0.12.0/tests/unit/model/test_pose.py +72 -51
  61. datachain-0.12.0/tests/unit/model/test_segment.py +53 -0
  62. datachain-0.12.0/tests/unit/model/test_utils.py +92 -0
  63. datachain-0.12.0/tests/unit/sql/__init__.py +0 -0
  64. datachain-0.12.0/tests/unit/sql/sqlite/__init__.py +0 -0
  65. {datachain-0.11.11 → datachain-0.12.0}/tests/utils.py +0 -8
  66. datachain-0.11.11/src/datachain/client/hf.py +0 -38
  67. datachain-0.11.11/src/datachain/model/bbox.py +0 -102
  68. datachain-0.11.11/src/datachain/model/pose.py +0 -88
  69. datachain-0.11.11/src/datachain/model/segment.py +0 -47
  70. {datachain-0.11.11 → datachain-0.12.0}/.cruft.json +0 -0
  71. {datachain-0.11.11 → datachain-0.12.0}/.gitattributes +0 -0
  72. {datachain-0.11.11 → datachain-0.12.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  73. {datachain-0.11.11 → datachain-0.12.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  74. {datachain-0.11.11 → datachain-0.12.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  75. {datachain-0.11.11 → datachain-0.12.0}/.github/codecov.yaml +0 -0
  76. {datachain-0.11.11 → datachain-0.12.0}/.github/dependabot.yml +0 -0
  77. {datachain-0.11.11 → datachain-0.12.0}/.github/workflows/benchmarks.yml +0 -0
  78. {datachain-0.11.11 → datachain-0.12.0}/.github/workflows/release.yml +0 -0
  79. {datachain-0.11.11 → datachain-0.12.0}/.github/workflows/tests-studio.yml +0 -0
  80. {datachain-0.11.11 → datachain-0.12.0}/.github/workflows/update-template.yaml +0 -0
  81. {datachain-0.11.11 → datachain-0.12.0}/.gitignore +0 -0
  82. {datachain-0.11.11 → datachain-0.12.0}/CODE_OF_CONDUCT.rst +0 -0
  83. {datachain-0.11.11 → datachain-0.12.0}/LICENSE +0 -0
  84. {datachain-0.11.11 → datachain-0.12.0}/README.rst +0 -0
  85. {datachain-0.11.11 → datachain-0.12.0}/docs/assets/captioned_cartoons.png +0 -0
  86. {datachain-0.11.11 → datachain-0.12.0}/docs/assets/datachain-white.svg +0 -0
  87. {datachain-0.11.11 → datachain-0.12.0}/docs/assets/datachain.svg +0 -0
  88. {datachain-0.11.11 → datachain-0.12.0}/docs/contributing.md +0 -0
  89. {datachain-0.11.11 → datachain-0.12.0}/docs/css/github-permalink-style.css +0 -0
  90. {datachain-0.11.11 → datachain-0.12.0}/docs/index.md +0 -0
  91. {datachain-0.11.11 → datachain-0.12.0}/docs/overrides/main.html +0 -0
  92. {datachain-0.11.11 → datachain-0.12.0}/docs/references/data-types/arrowrow.md +0 -0
  93. {datachain-0.11.11 → datachain-0.12.0}/docs/references/data-types/bbox.md +0 -0
  94. {datachain-0.11.11 → datachain-0.12.0}/docs/references/data-types/file.md +0 -0
  95. {datachain-0.11.11 → datachain-0.12.0}/docs/references/data-types/imagefile.md +0 -0
  96. {datachain-0.11.11 → datachain-0.12.0}/docs/references/data-types/index.md +0 -0
  97. {datachain-0.11.11 → datachain-0.12.0}/docs/references/data-types/pose.md +0 -0
  98. {datachain-0.11.11 → datachain-0.12.0}/docs/references/data-types/segment.md +0 -0
  99. {datachain-0.11.11 → datachain-0.12.0}/docs/references/data-types/tarvfile.md +0 -0
  100. {datachain-0.11.11 → datachain-0.12.0}/docs/references/data-types/textfile.md +0 -0
  101. {datachain-0.11.11 → datachain-0.12.0}/docs/references/data-types/videofile.md +0 -0
  102. {datachain-0.11.11 → datachain-0.12.0}/docs/references/datachain.md +0 -0
  103. {datachain-0.11.11 → datachain-0.12.0}/docs/references/func.md +0 -0
  104. {datachain-0.11.11 → datachain-0.12.0}/docs/references/index.md +0 -0
  105. {datachain-0.11.11 → datachain-0.12.0}/docs/references/remotes.md +0 -0
  106. {datachain-0.11.11 → datachain-0.12.0}/docs/references/toolkit.md +0 -0
  107. {datachain-0.11.11 → datachain-0.12.0}/docs/references/torch.md +0 -0
  108. {datachain-0.11.11 → datachain-0.12.0}/docs/references/udf.md +0 -0
  109. {datachain-0.11.11 → datachain-0.12.0}/docs/tutorials.md +0 -0
  110. {datachain-0.11.11 → datachain-0.12.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  111. {datachain-0.11.11 → datachain-0.12.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  112. {datachain-0.11.11 → datachain-0.12.0}/examples/get_started/common_sql_functions.py +0 -0
  113. {datachain-0.11.11 → datachain-0.12.0}/examples/get_started/json-csv-reader.py +0 -0
  114. {datachain-0.11.11 → datachain-0.12.0}/examples/get_started/torch-loader.py +0 -0
  115. {datachain-0.11.11 → datachain-0.12.0}/examples/get_started/udfs/parallel.py +0 -0
  116. {datachain-0.11.11 → datachain-0.12.0}/examples/get_started/udfs/simple.py +0 -0
  117. {datachain-0.11.11 → datachain-0.12.0}/examples/get_started/udfs/stateful.py +0 -0
  118. {datachain-0.11.11 → datachain-0.12.0}/examples/llm_and_nlp/claude-query.py +0 -0
  119. {datachain-0.11.11 → datachain-0.12.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  120. {datachain-0.11.11 → datachain-0.12.0}/examples/multimodal/clip_inference.py +0 -0
  121. {datachain-0.11.11 → datachain-0.12.0}/examples/multimodal/hf_pipeline.py +0 -0
  122. {datachain-0.11.11 → datachain-0.12.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  123. {datachain-0.11.11 → datachain-0.12.0}/examples/multimodal/wds.py +0 -0
  124. {datachain-0.11.11 → datachain-0.12.0}/examples/multimodal/wds_filtered.py +0 -0
  125. {datachain-0.11.11 → datachain-0.12.0}/mkdocs.yml +0 -0
  126. {datachain-0.11.11 → datachain-0.12.0}/setup.cfg +0 -0
  127. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/__init__.py +0 -0
  128. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/__main__.py +0 -0
  129. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/asyn.py +0 -0
  130. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cache.py +0 -0
  131. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/catalog/__init__.py +0 -0
  132. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/catalog/datasource.py +0 -0
  133. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/commands/__init__.py +0 -0
  134. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/commands/datasets.py +0 -0
  135. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/commands/du.py +0 -0
  136. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/commands/index.py +0 -0
  137. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/commands/ls.py +0 -0
  138. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/commands/misc.py +0 -0
  139. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/commands/query.py +0 -0
  140. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/commands/show.py +0 -0
  141. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/parser/__init__.py +0 -0
  142. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/parser/job.py +0 -0
  143. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/parser/utils.py +0 -0
  144. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/cli/utils.py +0 -0
  145. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/client/__init__.py +0 -0
  146. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/client/azure.py +0 -0
  147. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/client/fileslice.py +0 -0
  148. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/client/gcs.py +0 -0
  149. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/client/s3.py +0 -0
  150. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/config.py +0 -0
  151. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/data_storage/__init__.py +0 -0
  152. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/data_storage/db_engine.py +0 -0
  153. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/data_storage/job.py +0 -0
  154. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/data_storage/metastore.py +0 -0
  155. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/data_storage/schema.py +0 -0
  156. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/data_storage/serializer.py +0 -0
  157. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/data_storage/sqlite.py +0 -0
  158. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/diff/__init__.py +0 -0
  159. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/fs/__init__.py +0 -0
  160. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/fs/reference.py +0 -0
  161. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/func/aggregate.py +0 -0
  162. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/func/array.py +0 -0
  163. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/func/base.py +0 -0
  164. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/func/conditional.py +0 -0
  165. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/func/numeric.py +0 -0
  166. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/func/path.py +0 -0
  167. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/func/random.py +0 -0
  168. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/func/string.py +0 -0
  169. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/func/window.py +0 -0
  170. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/job.py +0 -0
  171. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/__init__.py +0 -0
  172. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/arrow.py +0 -0
  173. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/clip.py +0 -0
  174. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/convert/__init__.py +0 -0
  175. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/convert/flatten.py +0 -0
  176. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  177. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  178. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/convert/unflatten.py +0 -0
  179. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  180. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/data_model.py +0 -0
  181. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/dataset_info.py +0 -0
  182. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/hf.py +0 -0
  183. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/listing_info.py +0 -0
  184. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/meta_formats.py +0 -0
  185. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/model_store.py +0 -0
  186. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/pytorch.py +0 -0
  187. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/settings.py +0 -0
  188. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/signal_schema.py +0 -0
  189. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/tar.py +0 -0
  190. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/text.py +0 -0
  191. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/udf.py +0 -0
  192. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/udf_signature.py +0 -0
  193. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/utils.py +0 -0
  194. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/webdataset.py +0 -0
  195. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/lib/webdataset_laion.py +0 -0
  196. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/listing.py +0 -0
  197. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/model/__init__.py +0 -0
  198. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  199. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/node.py +0 -0
  200. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/nodes_fetcher.py +0 -0
  201. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/nodes_thread_pool.py +0 -0
  202. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/progress.py +0 -0
  203. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/py.typed +0 -0
  204. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/query/__init__.py +0 -0
  205. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/query/batch.py +0 -0
  206. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/query/dispatch.py +0 -0
  207. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/query/metrics.py +0 -0
  208. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/query/params.py +0 -0
  209. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/query/queue.py +0 -0
  210. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/query/schema.py +0 -0
  211. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/query/session.py +0 -0
  212. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/query/udf.py +0 -0
  213. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/query/utils.py +0 -0
  214. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/remote/__init__.py +0 -0
  215. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/remote/studio.py +0 -0
  216. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/script_meta.py +0 -0
  217. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/__init__.py +0 -0
  218. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/default/__init__.py +0 -0
  219. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/default/base.py +0 -0
  220. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/functions/__init__.py +0 -0
  221. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/functions/aggregate.py +0 -0
  222. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/functions/array.py +0 -0
  223. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/functions/conditional.py +0 -0
  224. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/functions/numeric.py +0 -0
  225. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/functions/path.py +0 -0
  226. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/functions/random.py +0 -0
  227. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/functions/string.py +0 -0
  228. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/selectable.py +0 -0
  229. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  230. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/sqlite/base.py +0 -0
  231. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/sqlite/types.py +0 -0
  232. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/sqlite/vector.py +0 -0
  233. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/types.py +0 -0
  234. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/sql/utils.py +0 -0
  235. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/telemetry.py +0 -0
  236. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/toolkit/__init__.py +0 -0
  237. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/toolkit/split.py +0 -0
  238. {datachain-0.11.11 → datachain-0.12.0}/src/datachain/torch/__init__.py +0 -0
  239. {datachain-0.11.11 → datachain-0.12.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  240. {datachain-0.11.11 → datachain-0.12.0}/src/datachain.egg-info/entry_points.txt +0 -0
  241. {datachain-0.11.11 → datachain-0.12.0}/src/datachain.egg-info/top_level.txt +0 -0
  242. {datachain-0.11.11 → datachain-0.12.0}/tests/__init__.py +0 -0
  243. {datachain-0.11.11 → datachain-0.12.0}/tests/benchmarks/__init__.py +0 -0
  244. {datachain-0.11.11 → datachain-0.12.0}/tests/benchmarks/conftest.py +0 -0
  245. {datachain-0.11.11 → datachain-0.12.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  246. {datachain-0.11.11 → datachain-0.12.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  247. {datachain-0.11.11 → datachain-0.12.0}/tests/benchmarks/datasets/.gitignore +0 -0
  248. {datachain-0.11.11 → datachain-0.12.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  249. {datachain-0.11.11 → datachain-0.12.0}/tests/benchmarks/test_datachain.py +0 -0
  250. {datachain-0.11.11 → datachain-0.12.0}/tests/benchmarks/test_ls.py +0 -0
  251. {datachain-0.11.11 → datachain-0.12.0}/tests/benchmarks/test_version.py +0 -0
  252. {datachain-0.11.11 → datachain-0.12.0}/tests/data.py +0 -0
  253. {datachain-0.11.11 → datachain-0.12.0}/tests/examples/__init__.py +0 -0
  254. {datachain-0.11.11 → datachain-0.12.0}/tests/examples/test_examples.py +0 -0
  255. {datachain-0.11.11 → datachain-0.12.0}/tests/examples/test_wds_e2e.py +0 -0
  256. {datachain-0.11.11 → datachain-0.12.0}/tests/examples/wds_data.py +0 -0
  257. {datachain-0.11.11 → datachain-0.12.0}/tests/func/__init__.py +0 -0
  258. {datachain-0.11.11/tests/unit/lib → datachain-0.12.0/tests/func}/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  259. {datachain-0.11.11 → datachain-0.12.0}/tests/func/fake-service-account-credentials.json +0 -0
  260. {datachain-0.11.11/tests/unit → datachain-0.12.0/tests/func/model}/__init__.py +0 -0
  261. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_catalog.py +0 -0
  262. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_cloud_transfer.py +0 -0
  263. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_data_storage.py +0 -0
  264. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_datachain_merge.py +0 -0
  265. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_dataset_query.py +0 -0
  266. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_datasets.py +0 -0
  267. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_feature_pickling.py +0 -0
  268. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_file.py +0 -0
  269. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_hf.py +0 -0
  270. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_hidden_field.py +0 -0
  271. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_listing.py +0 -0
  272. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_meta_formats.py +0 -0
  273. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_metrics.py +0 -0
  274. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_pull.py +0 -0
  275. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_pytorch.py +0 -0
  276. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_query.py +0 -0
  277. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_session.py +0 -0
  278. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_toolkit.py +0 -0
  279. {datachain-0.11.11 → datachain-0.12.0}/tests/func/test_warehouse.py +0 -0
  280. {datachain-0.11.11 → datachain-0.12.0}/tests/scripts/feature_class.py +0 -0
  281. {datachain-0.11.11 → datachain-0.12.0}/tests/scripts/feature_class_exception.py +0 -0
  282. {datachain-0.11.11 → datachain-0.12.0}/tests/scripts/feature_class_parallel.py +0 -0
  283. {datachain-0.11.11 → datachain-0.12.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  284. {datachain-0.11.11 → datachain-0.12.0}/tests/scripts/name_len_slow.py +0 -0
  285. {datachain-0.11.11 → datachain-0.12.0}/tests/test_atomicity.py +0 -0
  286. {datachain-0.11.11 → datachain-0.12.0}/tests/test_cli_e2e.py +0 -0
  287. {datachain-0.11.11 → datachain-0.12.0}/tests/test_cli_studio.py +0 -0
  288. {datachain-0.11.11 → datachain-0.12.0}/tests/test_query_e2e.py +0 -0
  289. {datachain-0.11.11 → datachain-0.12.0}/tests/test_telemetry.py +0 -0
  290. {datachain-0.11.11/tests/unit/lib → datachain-0.12.0/tests/unit}/__init__.py +0 -0
  291. {datachain-0.11.11/tests/unit/sql → datachain-0.12.0/tests/unit/lib}/__init__.py +0 -0
  292. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/conftest.py +0 -0
  293. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_arrow.py +0 -0
  294. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_clip.py +0 -0
  295. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  296. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  297. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_diff.py +0 -0
  298. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_feature.py +0 -0
  299. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_feature_utils.py +0 -0
  300. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_hf.py +0 -0
  301. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_listing_info.py +0 -0
  302. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  303. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_schema.py +0 -0
  304. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_signal_schema.py +0 -0
  305. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  306. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_text.py +0 -0
  307. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_udf_signature.py +0 -0
  308. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_utils.py +0 -0
  309. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/lib/test_webdataset.py +0 -0
  310. {datachain-0.11.11/tests/unit/sql/sqlite → datachain-0.12.0/tests/unit/model}/__init__.py +0 -0
  311. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  312. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  313. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/sql/test_array.py +0 -0
  314. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/sql/test_conditional.py +0 -0
  315. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/sql/test_path.py +0 -0
  316. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/sql/test_random.py +0 -0
  317. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/sql/test_selectable.py +0 -0
  318. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/sql/test_string.py +0 -0
  319. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_asyn.py +0 -0
  320. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_cache.py +0 -0
  321. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_catalog.py +0 -0
  322. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_catalog_loader.py +0 -0
  323. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_cli_parsing.py +0 -0
  324. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_client.py +0 -0
  325. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_client_gcs.py +0 -0
  326. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_client_s3.py +0 -0
  327. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_config.py +0 -0
  328. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_data_storage.py +0 -0
  329. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_database_engine.py +0 -0
  330. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_dataset.py +0 -0
  331. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_dispatch.py +0 -0
  332. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_fileslice.py +0 -0
  333. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_func.py +0 -0
  334. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_listing.py +0 -0
  335. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_metastore.py +0 -0
  336. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_module_exports.py +0 -0
  337. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_pytorch.py +0 -0
  338. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_query.py +0 -0
  339. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_query_metrics.py +0 -0
  340. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_query_params.py +0 -0
  341. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_script_meta.py +0 -0
  342. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_serializer.py +0 -0
  343. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_session.py +0 -0
  344. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_utils.py +0 -0
  345. {datachain-0.11.11 → datachain-0.12.0}/tests/unit/test_warehouse.py +0 -0
@@ -80,14 +80,6 @@ jobs:
80
80
 
81
81
  - name: Set up FFmpeg
82
82
  uses: AnimMouse/setup-ffmpeg@v1
83
- id: ffmpeg-install
84
- continue-on-error: ${{ runner.os == 'macOS' }}
85
-
86
- # https://github.com/AnimMouse/setup-ffmpeg/issues/5
87
- - if: steps.ffmpeg-install.outcome == 'failure' && runner.os == 'macOS'
88
- run: brew install ffmpeg
89
- env:
90
- HOMEBREW_NO_AUTO_UPDATE: "1"
91
83
 
92
84
  - name: Set up Python ${{ matrix.pyv }}
93
85
  uses: actions/setup-python@v5
@@ -117,7 +109,7 @@ jobs:
117
109
  shell: bash
118
110
 
119
111
  - name: Run E2E tests
120
- run: nox -s tests-${{ matrix.pyv }} -- -m "e2e" --cov-append $DISABLE_REMOTES_ARG
112
+ run: nox -s e2e-${{ matrix.pyv }}
121
113
  shell: bash
122
114
 
123
115
  - name: Upload coverage report
@@ -141,11 +133,13 @@ jobs:
141
133
  matrix:
142
134
  os: [ubuntu-latest, windows-latest]
143
135
  pyv: ['3.9', '3.13']
144
- group: ['get_started', 'computer_vision', 'llm_and_nlp', 'multimodal']
136
+ group: ['get_started', 'computer_vision', 'multimodal']
145
137
  exclude:
146
138
  - {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
147
139
  - {os: ubuntu-latest, pyv: '3.13', group: 'multimodal'}
148
140
  include:
141
+ # HF runs against actual API - thus run it only once
142
+ - {os: ubuntu-latest, pyv: "3.13", group: llm_and_nlp}
149
143
  - {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
150
144
  - {os: ubuntu-latest-4-cores, pyv: "3.13", group: multimodal}
151
145
 
@@ -169,9 +163,8 @@ jobs:
169
163
  - name: Install nox
170
164
  run: uv pip install nox --system
171
165
 
172
- # HF runs against actual API - thus run it only once
173
166
  - name: Set hf token
174
- if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.13'
167
+ if: matrix.group == 'llm_and_nlp'
175
168
  run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
176
169
 
177
170
  - name: Run examples
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.9.9'
27
+ rev: 'v0.9.10'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.11.11
3
+ Version: 0.12.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -71,6 +71,7 @@ Requires-Dist: usearch; extra == "vector"
71
71
  Provides-Extra: hf
72
72
  Requires-Dist: numba>=0.60.0; extra == "hf"
73
73
  Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
74
+ Requires-Dist: fsspec>=2024.12.0; extra == "hf"
74
75
  Provides-Extra: video
75
76
  Requires-Dist: ffmpeg-python; extra == "video"
76
77
  Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
@@ -90,6 +91,7 @@ Requires-Dist: hypothesis; extra == "tests"
90
91
  Requires-Dist: aiotools>=1.7.0; extra == "tests"
91
92
  Requires-Dist: requests-mock; extra == "tests"
92
93
  Requires-Dist: scipy; extra == "tests"
94
+ Requires-Dist: ultralytics; extra == "tests"
93
95
  Provides-Extra: dev
94
96
  Requires-Dist: datachain[docs,tests]; extra == "dev"
95
97
  Requires-Dist: mypy==1.15.0; extra == "dev"
@@ -103,7 +105,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
103
105
  Requires-Dist: defusedxml; extra == "examples"
104
106
  Requires-Dist: accelerate; extra == "examples"
105
107
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
106
- Requires-Dist: ultralytics==8.3.82; extra == "examples"
108
+ Requires-Dist: ultralytics; extra == "examples"
107
109
  Requires-Dist: open_clip_torch; extra == "examples"
108
110
 
109
111
  ================
@@ -13,7 +13,7 @@ title: Examples
13
13
  For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The below code takes images from the cloud, and applies PaliGemma model to caption the first five of them and put the results in the column “scene”:
14
14
 
15
15
  ```python
16
- from datachain.lib.dc import Column, DataChain, File # (1)!
16
+ from datachain import Column, DataChain, File # (1)!
17
17
  from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)!
18
18
 
19
19
  images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
@@ -80,12 +80,10 @@ In the below example, we are calling a Mixtral 8x22b model to judge the “servi
80
80
  # $ export MISTRAL_API_KEY='your key'
81
81
 
82
82
  import os
83
- from datachain.lib.feature import Feature
84
- from datachain.lib.dc import Column, DataChain
83
+ from datachain import Column, DataChain, DataModel, Feature
85
84
  from mistralai.client import MistralClient
86
85
  from mistralai.models.chat_completion import ChatMessage
87
86
  from mistralai.models.chat_completion import ChatCompletionResponse as MistralModel
88
- from datachain.lib.data_model import DataModel
89
87
 
90
88
  prompt = "Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
91
89
  api_key = os.environ["MISTRAL_API_KEY"]
@@ -189,7 +187,7 @@ DataChain library understands common annotation formats (JSON, CSV, webdataset a
189
187
  Here is an example of reading a simple CSV file where schema is heuristically derived from the header:
190
188
 
191
189
  ```python
192
- from datachain.lib.dc import DataChain
190
+ from datachain import DataChain
193
191
 
194
192
  uri="gs://datachain-demo/chatbot-csv/"
195
193
  csv_dataset = DataChain.from_csv(uri)
@@ -234,7 +232,7 @@ However, Datachain can easily parse the entire COCO structure via several readin
234
232
 
235
233
  ```python
236
234
 
237
- from datachain.lib.dc import Column, DataChain
235
+ from datachain import Column, DataChain
238
236
 
239
237
  images_uri="gs://datachain-demo/coco2017/images/val/"
240
238
  captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json"
@@ -138,7 +138,7 @@ chain = (
138
138
  )
139
139
 
140
140
  successful_chain = chain.filter(Column("is_success") == True)
141
- successful_chain.export_files("./output_mistral")
141
+ successful_chain.to_storage("./output_mistral")
142
142
 
143
143
  print(f"{successful_chain.count()} files were exported")
144
144
  ```
@@ -22,13 +22,9 @@ def openimage_detect(args):
22
22
  detections = json.load(stream_json).get("detections", [])
23
23
 
24
24
  for i, detect in enumerate(detections):
25
- bbox = model.BBox.from_list(
26
- [
27
- detect["XMin"] * img.width,
28
- detect["XMax"] * img.width,
29
- detect["YMin"] * img.height,
30
- detect["YMax"] * img.height,
31
- ]
25
+ bbox = model.BBox.from_albumentations(
26
+ [detect[k] for k in ("XMin", "YMin", "XMax", "YMax")],
27
+ img_size=(img.width, img.height),
32
28
  )
33
29
 
34
30
  fstream = File(
@@ -1,11 +1,3 @@
1
- import os
2
-
3
- os.environ["YOLO_VERBOSE"] = "false"
4
-
5
-
6
- from io import BytesIO
7
-
8
- from PIL import Image
9
1
  from ultralytics import YOLO
10
2
 
11
3
  from datachain import C, DataChain, File
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloBBoxes
13
5
 
14
6
 
15
7
  def process_bboxes(yolo: YOLO, file: File) -> YoloBBoxes:
16
- results = yolo(Image.open(BytesIO(file.read())))
8
+ results = yolo(file.as_image_file().read(), verbose=False)
17
9
  return YoloBBoxes.from_results(results)
18
10
 
19
11
 
@@ -1,11 +1,3 @@
1
- import os
2
-
3
- os.environ["YOLO_VERBOSE"] = "false"
4
-
5
-
6
- from io import BytesIO
7
-
8
- from PIL import Image
9
1
  from ultralytics import YOLO
10
2
 
11
3
  from datachain import C, DataChain, File
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloPoses
13
5
 
14
6
 
15
7
  def process_poses(yolo: YOLO, file: File) -> YoloPoses:
16
- results = yolo(Image.open(BytesIO(file.read())))
8
+ results = yolo(file.as_image_file().read(), verbose=False)
17
9
  return YoloPoses.from_results(results)
18
10
 
19
11
 
@@ -1,11 +1,3 @@
1
- import os
2
-
3
- os.environ["YOLO_VERBOSE"] = "false"
4
-
5
-
6
- from io import BytesIO
7
-
8
- from PIL import Image
9
1
  from ultralytics import YOLO
10
2
 
11
3
  from datachain import C, DataChain, File
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloSegments
13
5
 
14
6
 
15
7
  def process_segments(yolo: YOLO, file: File) -> YoloSegments:
16
- results = yolo(Image.open(BytesIO(file.read())))
8
+ results = yolo(file.as_image_file().read(), verbose=False)
17
9
  return YoloSegments.from_results(results)
18
10
 
19
11
 
@@ -56,6 +56,20 @@ def tests(session: nox.Session) -> None:
56
56
  )
57
57
 
58
58
 
59
+ @nox.session(python=python_versions)
60
+ def e2e(session: nox.Session) -> None:
61
+ session.install(".[tests]")
62
+ session.run(
63
+ "pytest",
64
+ "--durations=0",
65
+ "--numprocesses=logical",
66
+ "--dist=loadgroup",
67
+ "-m",
68
+ "e2e",
69
+ *session.posargs,
70
+ )
71
+
72
+
59
73
  @nox.session
60
74
  def lint(session: nox.Session) -> None:
61
75
  session.install("pre-commit")
@@ -80,7 +80,8 @@ vector = [
80
80
  ]
81
81
  hf = [
82
82
  "numba>=0.60.0",
83
- "datasets[audio,vision]>=2.21.0"
83
+ "datasets[audio,vision]>=2.21.0",
84
+ "fsspec>=2024.12.0"
84
85
  ]
85
86
  video = [
86
87
  "ffmpeg-python",
@@ -101,7 +102,8 @@ tests = [
101
102
  "hypothesis",
102
103
  "aiotools>=1.7.0",
103
104
  "requests-mock",
104
- "scipy"
105
+ "scipy",
106
+ "ultralytics"
105
107
  ]
106
108
  dev = [
107
109
  "datachain[docs,tests]",
@@ -117,7 +119,7 @@ examples = [
117
119
  "defusedxml",
118
120
  "accelerate",
119
121
  "huggingface_hub[hf_transfer]",
120
- "ultralytics==8.3.82",
122
+ "ultralytics",
121
123
  "open_clip_torch"
122
124
  ]
123
125
 
@@ -25,7 +25,6 @@ from typing import (
25
25
  )
26
26
  from uuid import uuid4
27
27
 
28
- import requests
29
28
  import sqlalchemy as sa
30
29
  from sqlalchemy import Column
31
30
  from tqdm.auto import tqdm
@@ -54,7 +53,6 @@ from datachain.error import (
54
53
  from datachain.lib.listing import get_listing
55
54
  from datachain.node import DirType, Node, NodeWithPath
56
55
  from datachain.nodes_thread_pool import NodesThreadPool
57
- from datachain.remote.studio import StudioClient
58
56
  from datachain.sql.types import DateTime, SQLType
59
57
  from datachain.utils import DataChainDir
60
58
 
@@ -162,6 +160,8 @@ class DatasetRowsFetcher(NodesThreadPool):
162
160
  max_threads: int = PULL_DATASET_MAX_THREADS,
163
161
  progress_bar=None,
164
162
  ):
163
+ from datachain.remote.studio import StudioClient
164
+
165
165
  super().__init__(max_threads)
166
166
  self._check_dependencies()
167
167
  self.metastore = metastore
@@ -234,6 +234,8 @@ class DatasetRowsFetcher(NodesThreadPool):
234
234
  return df.drop("sys__id", axis=1)
235
235
 
236
236
  def get_parquet_content(self, url: str):
237
+ import requests
238
+
237
239
  while True:
238
240
  if self.should_check_for_status():
239
241
  self.check_for_status()
@@ -1130,6 +1132,8 @@ class Catalog:
1130
1132
  raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")
1131
1133
 
1132
1134
  def get_remote_dataset(self, name: str) -> DatasetRecord:
1135
+ from datachain.remote.studio import StudioClient
1136
+
1133
1137
  studio_client = StudioClient()
1134
1138
 
1135
1139
  info_response = studio_client.dataset_info(name)
@@ -1164,8 +1168,27 @@ class Catalog:
1164
1168
 
1165
1169
  return direct_dependencies
1166
1170
 
1167
- def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetListRecord]:
1168
- datasets = self.metastore.list_datasets()
1171
+ def ls_datasets(
1172
+ self, include_listing: bool = False, studio: bool = False
1173
+ ) -> Iterator[DatasetListRecord]:
1174
+ from datachain.remote.studio import StudioClient
1175
+
1176
+ if studio:
1177
+ client = StudioClient()
1178
+ response = client.ls_datasets()
1179
+ if not response.ok:
1180
+ raise DataChainError(response.message)
1181
+ if not response.data:
1182
+ return
1183
+
1184
+ datasets: Iterator[DatasetListRecord] = (
1185
+ DatasetListRecord.from_dict(d)
1186
+ for d in response.data
1187
+ if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
1188
+ )
1189
+ else:
1190
+ datasets = self.metastore.list_datasets()
1191
+
1169
1192
  for d in datasets:
1170
1193
  if not d.is_bucket_listing or include_listing:
1171
1194
  yield d
@@ -1173,9 +1196,12 @@ class Catalog:
1173
1196
  def list_datasets_versions(
1174
1197
  self,
1175
1198
  include_listing: bool = False,
1199
+ studio: bool = False,
1176
1200
  ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
1177
1201
  """Iterate over all dataset versions with related jobs."""
1178
- datasets = list(self.ls_datasets(include_listing=include_listing))
1202
+ datasets = list(
1203
+ self.ls_datasets(include_listing=include_listing, studio=studio)
1204
+ )
1179
1205
 
1180
1206
  # preselect dataset versions jobs from db to avoid multiple queries
1181
1207
  jobs_ids: set[str] = {
@@ -1345,6 +1371,8 @@ class Catalog:
1345
1371
  if cp and not output:
1346
1372
  raise ValueError("Please provide output directory for instantiation")
1347
1373
 
1374
+ from datachain.remote.studio import StudioClient
1375
+
1348
1376
  studio_client = StudioClient()
1349
1377
 
1350
1378
  try:
@@ -1,19 +1,13 @@
1
1
  import os
2
2
  from importlib import import_module
3
- from typing import Any, Optional
4
-
5
- from datachain.catalog import Catalog
6
- from datachain.data_storage import (
7
- AbstractMetastore,
8
- AbstractWarehouse,
9
- )
10
- from datachain.data_storage.serializer import deserialize
11
- from datachain.data_storage.sqlite import (
12
- SQLiteMetastore,
13
- SQLiteWarehouse,
14
- )
3
+ from typing import TYPE_CHECKING, Any, Optional
4
+
15
5
  from datachain.utils import get_envs_by_prefix
16
6
 
7
+ if TYPE_CHECKING:
8
+ from datachain.catalog import Catalog
9
+ from datachain.data_storage import AbstractMetastore, AbstractWarehouse
10
+
17
11
  METASTORE_SERIALIZED = "DATACHAIN__METASTORE"
18
12
  METASTORE_IMPORT_PATH = "DATACHAIN_METASTORE"
19
13
  METASTORE_ARG_PREFIX = "DATACHAIN_METASTORE_ARG_"
@@ -27,6 +21,9 @@ IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
27
21
 
28
22
 
29
23
  def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
24
+ from datachain.data_storage import AbstractMetastore
25
+ from datachain.data_storage.serializer import deserialize
26
+
30
27
  metastore_serialized = os.environ.get(METASTORE_SERIALIZED)
31
28
  if metastore_serialized:
32
29
  metastore_obj = deserialize(metastore_serialized)
@@ -45,6 +42,8 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
45
42
  }
46
43
 
47
44
  if not metastore_import_path:
45
+ from datachain.data_storage.sqlite import SQLiteMetastore
46
+
48
47
  metastore_args["in_memory"] = in_memory
49
48
  return SQLiteMetastore(**metastore_args)
50
49
  if in_memory:
@@ -62,6 +61,9 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
62
61
 
63
62
 
64
63
  def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
64
+ from datachain.data_storage import AbstractWarehouse
65
+ from datachain.data_storage.serializer import deserialize
66
+
65
67
  warehouse_serialized = os.environ.get(WAREHOUSE_SERIALIZED)
66
68
  if warehouse_serialized:
67
69
  warehouse_obj = deserialize(warehouse_serialized)
@@ -80,6 +82,8 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
80
82
  }
81
83
 
82
84
  if not warehouse_import_path:
85
+ from datachain.data_storage.sqlite import SQLiteWarehouse
86
+
83
87
  warehouse_args["in_memory"] = in_memory
84
88
  return SQLiteWarehouse(**warehouse_args)
85
89
  if in_memory:
@@ -121,7 +125,7 @@ def get_distributed_class(**kwargs):
121
125
 
122
126
  def get_catalog(
123
127
  client_config: Optional[dict[str, Any]] = None, in_memory: bool = False
124
- ) -> Catalog:
128
+ ) -> "Catalog":
125
129
  """
126
130
  Function that creates Catalog instance with appropriate metastore
127
131
  and warehouse classes. Metastore class can be provided with env variable
@@ -133,6 +137,8 @@ def get_catalog(
133
137
  and name of variable after, e.g. if it accepts team_id as kwargs
134
138
  we can provide DATACHAIN_METASTORE_ARG_TEAM_ID=12345 env variable.
135
139
  """
140
+ from datachain.catalog import Catalog
141
+
136
142
  return Catalog(
137
143
  metastore=get_metastore(in_memory=in_memory),
138
144
  warehouse=get_warehouse(in_memory=in_memory),
@@ -6,7 +6,6 @@ from multiprocessing import freeze_support
6
6
  from typing import Optional
7
7
 
8
8
  from datachain.cli.utils import get_logging_level
9
- from datachain.telemetry import telemetry
10
9
 
11
10
  from .commands import (
12
11
  clear_cache,
@@ -70,6 +69,8 @@ def main(argv: Optional[list[str]] = None) -> int:
70
69
  error, return_code = handle_general_exception(exc, args, logging_level)
71
70
  return return_code
72
71
  finally:
72
+ from datachain.telemetry import telemetry
73
+
73
74
  telemetry.send_cli_call(args.command, error=error)
74
75
 
75
76
 
@@ -63,19 +63,31 @@ def add_auth_parser(subparsers, parent_parser) -> None:
63
63
  default=False,
64
64
  help="Use code-based authentication without browser",
65
65
  )
66
+ login_parser.add_argument(
67
+ "--local",
68
+ action="store_true",
69
+ default=False,
70
+ help="Save the token in the local project config",
71
+ )
66
72
 
67
73
  auth_logout_help = "Log out from Studio"
68
74
  auth_logout_description = (
69
75
  "Remove the Studio authentication token from global config."
70
76
  )
71
77
 
72
- auth_subparser.add_parser(
78
+ logout_parser = auth_subparser.add_parser(
73
79
  "logout",
74
80
  parents=[parent_parser],
75
81
  description=auth_logout_description,
76
82
  help=auth_logout_help,
77
83
  formatter_class=CustomHelpFormatter,
78
84
  )
85
+ logout_parser.add_argument(
86
+ "--local",
87
+ action="store_true",
88
+ default=False,
89
+ help="Remove the token from the local project config",
90
+ )
79
91
 
80
92
  auth_team_help = "Set default team for Studio operations"
81
93
  auth_team_description = "Set the default team for Studio operations."
@@ -17,10 +17,10 @@ from typing import (
17
17
  ClassVar,
18
18
  NamedTuple,
19
19
  Optional,
20
+ Union,
20
21
  )
21
22
  from urllib.parse import urlparse
22
23
 
23
- from botocore.exceptions import ClientError
24
24
  from dvc_objects.fs.system import reflink
25
25
  from fsspec.asyn import get_loop, sync
26
26
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -28,7 +28,6 @@ from tqdm.auto import tqdm
28
28
 
29
29
  from datachain.cache import Cache
30
30
  from datachain.client.fileslice import FileWrapper
31
- from datachain.error import ClientError as DataChainClientError
32
31
  from datachain.nodes_fetcher import NodesFetcher
33
32
  from datachain.nodes_thread_pool import NodeChunk
34
33
 
@@ -83,19 +82,17 @@ class Client(ABC):
83
82
  self.uri = self.get_uri(self.name)
84
83
 
85
84
  @staticmethod
86
- def get_implementation(url: str) -> type["Client"]:
85
+ def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:
87
86
  from .azure import AzureClient
88
87
  from .gcs import GCSClient
89
88
  from .hf import HfClient
90
89
  from .local import FileClient
91
90
  from .s3 import ClientS3
92
91
 
93
- protocol = urlparse(url).scheme
92
+ protocol = urlparse(str(url)).scheme
94
93
 
95
- if not protocol or _is_win_local_path(url):
94
+ if not protocol or _is_win_local_path(str(url)):
96
95
  return FileClient
97
-
98
- protocol = protocol.lower()
99
96
  if protocol == ClientS3.protocol:
100
97
  return ClientS3
101
98
  if protocol == GCSClient.protocol:
@@ -121,9 +118,11 @@ class Client(ABC):
121
118
  return cls.get_uri(storage_name), rel_path
122
119
 
123
120
  @staticmethod
124
- def get_client(source: str, cache: Cache, **kwargs) -> "Client":
121
+ def get_client(
122
+ source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
123
+ ) -> "Client":
125
124
  cls = Client.get_implementation(source)
126
- storage_url, _ = cls.split_url(source)
125
+ storage_url, _ = cls.split_url(str(source))
127
126
  if os.name == "nt":
128
127
  storage_url = storage_url.removeprefix("/")
129
128
 
@@ -209,7 +208,7 @@ class Client(ABC):
209
208
 
210
209
  async def get_current_etag(self, file: "File") -> str:
211
210
  kwargs = {}
212
- if self.fs.version_aware:
211
+ if getattr(self.fs, "version_aware", False):
213
212
  kwargs["version_id"] = file.version
214
213
  info = await self.fs._info(
215
214
  self.get_full_path(file.path, file.version), **kwargs
@@ -286,11 +285,6 @@ class Client(ABC):
286
285
  worker.cancel()
287
286
  if excs:
288
287
  raise excs[0]
289
- except ClientError as exc:
290
- raise DataChainClientError(
291
- exc.response.get("Error", {}).get("Message") or exc,
292
- exc.response.get("Error", {}).get("Code"),
293
- ) from exc
294
288
  finally:
295
289
  # This ensures the progress bar is closed before any exceptions are raised
296
290
  progress_bar.close()
@@ -333,7 +327,9 @@ class Client(ABC):
333
327
  return not (key.startswith("/") or key.endswith("/") or "//" in key)
334
328
 
335
329
  async def ls_dir(self, path):
336
- return await self.fs._ls(path, detail=True, versions=True)
330
+ if getattr(self.fs, "version_aware", False):
331
+ kwargs = {"versions": True}
332
+ return await self.fs._ls(path, detail=True, **kwargs)
337
333
 
338
334
  def rel_path(self, path: str) -> str:
339
335
  return self.fs.split_path(path)[1]
@@ -0,0 +1,60 @@
1
+ import functools
2
+ import posixpath
3
+ from typing import Any
4
+
5
+ from datachain.lib.file import File
6
+
7
+ from .fsspec import Client
8
+
9
+
10
+ class classproperty: # noqa: N801
11
+ def __init__(self, func):
12
+ self.fget = func
13
+
14
+ def __get__(self, instance, owner):
15
+ return self.fget(owner)
16
+
17
+
18
+ @functools.cache
19
+ def get_hf_filesystem_cls():
20
+ import fsspec
21
+ from packaging.version import Version, parse
22
+
23
+ fsspec_version = parse(fsspec.__version__)
24
+ minver = Version("2024.12.0")
25
+
26
+ if fsspec_version < minver:
27
+ raise ImportError(
28
+ f"datachain requires 'fsspec>={minver}' but version "
29
+ f"{fsspec_version} is installed."
30
+ )
31
+
32
+ from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
33
+ from huggingface_hub import HfFileSystem
34
+
35
+ fs_cls = AsyncFileSystemWrapper.wrap_class(HfFileSystem)
36
+ # AsyncFileSystemWrapper does not set class properties, so we need to set them back.
37
+ fs_cls.protocol = HfFileSystem.protocol
38
+ return fs_cls
39
+
40
+
41
+ class HfClient(Client):
42
+ PREFIX = "hf://"
43
+ protocol = "hf"
44
+
45
+ @classproperty
46
+ def FS_CLASS(cls): # noqa: N802, N805
47
+ return get_hf_filesystem_cls()
48
+
49
+ def info_to_file(self, v: dict[str, Any], path: str) -> File:
50
+ return File(
51
+ source=self.uri,
52
+ path=path,
53
+ size=v["size"],
54
+ version=v["last_commit"].oid,
55
+ etag=v.get("blob_id", ""),
56
+ last_modified=v["last_commit"].date,
57
+ )
58
+
59
+ def rel_path(self, path):
60
+ return posixpath.relpath(path, self.name)
@@ -67,10 +67,7 @@ class FileClient(Client):
67
67
  @classmethod
68
68
  def split_url(cls, url: str) -> tuple[str, str]:
69
69
  parsed = urlparse(url)
70
- if parsed.scheme == "file":
71
- scheme, rest = url.split(":", 1)
72
- url = f"{scheme.lower()}:{rest}"
73
- else:
70
+ if parsed.scheme != "file":
74
71
  url = cls.path_to_uri(url)
75
72
 
76
73
  fill_path = url[len(cls.PREFIX) :]