datachain 0.11.0__tar.gz → 0.12.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (345) hide show
  1. {datachain-0.11.0 → datachain-0.12.0}/.github/workflows/benchmarks.yml +2 -2
  2. {datachain-0.11.0 → datachain-0.12.0}/.github/workflows/release.yml +2 -2
  3. {datachain-0.11.0 → datachain-0.12.0}/.github/workflows/tests.yml +11 -10
  4. {datachain-0.11.0 → datachain-0.12.0}/.pre-commit-config.yaml +1 -1
  5. {datachain-0.11.0 → datachain-0.12.0}/PKG-INFO +6 -4
  6. {datachain-0.11.0 → datachain-0.12.0}/docs/examples.md +4 -6
  7. {datachain-0.11.0 → datachain-0.12.0}/docs/quick-start.md +1 -1
  8. datachain-0.12.0/docs/references/remotes.md +346 -0
  9. {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/openimage-detect.py +3 -7
  10. {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/ultralytics-bbox.py +1 -9
  11. {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/ultralytics-pose.py +1 -9
  12. {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/ultralytics-segment.py +1 -9
  13. {datachain-0.11.0 → datachain-0.12.0}/mkdocs.yml +1 -0
  14. {datachain-0.11.0 → datachain-0.12.0}/noxfile.py +28 -19
  15. {datachain-0.11.0 → datachain-0.12.0}/pyproject.toml +7 -7
  16. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/catalog/catalog.py +33 -5
  17. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/catalog/loader.py +19 -13
  18. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/__init__.py +3 -1
  19. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/show.py +12 -1
  20. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/parser/studio.py +13 -1
  21. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/parser/utils.py +6 -0
  22. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/fsspec.py +12 -16
  23. datachain-0.12.0/src/datachain/client/hf.py +60 -0
  24. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/local.py +1 -4
  25. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/warehouse.py +3 -8
  26. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/dataset.py +8 -0
  27. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/error.py +0 -12
  28. datachain-0.12.0/src/datachain/fs/utils.py +30 -0
  29. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/__init__.py +5 -0
  30. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/func.py +2 -1
  31. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/data_model.py +6 -0
  32. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/dc.py +114 -28
  33. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/file.py +100 -25
  34. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/image.py +30 -6
  35. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/listing.py +21 -39
  36. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/signal_schema.py +194 -15
  37. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/video.py +7 -5
  38. datachain-0.12.0/src/datachain/model/bbox.py +253 -0
  39. datachain-0.12.0/src/datachain/model/pose.py +100 -0
  40. datachain-0.12.0/src/datachain/model/segment.py +51 -0
  41. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/model/ultralytics/bbox.py +9 -9
  42. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/model/ultralytics/pose.py +7 -7
  43. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/model/ultralytics/segment.py +7 -7
  44. datachain-0.12.0/src/datachain/model/utils.py +191 -0
  45. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/nodes_thread_pool.py +32 -11
  46. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/dataset.py +4 -2
  47. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/studio.py +8 -6
  48. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/utils.py +3 -16
  49. {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/PKG-INFO +6 -4
  50. {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/SOURCES.txt +20 -3
  51. {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/requires.txt +4 -3
  52. {datachain-0.11.0 → datachain-0.12.0}/tests/conftest.py +49 -3
  53. datachain-0.12.0/tests/func/data/lena.jpg +0 -0
  54. datachain-0.12.0/tests/func/model/data/running-mask0.png +0 -0
  55. datachain-0.12.0/tests/func/model/data/running-mask1.png +0 -0
  56. datachain-0.12.0/tests/func/model/data/running.jpg +0 -0
  57. datachain-0.12.0/tests/func/model/data/ships.jpg +0 -0
  58. datachain-0.12.0/tests/func/model/test_yolo.py +2427 -0
  59. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_client.py +0 -19
  60. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_datachain.py +309 -18
  61. datachain-0.12.0/tests/func/test_hidden_field.py +70 -0
  62. datachain-0.12.0/tests/func/test_image.py +68 -0
  63. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_ls.py +0 -9
  64. {datachain-0.11.0/tests/unit/lib → datachain-0.12.0/tests/func}/test_video.py +35 -21
  65. datachain-0.12.0/tests/test_import_time.py +84 -0
  66. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_datachain.py +100 -0
  67. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_file.py +14 -0
  68. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_image.py +1 -4
  69. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_signal_schema.py +140 -0
  70. datachain-0.12.0/tests/unit/model/test_bbox.py +259 -0
  71. datachain-0.11.0/tests/unit/lib/test_models.py → datachain-0.12.0/tests/unit/model/test_pose.py +72 -51
  72. datachain-0.12.0/tests/unit/model/test_segment.py +53 -0
  73. datachain-0.12.0/tests/unit/model/test_utils.py +92 -0
  74. datachain-0.12.0/tests/unit/sql/__init__.py +0 -0
  75. datachain-0.12.0/tests/unit/sql/sqlite/__init__.py +0 -0
  76. {datachain-0.11.0 → datachain-0.12.0}/tests/utils.py +0 -8
  77. datachain-0.11.0/src/datachain/client/hf.py +0 -38
  78. datachain-0.11.0/src/datachain/model/bbox.py +0 -102
  79. datachain-0.11.0/src/datachain/model/pose.py +0 -88
  80. datachain-0.11.0/src/datachain/model/segment.py +0 -47
  81. {datachain-0.11.0 → datachain-0.12.0}/.cruft.json +0 -0
  82. {datachain-0.11.0 → datachain-0.12.0}/.gitattributes +0 -0
  83. {datachain-0.11.0 → datachain-0.12.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  84. {datachain-0.11.0 → datachain-0.12.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  85. {datachain-0.11.0 → datachain-0.12.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  86. {datachain-0.11.0 → datachain-0.12.0}/.github/codecov.yaml +0 -0
  87. {datachain-0.11.0 → datachain-0.12.0}/.github/dependabot.yml +0 -0
  88. {datachain-0.11.0 → datachain-0.12.0}/.github/workflows/tests-studio.yml +0 -0
  89. {datachain-0.11.0 → datachain-0.12.0}/.github/workflows/update-template.yaml +0 -0
  90. {datachain-0.11.0 → datachain-0.12.0}/.gitignore +0 -0
  91. {datachain-0.11.0 → datachain-0.12.0}/CODE_OF_CONDUCT.rst +0 -0
  92. {datachain-0.11.0 → datachain-0.12.0}/LICENSE +0 -0
  93. {datachain-0.11.0 → datachain-0.12.0}/README.rst +0 -0
  94. {datachain-0.11.0 → datachain-0.12.0}/docs/assets/captioned_cartoons.png +0 -0
  95. {datachain-0.11.0 → datachain-0.12.0}/docs/assets/datachain-white.svg +0 -0
  96. {datachain-0.11.0 → datachain-0.12.0}/docs/assets/datachain.svg +0 -0
  97. {datachain-0.11.0 → datachain-0.12.0}/docs/contributing.md +0 -0
  98. {datachain-0.11.0 → datachain-0.12.0}/docs/css/github-permalink-style.css +0 -0
  99. {datachain-0.11.0 → datachain-0.12.0}/docs/index.md +0 -0
  100. {datachain-0.11.0 → datachain-0.12.0}/docs/overrides/main.html +0 -0
  101. {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/arrowrow.md +0 -0
  102. {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/bbox.md +0 -0
  103. {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/file.md +0 -0
  104. {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/imagefile.md +0 -0
  105. {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/index.md +0 -0
  106. {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/pose.md +0 -0
  107. {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/segment.md +0 -0
  108. {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/tarvfile.md +0 -0
  109. {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/textfile.md +0 -0
  110. {datachain-0.11.0 → datachain-0.12.0}/docs/references/data-types/videofile.md +0 -0
  111. {datachain-0.11.0 → datachain-0.12.0}/docs/references/datachain.md +0 -0
  112. {datachain-0.11.0 → datachain-0.12.0}/docs/references/func.md +0 -0
  113. {datachain-0.11.0 → datachain-0.12.0}/docs/references/index.md +0 -0
  114. {datachain-0.11.0 → datachain-0.12.0}/docs/references/toolkit.md +0 -0
  115. {datachain-0.11.0 → datachain-0.12.0}/docs/references/torch.md +0 -0
  116. {datachain-0.11.0 → datachain-0.12.0}/docs/references/udf.md +0 -0
  117. {datachain-0.11.0 → datachain-0.12.0}/docs/tutorials.md +0 -0
  118. {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  119. {datachain-0.11.0 → datachain-0.12.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  120. {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/common_sql_functions.py +0 -0
  121. {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/json-csv-reader.py +0 -0
  122. {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/torch-loader.py +0 -0
  123. {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/udfs/parallel.py +0 -0
  124. {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/udfs/simple.py +0 -0
  125. {datachain-0.11.0 → datachain-0.12.0}/examples/get_started/udfs/stateful.py +0 -0
  126. {datachain-0.11.0 → datachain-0.12.0}/examples/llm_and_nlp/claude-query.py +0 -0
  127. {datachain-0.11.0 → datachain-0.12.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  128. {datachain-0.11.0 → datachain-0.12.0}/examples/multimodal/clip_inference.py +0 -0
  129. {datachain-0.11.0 → datachain-0.12.0}/examples/multimodal/hf_pipeline.py +0 -0
  130. {datachain-0.11.0 → datachain-0.12.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  131. {datachain-0.11.0 → datachain-0.12.0}/examples/multimodal/wds.py +0 -0
  132. {datachain-0.11.0 → datachain-0.12.0}/examples/multimodal/wds_filtered.py +0 -0
  133. {datachain-0.11.0 → datachain-0.12.0}/setup.cfg +0 -0
  134. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/__init__.py +0 -0
  135. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/__main__.py +0 -0
  136. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/asyn.py +0 -0
  137. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cache.py +0 -0
  138. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/catalog/__init__.py +0 -0
  139. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/catalog/datasource.py +0 -0
  140. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/__init__.py +0 -0
  141. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/datasets.py +0 -0
  142. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/du.py +0 -0
  143. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/index.py +0 -0
  144. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/ls.py +0 -0
  145. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/misc.py +0 -0
  146. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/commands/query.py +0 -0
  147. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/parser/__init__.py +0 -0
  148. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/parser/job.py +0 -0
  149. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/cli/utils.py +0 -0
  150. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/__init__.py +0 -0
  151. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/azure.py +0 -0
  152. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/fileslice.py +0 -0
  153. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/gcs.py +0 -0
  154. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/client/s3.py +0 -0
  155. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/config.py +0 -0
  156. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/__init__.py +0 -0
  157. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/db_engine.py +0 -0
  158. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/job.py +0 -0
  159. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/metastore.py +0 -0
  160. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/schema.py +0 -0
  161. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/serializer.py +0 -0
  162. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/data_storage/sqlite.py +0 -0
  163. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/diff/__init__.py +0 -0
  164. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/fs/__init__.py +0 -0
  165. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/fs/reference.py +0 -0
  166. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/aggregate.py +0 -0
  167. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/array.py +0 -0
  168. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/base.py +0 -0
  169. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/conditional.py +0 -0
  170. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/numeric.py +0 -0
  171. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/path.py +0 -0
  172. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/random.py +0 -0
  173. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/string.py +0 -0
  174. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/func/window.py +0 -0
  175. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/job.py +0 -0
  176. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/__init__.py +0 -0
  177. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/arrow.py +0 -0
  178. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/clip.py +0 -0
  179. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/__init__.py +0 -0
  180. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/flatten.py +0 -0
  181. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  182. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  183. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/unflatten.py +0 -0
  184. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  185. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/dataset_info.py +0 -0
  186. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/hf.py +0 -0
  187. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/listing_info.py +0 -0
  188. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/meta_formats.py +0 -0
  189. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/model_store.py +0 -0
  190. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/pytorch.py +0 -0
  191. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/settings.py +0 -0
  192. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/tar.py +0 -0
  193. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/text.py +0 -0
  194. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/udf.py +0 -0
  195. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/udf_signature.py +0 -0
  196. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/utils.py +0 -0
  197. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/webdataset.py +0 -0
  198. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/lib/webdataset_laion.py +0 -0
  199. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/listing.py +0 -0
  200. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/model/__init__.py +0 -0
  201. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  202. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/node.py +0 -0
  203. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/nodes_fetcher.py +0 -0
  204. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/progress.py +0 -0
  205. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/py.typed +0 -0
  206. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/__init__.py +0 -0
  207. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/batch.py +0 -0
  208. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/dispatch.py +0 -0
  209. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/metrics.py +0 -0
  210. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/params.py +0 -0
  211. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/queue.py +0 -0
  212. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/schema.py +0 -0
  213. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/session.py +0 -0
  214. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/udf.py +0 -0
  215. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/query/utils.py +0 -0
  216. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/remote/__init__.py +0 -0
  217. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/remote/studio.py +0 -0
  218. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/script_meta.py +0 -0
  219. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/__init__.py +0 -0
  220. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/default/__init__.py +0 -0
  221. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/default/base.py +0 -0
  222. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/__init__.py +0 -0
  223. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/aggregate.py +0 -0
  224. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/array.py +0 -0
  225. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/conditional.py +0 -0
  226. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/numeric.py +0 -0
  227. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/path.py +0 -0
  228. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/random.py +0 -0
  229. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/functions/string.py +0 -0
  230. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/selectable.py +0 -0
  231. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  232. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/sqlite/base.py +0 -0
  233. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/sqlite/types.py +0 -0
  234. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/sqlite/vector.py +0 -0
  235. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/types.py +0 -0
  236. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/sql/utils.py +0 -0
  237. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/telemetry.py +0 -0
  238. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/toolkit/__init__.py +0 -0
  239. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/toolkit/split.py +0 -0
  240. {datachain-0.11.0 → datachain-0.12.0}/src/datachain/torch/__init__.py +0 -0
  241. {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  242. {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/entry_points.txt +0 -0
  243. {datachain-0.11.0 → datachain-0.12.0}/src/datachain.egg-info/top_level.txt +0 -0
  244. {datachain-0.11.0 → datachain-0.12.0}/tests/__init__.py +0 -0
  245. {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/__init__.py +0 -0
  246. {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/conftest.py +0 -0
  247. {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  248. {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  249. {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/datasets/.gitignore +0 -0
  250. {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  251. {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/test_datachain.py +0 -0
  252. {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/test_ls.py +0 -0
  253. {datachain-0.11.0 → datachain-0.12.0}/tests/benchmarks/test_version.py +0 -0
  254. {datachain-0.11.0 → datachain-0.12.0}/tests/data.py +0 -0
  255. {datachain-0.11.0 → datachain-0.12.0}/tests/examples/__init__.py +0 -0
  256. {datachain-0.11.0 → datachain-0.12.0}/tests/examples/test_examples.py +0 -0
  257. {datachain-0.11.0 → datachain-0.12.0}/tests/examples/test_wds_e2e.py +0 -0
  258. {datachain-0.11.0 → datachain-0.12.0}/tests/examples/wds_data.py +0 -0
  259. {datachain-0.11.0 → datachain-0.12.0}/tests/func/__init__.py +0 -0
  260. {datachain-0.11.0/tests/unit/lib → datachain-0.12.0/tests/func}/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  261. {datachain-0.11.0 → datachain-0.12.0}/tests/func/fake-service-account-credentials.json +0 -0
  262. {datachain-0.11.0/tests/unit → datachain-0.12.0/tests/func/model}/__init__.py +0 -0
  263. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_catalog.py +0 -0
  264. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_cloud_transfer.py +0 -0
  265. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_data_storage.py +0 -0
  266. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_datachain_merge.py +0 -0
  267. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_dataset_query.py +0 -0
  268. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_datasets.py +0 -0
  269. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_feature_pickling.py +0 -0
  270. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_file.py +0 -0
  271. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_hf.py +0 -0
  272. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_listing.py +0 -0
  273. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_meta_formats.py +0 -0
  274. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_metrics.py +0 -0
  275. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_pull.py +0 -0
  276. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_pytorch.py +0 -0
  277. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_query.py +0 -0
  278. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_session.py +0 -0
  279. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_toolkit.py +0 -0
  280. {datachain-0.11.0 → datachain-0.12.0}/tests/func/test_warehouse.py +0 -0
  281. {datachain-0.11.0 → datachain-0.12.0}/tests/scripts/feature_class.py +0 -0
  282. {datachain-0.11.0 → datachain-0.12.0}/tests/scripts/feature_class_exception.py +0 -0
  283. {datachain-0.11.0 → datachain-0.12.0}/tests/scripts/feature_class_parallel.py +0 -0
  284. {datachain-0.11.0 → datachain-0.12.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  285. {datachain-0.11.0 → datachain-0.12.0}/tests/scripts/name_len_slow.py +0 -0
  286. {datachain-0.11.0 → datachain-0.12.0}/tests/test_atomicity.py +0 -0
  287. {datachain-0.11.0 → datachain-0.12.0}/tests/test_cli_e2e.py +0 -0
  288. {datachain-0.11.0 → datachain-0.12.0}/tests/test_cli_studio.py +0 -0
  289. {datachain-0.11.0 → datachain-0.12.0}/tests/test_query_e2e.py +0 -0
  290. {datachain-0.11.0 → datachain-0.12.0}/tests/test_telemetry.py +0 -0
  291. {datachain-0.11.0/tests/unit/lib → datachain-0.12.0/tests/unit}/__init__.py +0 -0
  292. {datachain-0.11.0/tests/unit/sql → datachain-0.12.0/tests/unit/lib}/__init__.py +0 -0
  293. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/conftest.py +0 -0
  294. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_arrow.py +0 -0
  295. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_clip.py +0 -0
  296. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  297. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  298. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_diff.py +0 -0
  299. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_feature.py +0 -0
  300. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_feature_utils.py +0 -0
  301. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_hf.py +0 -0
  302. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_listing_info.py +0 -0
  303. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  304. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_schema.py +0 -0
  305. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  306. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_text.py +0 -0
  307. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_udf_signature.py +0 -0
  308. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_utils.py +0 -0
  309. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/lib/test_webdataset.py +0 -0
  310. {datachain-0.11.0/tests/unit/sql/sqlite → datachain-0.12.0/tests/unit/model}/__init__.py +0 -0
  311. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  312. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  313. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_array.py +0 -0
  314. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_conditional.py +0 -0
  315. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_path.py +0 -0
  316. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_random.py +0 -0
  317. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_selectable.py +0 -0
  318. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/sql/test_string.py +0 -0
  319. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_asyn.py +0 -0
  320. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_cache.py +0 -0
  321. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_catalog.py +0 -0
  322. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_catalog_loader.py +0 -0
  323. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_cli_parsing.py +0 -0
  324. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_client.py +0 -0
  325. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_client_gcs.py +0 -0
  326. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_client_s3.py +0 -0
  327. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_config.py +0 -0
  328. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_data_storage.py +0 -0
  329. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_database_engine.py +0 -0
  330. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_dataset.py +0 -0
  331. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_dispatch.py +0 -0
  332. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_fileslice.py +0 -0
  333. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_func.py +0 -0
  334. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_listing.py +0 -0
  335. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_metastore.py +0 -0
  336. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_module_exports.py +0 -0
  337. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_pytorch.py +0 -0
  338. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_query.py +0 -0
  339. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_query_metrics.py +0 -0
  340. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_query_params.py +0 -0
  341. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_script_meta.py +0 -0
  342. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_serializer.py +0 -0
  343. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_session.py +0 -0
  344. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_utils.py +0 -0
  345. {datachain-0.11.0 → datachain-0.12.0}/tests/unit/test_warehouse.py +0 -0
@@ -19,10 +19,10 @@ jobs:
19
19
  runs-on: ubuntu-latest
20
20
  steps:
21
21
  - uses: actions/checkout@v4
22
- - name: Set up Python 3.12
22
+ - name: Set up Python 3.13
23
23
  uses: actions/setup-python@v5
24
24
  with:
25
- python-version: '3.12'
25
+ python-version: '3.13'
26
26
 
27
27
  - name: Setup uv
28
28
  uses: astral-sh/setup-uv@v5
@@ -21,10 +21,10 @@ jobs:
21
21
  with:
22
22
  fetch-depth: 0
23
23
 
24
- - name: Set up Python 3.12
24
+ - name: Set up Python 3.13
25
25
  uses: actions/setup-python@v5
26
26
  with:
27
- python-version: '3.12'
27
+ python-version: '3.13'
28
28
 
29
29
  - name: Setup uv
30
30
  uses: astral-sh/setup-uv@v5
@@ -60,16 +60,16 @@ jobs:
60
60
  fail-fast: false
61
61
  matrix:
62
62
  os: [ubuntu-latest-8-cores]
63
- pyv: ['3.9', '3.10', '3.11', '3.12']
63
+ pyv: ['3.9', '3.10', '3.11', '3.12', '3.13']
64
64
  include:
65
65
  - os: macos-latest
66
66
  pyv: '3.9'
67
67
  - os: macos-latest
68
- pyv: '3.12'
68
+ pyv: '3.13'
69
69
  - os: windows-latest
70
70
  pyv: '3.9'
71
71
  - os: windows-latest
72
- pyv: '3.12'
72
+ pyv: '3.13'
73
73
 
74
74
  steps:
75
75
  - name: Check out the repository
@@ -109,7 +109,7 @@ jobs:
109
109
  shell: bash
110
110
 
111
111
  - name: Run E2E tests
112
- run: nox -s tests-${{ matrix.pyv }} -- -m "e2e" --cov-append $DISABLE_REMOTES_ARG
112
+ run: nox -s e2e-${{ matrix.pyv }}
113
113
  shell: bash
114
114
 
115
115
  - name: Upload coverage report
@@ -132,14 +132,16 @@ jobs:
132
132
  fail-fast: false
133
133
  matrix:
134
134
  os: [ubuntu-latest, windows-latest]
135
- pyv: ['3.9', '3.12']
136
- group: ['get_started', 'computer_vision', 'llm_and_nlp', 'multimodal']
135
+ pyv: ['3.9', '3.13']
136
+ group: ['get_started', 'computer_vision', 'multimodal']
137
137
  exclude:
138
138
  - {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
139
- - {os: ubuntu-latest, pyv: '3.12', group: 'multimodal'}
139
+ - {os: ubuntu-latest, pyv: '3.13', group: 'multimodal'}
140
140
  include:
141
+ # HF runs against actual API - thus run it only once
142
+ - {os: ubuntu-latest, pyv: "3.13", group: llm_and_nlp}
141
143
  - {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
142
- - {os: ubuntu-latest-4-cores, pyv: "3.12", group: multimodal}
144
+ - {os: ubuntu-latest-4-cores, pyv: "3.13", group: multimodal}
143
145
 
144
146
  steps:
145
147
  - uses: actions/checkout@v4
@@ -161,9 +163,8 @@ jobs:
161
163
  - name: Install nox
162
164
  run: uv pip install nox --system
163
165
 
164
- # HF runs against actual API - thus run it only once
165
166
  - name: Set hf token
166
- if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.12'
167
+ if: matrix.group == 'llm_and_nlp'
167
168
  run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
168
169
 
169
170
  - name: Run examples
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.9.7'
27
+ rev: 'v0.9.10'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.11.0
3
+ Version: 0.12.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.9
12
12
  Classifier: Programming Language :: Python :: 3.10
13
13
  Classifier: Programming Language :: Python :: 3.11
14
14
  Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
15
16
  Classifier: Development Status :: 2 - Pre-Alpha
16
17
  Requires-Python: >=3.9
17
18
  Description-Content-Type: text/x-rst
@@ -70,10 +71,10 @@ Requires-Dist: usearch; extra == "vector"
70
71
  Provides-Extra: hf
71
72
  Requires-Dist: numba>=0.60.0; extra == "hf"
72
73
  Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
74
+ Requires-Dist: fsspec>=2024.12.0; extra == "hf"
73
75
  Provides-Extra: video
74
- Requires-Dist: av<14; extra == "video"
75
76
  Requires-Dist: ffmpeg-python; extra == "video"
76
- Requires-Dist: imageio[ffmpeg]; extra == "video"
77
+ Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
77
78
  Requires-Dist: opencv-python; extra == "video"
78
79
  Provides-Extra: tests
79
80
  Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
@@ -90,6 +91,7 @@ Requires-Dist: hypothesis; extra == "tests"
90
91
  Requires-Dist: aiotools>=1.7.0; extra == "tests"
91
92
  Requires-Dist: requests-mock; extra == "tests"
92
93
  Requires-Dist: scipy; extra == "tests"
94
+ Requires-Dist: ultralytics; extra == "tests"
93
95
  Provides-Extra: dev
94
96
  Requires-Dist: datachain[docs,tests]; extra == "dev"
95
97
  Requires-Dist: mypy==1.15.0; extra == "dev"
@@ -103,7 +105,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
103
105
  Requires-Dist: defusedxml; extra == "examples"
104
106
  Requires-Dist: accelerate; extra == "examples"
105
107
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
106
- Requires-Dist: ultralytics==8.3.78; extra == "examples"
108
+ Requires-Dist: ultralytics; extra == "examples"
107
109
  Requires-Dist: open_clip_torch; extra == "examples"
108
110
 
109
111
  ================
@@ -13,7 +13,7 @@ title: Examples
13
13
  For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The below code takes images from the cloud, and applies PaliGemma model to caption the first five of them and put the results in the column “scene”:
14
14
 
15
15
  ```python
16
- from datachain.lib.dc import Column, DataChain, File # (1)!
16
+ from datachain import Column, DataChain, File # (1)!
17
17
  from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)!
18
18
 
19
19
  images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
@@ -80,12 +80,10 @@ In the below example, we are calling a Mixtral 8x22b model to judge the “servi
80
80
  # $ export MISTRAL_API_KEY='your key'
81
81
 
82
82
  import os
83
- from datachain.lib.feature import Feature
84
- from datachain.lib.dc import Column, DataChain
83
+ from datachain import Column, DataChain, DataModel, Feature
85
84
  from mistralai.client import MistralClient
86
85
  from mistralai.models.chat_completion import ChatMessage
87
86
  from mistralai.models.chat_completion import ChatCompletionResponse as MistralModel
88
- from datachain.lib.data_model import DataModel
89
87
 
90
88
  prompt = "Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
91
89
  api_key = os.environ["MISTRAL_API_KEY"]
@@ -189,7 +187,7 @@ DataChain library understands common annotation formats (JSON, CSV, webdataset a
189
187
  Here is an example of reading a simple CSV file where schema is heuristically derived from the header:
190
188
 
191
189
  ```python
192
- from datachain.lib.dc import DataChain
190
+ from datachain import DataChain
193
191
 
194
192
  uri="gs://datachain-demo/chatbot-csv/"
195
193
  csv_dataset = DataChain.from_csv(uri)
@@ -234,7 +232,7 @@ However, Datachain can easily parse the entire COCO structure via several readin
234
232
 
235
233
  ```python
236
234
 
237
- from datachain.lib.dc import Column, DataChain
235
+ from datachain import Column, DataChain
238
236
 
239
237
  images_uri="gs://datachain-demo/coco2017/images/val/"
240
238
  captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json"
@@ -138,7 +138,7 @@ chain = (
138
138
  )
139
139
 
140
140
  successful_chain = chain.filter(Column("is_success") == True)
141
- successful_chain.export_files("./output_mistral")
141
+ successful_chain.to_storage("./output_mistral")
142
142
 
143
143
  print(f"{successful_chain.count()} files were exported")
144
144
  ```
@@ -0,0 +1,346 @@
1
+ # Interacting with remote storage
2
+
3
+ DataChain supports reading and writing data from different remote storages using methods like `DataChain.from_storage` and `DataChain.to_storage`. The supported storages include: local file system, AWS S3 storage, Google Cloud Storage, Azure Blob Storage, Hugging Face and more.
4
+
5
+ Example implementation for reading and writing data from/to different remote storages:
6
+
7
+ ```python
8
+ from datachain import DataChain
9
+
10
+ dc = DataChain.from_storage("s3://bucket-name/path/to/data")
11
+ dc.to_storage("gs://bucket-name/path/to/data")
12
+ ```
13
+
14
+ DataChain uses [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) to interact with different remote storages. You can pass the following fsspec-supported URIs to `from_storage` and `to_storage` methods.
15
+
16
+ - Local file system: `file://path/to/data`
17
+ - AWS S3 storage: `s3://bucket-name/path/to/data`
18
+ - Google Cloud Storage: `gs://bucket-name/path/to/data`
19
+ - Azure Blob Storage: `az://container-name/path/to/data`
20
+ - Hugging Face: `hf://dataset-name`
21
+
22
+ ## Extra configuration
23
+ For the configuration parameters to the filesystem, you can pass the key and value pair as client_config dictionary that will be passed to the respective filesystem.
24
+
25
+
26
+ ### AWS S3 compatible storage
27
+
28
+ DataChain uses [s3fs](https://s3fs.readthedocs.io/en/latest/) to interact with AWS S3 storage. Authentication can be configured using standard AWS credential locations, such as `~/.aws/credentials` and `~/.aws/config`. You can also pass the following configuration parameters to the s3fs filesystem as `client_config` dictionary.
29
+
30
+ - `anon`: `bool` (default: `False`)
31
+
32
+ Whether to use anonymous connection (public buckets only). If `False`,
33
+ uses the key/secret given, or boto's credential resolver (client_kwargs,
34
+ environment, variables, config files, EC2 IAM server, in that order)
35
+
36
+ - `endpoint_url`: `string` (default: `None`)
37
+
38
+ Use this endpoint URL, if specified. Needed for connecting to non-AWS
39
+ S3 buckets. Takes precedence over `endpoint_url` in client_kwargs.
40
+
41
+ - `key`: `string` (default: `None`)
42
+
43
+ If not anonymous, use this access key ID, if specified. Takes precedence
44
+ over `aws_access_key_id` in client_kwargs.
45
+
46
+ - `secret`: `string` (default: `None`)
47
+
48
+ If not anonymous, use this secret access key, if specified. Takes
49
+ precedence over `aws_secret_access_key` in client_kwargs.
50
+
51
+ - `token`: `string` (default: `None`)
52
+
53
+ If not anonymous, use this security token, if specified
54
+
55
+ - `use_ssl`: `bool` (default: `True`)
56
+
57
+ Whether to use SSL in connections to S3; may be faster without, but
58
+ insecure. If `use_ssl` is also set in `client_kwargs`,
59
+ the value set in `client_kwargs` will take priority.
60
+
61
+ - `s3_additional_kwargs`: `dict` (default: `{}`)
62
+
63
+ Dict of parameters that are used when calling s3 api
64
+ methods. Typically used for things like "ServerSideEncryption".
65
+
66
+ - `client_kwargs`: `dict` (default: `{}`)
67
+
68
+ Dict of parameters for the botocore client.
69
+
70
+ - `requester_pays`: `bool` (default: `False`)
71
+
72
+ If RequesterPays buckets are supported.
73
+
74
+ - `default_block_size`: `int` (default: `None`)
75
+
76
+ If given, the default block size value used for `open()`, if no
77
+ specific value is given at call time. The built-in default is 5MB.
78
+
79
+ - `default_fill_cache`: `bool` (default: `True`)
80
+
81
+ Whether to use cache filling with open by default. Refer to `S3File.open`.
82
+
83
+ - `default_cache_type`: `string` (default: `"readahead"`)
84
+
85
+ If given, the default cache_type value used for `open()`. Set to `None`
86
+ if no caching is desired. See fsspec's documentation for other available
87
+ `cache_type` values. Default cache_type is `"readahead"`.
88
+
89
+ - `version_aware`: `bool` (default: `False`)
90
+
91
+ Whether to support bucket versioning. If enabled, this will require the
92
+ user to have the necessary IAM permissions for dealing with versioned
93
+ objects. Note that in the event that you only need to work with the
94
+ latest version of objects in a versioned bucket, and do not need the
95
+ VersionId for those objects, you should set `version_aware` to `False`
96
+ for performance reasons. When set to `True`, filesystem instances will
97
+ use the S3 `ListObjectVersions` API call to list directory contents,
98
+ which requires listing all historical object versions.
99
+
100
+ - `cache_regions`: `bool` (default: `False`)
101
+
102
+ Whether to cache bucket regions or not. Whenever a new bucket is used,
103
+ it will first find out which region it belongs to and then use the client
104
+ for that region.
105
+
106
+ - `asynchronous`: `bool` (default: `False`)
107
+
108
+ Whether this instance is to be used from inside coroutines.
109
+
110
+ - `config_kwargs`: `dict` (default: `{}`)
111
+
112
+ Dict of parameters passed to `botocore.client.Config`.
113
+
114
+ - `kwargs`: `dict` (default: `{}`)
115
+
116
+ Other parameters for core session.
117
+
118
+ - `session`: `aiobotocore.session.AioSession` (default: `None`)
119
+
120
+ Aiobotocore `AioSession` object to be used for all connections.
121
+ This session will be used in place of creating a new session inside S3FileSystem.
122
+
123
+ For example: `aiobotocore.session.AioSession(profile='test_user')`
124
+
125
+ - `max_concurrency`: `int` (default: `1`)
126
+
127
+ The maximum number of concurrent transfers to use per file for multipart
128
+ upload (`put()`) operations. Defaults to `1` (sequential). When used in
129
+ conjunction with `S3FileSystem.put(batch_size=...)` the maximum number of
130
+ simultaneous connections is `max_concurrency * batch_size`. We may extend
131
+ this parameter to affect `pipe()`, `cat()` and `get()`. Increasing this
132
+ value will result in higher memory usage during multipart upload operations (by
133
+ `max_concurrency * chunksize` bytes per file).
134
+
135
+
136
+ Example:
137
+ ```python
138
+ chain = DataChain.from_storage(
139
+ "s3://my-bucket/my-dir",
140
+ client_config = {
141
+ "endpoint_url": "<minio-endpoint-url>",
142
+ "key": "<minio-access-key>",
143
+ "secret": "<minio-secret-key>"
144
+ }
145
+ )
146
+ ```
147
+
148
+ ### Google Cloud Storage
149
+
150
+ DataChain uses [gcsfs](https://gcsfs.readthedocs.io/en/latest/) to interact with Google Cloud Storage. Authentication can be achieved by using any of the methods described in the [gcsfs documentation](https://gcsfs.readthedocs.io/en/latest/#credentials). You can also pass the following configuration parameters to the gcsfs filesystem as client_config dictionary.
151
+
152
+ - `project`: `string` (default: `None`)
153
+
154
+ The project to work under. Note that this is not the same as, but often
155
+ very similar to, the project name. This is required in order to list all
156
+ the buckets you have access to within a project and to create/delete
157
+ buckets, or update their access policies. If `token='google_default'`,
158
+ the value is overridden by the default, if `token='anon'`, the value is
159
+ ignored.
160
+
161
+ - `access`: `string` (default: `None`)
162
+
163
+ One of `"read_only"`, `"read_write"`, `"full_control"`. Full control implies
164
+ read/write as well as modifying metadata, e.g., access control.
165
+
166
+ - `token`: `None`, `dict` or `string` (default: `None`)
167
+
168
+ The token to use for authentication. If `None`, the default is used. If
169
+ a string, it is interpreted as a path to a token file. If a dict, it is
170
+ interpreted as a token dictionary, such as that provided by Google Cloud
171
+ Platform. See also description of authentication methods, from link above.
172
+
173
+ - `consistency`: `string` (default: `None`)
174
+
175
+ One of `"none"`, `"size"`, `"md5"`. Check method when writing files.
176
+ Can be overridden in `open()`.
177
+
178
+ - `cache_timeout`: `float` (default: `None`)
179
+
180
+ Cache expiration time in seconds for object metadata cache. Set
181
+ `cache_timeout <= 0` for no caching, `None` for no cache expiration.
182
+
183
+ - `secure_serialize`: `bool` (default: `None`)
184
+
185
+ Whether to use secure serialization. This is a deprecated option and
186
+ will be removed in future versions.
187
+
188
+ - `requester_pays`: `bool` or `str` (default: `False`)
189
+
190
+ Whether to use requester-pays requests. This will include your
191
+ project ID `project` in requests as the `userProject`, and you'll be
192
+ billed for accessing data from requester-pays buckets. Optionally,
193
+ pass a project-id here as a string to use that as the `userProject`.
194
+
195
+ - `session_kwargs`: `dict` (default: `{}`)
196
+
197
+ Passed on to `aiohttp.ClientSession`. Can contain, for example, proxy
198
+ settings.
199
+
200
+ - `endpoint_url`: `string` (default: `None`)
201
+
202
+ If given, use this URL (format: `protocol://host:port`, *without* any
203
+ path part) for communication. If not given, defaults to the value
204
+ of environment variable `"STORAGE_EMULATOR_HOST"`; if that is not set
205
+ either, will use the standard Google endpoint.
206
+
207
+ - `default_location`: `str` (default: `None`)
208
+
209
+ Default location where buckets are created, like `"US"` or `"EUROPE-WEST3"`.
210
+ You can find a list of all available locations here:
211
+ https://cloud.google.com/storage/docs/locations#available-locations
212
+
213
+ - `version_aware`: `bool` (default: `False`)
214
+
215
+ Whether to support object versioning. If enabled this will require the
216
+ user to have the necessary permissions for dealing with versioned objects.
217
+
218
+
219
+ ### Azure Blob Storage
220
+
221
+ DataChain uses [adlfs](https://fsspec.github.io/adlfs/) to interact with Azure Blob Storage. Authentication can be achieved by using any of the methods described in the [adlfs documentation](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials). You can also pass the following configuration parameters to the adlfs filesystem as client_config dictionary.
222
+
223
+ - `account_name`: `str` (default: `None`)
224
+
225
+ The storage account name. This is used to authenticate requests
226
+ signed with an account key and to construct the storage endpoint. It
227
+ is required unless a connection string is given, or if a custom
228
+ domain is used with anonymous authentication.
229
+
230
+ - `account_key`: `str` (default: `None`)
231
+
232
+ The storage account key. This is used for shared key authentication.
233
+ If none of account key, sas token or client_id is specified, anonymous access
234
+ will be used.
235
+
236
+ - `sas_token`: `str` (default: `None`)
237
+
238
+ A shared access signature token to use to authenticate requests
239
+ instead of the account key. If account key and sas token are both
240
+ specified, account key will be used to sign. If none of account key, sas token
241
+ or client_id are specified, anonymous access will be used.
242
+
243
+ - `request_session`: `requests.Session` (default: `None`)
244
+
245
+ The session object to use for http requests.
246
+
247
+ - `connection_string`: `str` (default: `None`)
248
+
249
+ If specified, this will override all other parameters besides
250
+ request session. See
251
+ http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/
252
+ for the connection string format.
253
+
254
+ - `credential`: `azure.core.credentials_async.AsyncTokenCredential` or SAS token (default: `None`)
255
+
256
+ The credentials with which to authenticate. Optional if the account URL already has a SAS token.
257
+ Can include an instance of TokenCredential class from azure.identity.aio.
258
+
259
+ - `blocksize`: `int` (default: `None`)
260
+
261
+ The block size to use for download/upload operations. Defaults to hardcoded value of
262
+ `BlockBlobService.MAX_BLOCK_SIZE`
263
+
264
+ - `client_id`: `str` (default: `None`)
265
+
266
+ Client ID to use when authenticating using an AD Service Principal client/secret.
267
+
268
+ - `client_secret`: `str` (default: `None`)
269
+
270
+ Client secret to use when authenticating using an AD Service Principal client/secret.
271
+
272
+ - `tenant_id`: `str` (default: `None`)
273
+
274
+ Tenant ID to use when authenticating using an AD Service Principal client/secret.
275
+
276
+ - `anon`: `boolean` (default: `None`)
277
+
278
+ The value to use for whether to attempt anonymous access if no other credential is
279
+ passed. By default (`None`), the `AZURE_STORAGE_ANON` environment variable is
280
+ checked. False values (`false`, `0`, `f`) will resolve to `False` and
281
+ anonymous access will not be attempted. Otherwise the value for `anon` resolves
282
+ to `True`.
283
+
284
+ - `default_fill_cache`: `bool` (default: `True`)
285
+
286
+ Whether to use cache filling with open by default
287
+
288
+ - `default_cache_type`: `string` (default: `"bytes"`)
289
+
290
+ If given, the default cache_type value used for `open()`. Set to `None` if no caching
291
+ is desired. Docs in fsspec.
292
+
293
+ - `version_aware`: `bool` (default: `False`)
294
+
295
+ Whether to support blob versioning. If enabled, this will require the user to have the
296
+ necessary permissions for dealing with versioned blobs.
297
+
298
+ - `assume_container_exists`: `bool` (default: `None`)
299
+
300
+ Set this to `True` to not check for existence of containers at all, assuming they exist.
301
+ `None` (default) means to warn in case of a failure when checking for existence of a container.
302
+ `False` throws if retrieving container properties fails, which might happen if your
303
+ authentication is only valid at the storage container level, and not the
304
+ storage account level.
305
+
306
+ - `max_concurrency`: `int` (default: `None`)
307
+
308
+ The number of concurrent connections to use when uploading or downloading a blob.
309
+ If `None` it will be inferred from `fsspec.asyn._get_batch_size()`.
310
+
311
+ - `timeout`: `int` (default: `None`)
312
+
313
+ Sets the server-side timeout when uploading or downloading a blob.
314
+
315
+ - `connection_timeout`: `int` (default: `None`)
316
+
317
+ The number of seconds the client will wait to establish a connection to the server
318
+ when uploading or downloading a blob.
319
+
320
+ - `read_timeout`: `int` (default: `None`)
321
+
322
+ The number of seconds the client will wait, between consecutive read operations,
323
+ for a response from the server while uploading or downloading a blob.
324
+
325
+ - `account_host`: `str` (default: `None`)
326
+
327
+ The storage account host. This string is the entire URL for the storage
328
+ after the `https://`, i.e. `"https://{account_host}"`. This parameter is only
329
+ required for Azure clouds where account urls do not end with `"blob.core.windows.net"`.
330
+ Note that the `account_name` parameter is still required.
331
+
332
+
333
+ ### Hugging Face
334
+
335
+ DataChain uses [huggingface_hub](https://pypi.org/project/huggingface-hub/) to interact with Hugging Face. You can pass the following parameters to client config to interact with Hugging Face.
336
+
337
+ - `token`: `str` or `bool` (default: `None`)
338
+
339
+ A valid user access token (string). Defaults to the locally saved
340
+ token, which is the recommended method for authentication (see
341
+ https://huggingface.co/docs/huggingface_hub/quick-start#authentication).
342
+ To disable authentication, pass `False`.
343
+
344
+ - `endpoint`: `str` (default: `None`)
345
+
346
+ Endpoint of the Hub. Defaults to `https://huggingface.co`.
@@ -22,13 +22,9 @@ def openimage_detect(args):
22
22
  detections = json.load(stream_json).get("detections", [])
23
23
 
24
24
  for i, detect in enumerate(detections):
25
- bbox = model.BBox.from_list(
26
- [
27
- detect["XMin"] * img.width,
28
- detect["XMax"] * img.width,
29
- detect["YMin"] * img.height,
30
- detect["YMax"] * img.height,
31
- ]
25
+ bbox = model.BBox.from_albumentations(
26
+ [detect[k] for k in ("XMin", "YMin", "XMax", "YMax")],
27
+ img_size=(img.width, img.height),
32
28
  )
33
29
 
34
30
  fstream = File(
@@ -1,11 +1,3 @@
1
- import os
2
-
3
- os.environ["YOLO_VERBOSE"] = "false"
4
-
5
-
6
- from io import BytesIO
7
-
8
- from PIL import Image
9
1
  from ultralytics import YOLO
10
2
 
11
3
  from datachain import C, DataChain, File
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloBBoxes
13
5
 
14
6
 
15
7
  def process_bboxes(yolo: YOLO, file: File) -> YoloBBoxes:
16
- results = yolo(Image.open(BytesIO(file.read())))
8
+ results = yolo(file.as_image_file().read(), verbose=False)
17
9
  return YoloBBoxes.from_results(results)
18
10
 
19
11
 
@@ -1,11 +1,3 @@
1
- import os
2
-
3
- os.environ["YOLO_VERBOSE"] = "false"
4
-
5
-
6
- from io import BytesIO
7
-
8
- from PIL import Image
9
1
  from ultralytics import YOLO
10
2
 
11
3
  from datachain import C, DataChain, File
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloPoses
13
5
 
14
6
 
15
7
  def process_poses(yolo: YOLO, file: File) -> YoloPoses:
16
- results = yolo(Image.open(BytesIO(file.read())))
8
+ results = yolo(file.as_image_file().read(), verbose=False)
17
9
  return YoloPoses.from_results(results)
18
10
 
19
11
 
@@ -1,11 +1,3 @@
1
- import os
2
-
3
- os.environ["YOLO_VERBOSE"] = "false"
4
-
5
-
6
- from io import BytesIO
7
-
8
- from PIL import Image
9
1
  from ultralytics import YOLO
10
2
 
11
3
  from datachain import C, DataChain, File
@@ -13,7 +5,7 @@ from datachain.model.ultralytics import YoloSegments
13
5
 
14
6
 
15
7
  def process_segments(yolo: YOLO, file: File) -> YoloSegments:
16
- results = yolo(Image.open(BytesIO(file.read())))
8
+ results = yolo(file.as_image_file().read(), verbose=False)
17
9
  return YoloSegments.from_results(results)
18
10
 
19
11
 
@@ -84,6 +84,7 @@ nav:
84
84
  - Torch: references/torch.md
85
85
  - Functions: references/func.md
86
86
  - Toolkit: references/toolkit.md
87
+ - 📡 Interacting with remote storage: references/remotes.md
87
88
  - 🤝 Contributing: contributing.md
88
89
 
89
90
  - DataChain Website ↗: https://datachain.ai" target="_blank"