datachain 0.10.0__tar.gz → 0.11.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (326) hide show
  1. {datachain-0.10.0 → datachain-0.11.11}/.github/workflows/benchmarks.yml +2 -2
  2. {datachain-0.10.0 → datachain-0.11.11}/.github/workflows/release.yml +2 -2
  3. {datachain-0.10.0 → datachain-0.11.11}/.github/workflows/tests.yml +15 -7
  4. {datachain-0.10.0 → datachain-0.11.11}/.pre-commit-config.yaml +1 -1
  5. {datachain-0.10.0 → datachain-0.11.11}/PKG-INFO +5 -4
  6. datachain-0.11.11/docs/references/remotes.md +346 -0
  7. {datachain-0.10.0 → datachain-0.11.11}/mkdocs.yml +1 -0
  8. {datachain-0.10.0 → datachain-0.11.11}/noxfile.py +14 -19
  9. {datachain-0.10.0 → datachain-0.11.11}/pyproject.toml +5 -6
  10. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/__init__.py +1 -0
  11. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/show.py +12 -1
  12. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/parser/utils.py +6 -0
  13. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/data_model.py +6 -0
  14. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/dc.py +91 -20
  15. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/file.py +52 -11
  16. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/signal_schema.py +194 -15
  17. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/nodes_thread_pool.py +32 -11
  18. datachain-0.11.11/src/datachain/script_meta.py +147 -0
  19. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/utils.py +3 -0
  20. {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/PKG-INFO +5 -4
  21. {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/SOURCES.txt +4 -0
  22. {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/requires.txt +5 -3
  23. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_datachain.py +311 -15
  24. datachain-0.11.11/tests/func/test_hidden_field.py +70 -0
  25. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_datachain.py +80 -0
  26. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_signal_schema.py +140 -0
  27. datachain-0.11.11/tests/unit/test_script_meta.py +119 -0
  28. {datachain-0.10.0 → datachain-0.11.11}/.cruft.json +0 -0
  29. {datachain-0.10.0 → datachain-0.11.11}/.gitattributes +0 -0
  30. {datachain-0.10.0 → datachain-0.11.11}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  31. {datachain-0.10.0 → datachain-0.11.11}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  32. {datachain-0.10.0 → datachain-0.11.11}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  33. {datachain-0.10.0 → datachain-0.11.11}/.github/codecov.yaml +0 -0
  34. {datachain-0.10.0 → datachain-0.11.11}/.github/dependabot.yml +0 -0
  35. {datachain-0.10.0 → datachain-0.11.11}/.github/workflows/tests-studio.yml +0 -0
  36. {datachain-0.10.0 → datachain-0.11.11}/.github/workflows/update-template.yaml +0 -0
  37. {datachain-0.10.0 → datachain-0.11.11}/.gitignore +0 -0
  38. {datachain-0.10.0 → datachain-0.11.11}/CODE_OF_CONDUCT.rst +0 -0
  39. {datachain-0.10.0 → datachain-0.11.11}/LICENSE +0 -0
  40. {datachain-0.10.0 → datachain-0.11.11}/README.rst +0 -0
  41. {datachain-0.10.0 → datachain-0.11.11}/docs/assets/captioned_cartoons.png +0 -0
  42. {datachain-0.10.0 → datachain-0.11.11}/docs/assets/datachain-white.svg +0 -0
  43. {datachain-0.10.0 → datachain-0.11.11}/docs/assets/datachain.svg +0 -0
  44. {datachain-0.10.0 → datachain-0.11.11}/docs/contributing.md +0 -0
  45. {datachain-0.10.0 → datachain-0.11.11}/docs/css/github-permalink-style.css +0 -0
  46. {datachain-0.10.0 → datachain-0.11.11}/docs/examples.md +0 -0
  47. {datachain-0.10.0 → datachain-0.11.11}/docs/index.md +0 -0
  48. {datachain-0.10.0 → datachain-0.11.11}/docs/overrides/main.html +0 -0
  49. {datachain-0.10.0 → datachain-0.11.11}/docs/quick-start.md +0 -0
  50. {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/arrowrow.md +0 -0
  51. {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/bbox.md +0 -0
  52. {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/file.md +0 -0
  53. {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/imagefile.md +0 -0
  54. {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/index.md +0 -0
  55. {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/pose.md +0 -0
  56. {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/segment.md +0 -0
  57. {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/tarvfile.md +0 -0
  58. {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/textfile.md +0 -0
  59. {datachain-0.10.0 → datachain-0.11.11}/docs/references/data-types/videofile.md +0 -0
  60. {datachain-0.10.0 → datachain-0.11.11}/docs/references/datachain.md +0 -0
  61. {datachain-0.10.0 → datachain-0.11.11}/docs/references/func.md +0 -0
  62. {datachain-0.10.0 → datachain-0.11.11}/docs/references/index.md +0 -0
  63. {datachain-0.10.0 → datachain-0.11.11}/docs/references/toolkit.md +0 -0
  64. {datachain-0.10.0 → datachain-0.11.11}/docs/references/torch.md +0 -0
  65. {datachain-0.10.0 → datachain-0.11.11}/docs/references/udf.md +0 -0
  66. {datachain-0.10.0 → datachain-0.11.11}/docs/tutorials.md +0 -0
  67. {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  68. {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  69. {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/openimage-detect.py +0 -0
  70. {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/ultralytics-bbox.py +0 -0
  71. {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/ultralytics-pose.py +0 -0
  72. {datachain-0.10.0 → datachain-0.11.11}/examples/computer_vision/ultralytics-segment.py +0 -0
  73. {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/common_sql_functions.py +0 -0
  74. {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/json-csv-reader.py +0 -0
  75. {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/torch-loader.py +0 -0
  76. {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/udfs/parallel.py +0 -0
  77. {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/udfs/simple.py +0 -0
  78. {datachain-0.10.0 → datachain-0.11.11}/examples/get_started/udfs/stateful.py +0 -0
  79. {datachain-0.10.0 → datachain-0.11.11}/examples/llm_and_nlp/claude-query.py +0 -0
  80. {datachain-0.10.0 → datachain-0.11.11}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  81. {datachain-0.10.0 → datachain-0.11.11}/examples/multimodal/clip_inference.py +0 -0
  82. {datachain-0.10.0 → datachain-0.11.11}/examples/multimodal/hf_pipeline.py +0 -0
  83. {datachain-0.10.0 → datachain-0.11.11}/examples/multimodal/openai_image_desc_lib.py +0 -0
  84. {datachain-0.10.0 → datachain-0.11.11}/examples/multimodal/wds.py +0 -0
  85. {datachain-0.10.0 → datachain-0.11.11}/examples/multimodal/wds_filtered.py +0 -0
  86. {datachain-0.10.0 → datachain-0.11.11}/setup.cfg +0 -0
  87. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/__init__.py +0 -0
  88. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/__main__.py +0 -0
  89. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/asyn.py +0 -0
  90. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cache.py +0 -0
  91. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/catalog/__init__.py +0 -0
  92. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/catalog/catalog.py +0 -0
  93. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/catalog/datasource.py +0 -0
  94. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/catalog/loader.py +0 -0
  95. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/__init__.py +0 -0
  96. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/datasets.py +0 -0
  97. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/du.py +0 -0
  98. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/index.py +0 -0
  99. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/ls.py +0 -0
  100. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/misc.py +0 -0
  101. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/commands/query.py +0 -0
  102. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/parser/__init__.py +0 -0
  103. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/parser/job.py +0 -0
  104. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/parser/studio.py +0 -0
  105. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/cli/utils.py +0 -0
  106. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/__init__.py +0 -0
  107. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/azure.py +0 -0
  108. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/fileslice.py +0 -0
  109. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/fsspec.py +0 -0
  110. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/gcs.py +0 -0
  111. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/hf.py +0 -0
  112. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/local.py +0 -0
  113. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/client/s3.py +0 -0
  114. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/config.py +0 -0
  115. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/__init__.py +0 -0
  116. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/db_engine.py +0 -0
  117. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/job.py +0 -0
  118. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/metastore.py +0 -0
  119. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/schema.py +0 -0
  120. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/serializer.py +0 -0
  121. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/sqlite.py +0 -0
  122. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/data_storage/warehouse.py +0 -0
  123. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/dataset.py +0 -0
  124. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/diff/__init__.py +0 -0
  125. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/error.py +0 -0
  126. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/fs/__init__.py +0 -0
  127. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/fs/reference.py +0 -0
  128. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/__init__.py +0 -0
  129. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/aggregate.py +0 -0
  130. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/array.py +0 -0
  131. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/base.py +0 -0
  132. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/conditional.py +0 -0
  133. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/func.py +0 -0
  134. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/numeric.py +0 -0
  135. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/path.py +0 -0
  136. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/random.py +0 -0
  137. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/string.py +0 -0
  138. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/func/window.py +0 -0
  139. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/job.py +0 -0
  140. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/__init__.py +0 -0
  141. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/arrow.py +0 -0
  142. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/clip.py +0 -0
  143. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/__init__.py +0 -0
  144. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/flatten.py +0 -0
  145. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/python_to_sql.py +0 -0
  146. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/sql_to_python.py +0 -0
  147. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/unflatten.py +0 -0
  148. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  149. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/dataset_info.py +0 -0
  150. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/hf.py +0 -0
  151. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/image.py +0 -0
  152. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/listing.py +0 -0
  153. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/listing_info.py +0 -0
  154. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/meta_formats.py +0 -0
  155. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/model_store.py +0 -0
  156. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/pytorch.py +0 -0
  157. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/settings.py +0 -0
  158. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/tar.py +0 -0
  159. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/text.py +0 -0
  160. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/udf.py +0 -0
  161. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/udf_signature.py +0 -0
  162. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/utils.py +0 -0
  163. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/video.py +0 -0
  164. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/webdataset.py +0 -0
  165. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/lib/webdataset_laion.py +0 -0
  166. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/listing.py +0 -0
  167. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/__init__.py +0 -0
  168. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/bbox.py +0 -0
  169. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/pose.py +0 -0
  170. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/segment.py +0 -0
  171. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/ultralytics/__init__.py +0 -0
  172. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/ultralytics/bbox.py +0 -0
  173. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/ultralytics/pose.py +0 -0
  174. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/model/ultralytics/segment.py +0 -0
  175. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/node.py +0 -0
  176. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/nodes_fetcher.py +0 -0
  177. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/progress.py +0 -0
  178. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/py.typed +0 -0
  179. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/__init__.py +0 -0
  180. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/batch.py +0 -0
  181. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/dataset.py +0 -0
  182. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/dispatch.py +0 -0
  183. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/metrics.py +0 -0
  184. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/params.py +0 -0
  185. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/queue.py +0 -0
  186. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/schema.py +0 -0
  187. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/session.py +0 -0
  188. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/udf.py +0 -0
  189. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/query/utils.py +0 -0
  190. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/remote/__init__.py +0 -0
  191. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/remote/studio.py +0 -0
  192. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/__init__.py +0 -0
  193. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/default/__init__.py +0 -0
  194. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/default/base.py +0 -0
  195. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/__init__.py +0 -0
  196. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/aggregate.py +0 -0
  197. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/array.py +0 -0
  198. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/conditional.py +0 -0
  199. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/numeric.py +0 -0
  200. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/path.py +0 -0
  201. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/random.py +0 -0
  202. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/functions/string.py +0 -0
  203. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/selectable.py +0 -0
  204. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/sqlite/__init__.py +0 -0
  205. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/sqlite/base.py +0 -0
  206. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/sqlite/types.py +0 -0
  207. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/sqlite/vector.py +0 -0
  208. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/types.py +0 -0
  209. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/sql/utils.py +0 -0
  210. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/studio.py +0 -0
  211. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/telemetry.py +0 -0
  212. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/toolkit/__init__.py +0 -0
  213. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/toolkit/split.py +0 -0
  214. {datachain-0.10.0 → datachain-0.11.11}/src/datachain/torch/__init__.py +0 -0
  215. {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/dependency_links.txt +0 -0
  216. {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/entry_points.txt +0 -0
  217. {datachain-0.10.0 → datachain-0.11.11}/src/datachain.egg-info/top_level.txt +0 -0
  218. {datachain-0.10.0 → datachain-0.11.11}/tests/__init__.py +0 -0
  219. {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/__init__.py +0 -0
  220. {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/conftest.py +0 -0
  221. {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  222. {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/datasets/.dvc/config +0 -0
  223. {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/datasets/.gitignore +0 -0
  224. {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  225. {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/test_datachain.py +0 -0
  226. {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/test_ls.py +0 -0
  227. {datachain-0.10.0 → datachain-0.11.11}/tests/benchmarks/test_version.py +0 -0
  228. {datachain-0.10.0 → datachain-0.11.11}/tests/conftest.py +0 -0
  229. {datachain-0.10.0 → datachain-0.11.11}/tests/data.py +0 -0
  230. {datachain-0.10.0 → datachain-0.11.11}/tests/examples/__init__.py +0 -0
  231. {datachain-0.10.0 → datachain-0.11.11}/tests/examples/test_examples.py +0 -0
  232. {datachain-0.10.0 → datachain-0.11.11}/tests/examples/test_wds_e2e.py +0 -0
  233. {datachain-0.10.0 → datachain-0.11.11}/tests/examples/wds_data.py +0 -0
  234. {datachain-0.10.0 → datachain-0.11.11}/tests/func/__init__.py +0 -0
  235. {datachain-0.10.0 → datachain-0.11.11}/tests/func/fake-service-account-credentials.json +0 -0
  236. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_catalog.py +0 -0
  237. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_client.py +0 -0
  238. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_cloud_transfer.py +0 -0
  239. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_data_storage.py +0 -0
  240. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_datachain_merge.py +0 -0
  241. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_dataset_query.py +0 -0
  242. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_datasets.py +0 -0
  243. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_feature_pickling.py +0 -0
  244. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_file.py +0 -0
  245. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_hf.py +0 -0
  246. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_listing.py +0 -0
  247. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_ls.py +0 -0
  248. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_meta_formats.py +0 -0
  249. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_metrics.py +0 -0
  250. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_pull.py +0 -0
  251. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_pytorch.py +0 -0
  252. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_query.py +0 -0
  253. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_session.py +0 -0
  254. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_toolkit.py +0 -0
  255. {datachain-0.10.0 → datachain-0.11.11}/tests/func/test_warehouse.py +0 -0
  256. {datachain-0.10.0 → datachain-0.11.11}/tests/scripts/feature_class.py +0 -0
  257. {datachain-0.10.0 → datachain-0.11.11}/tests/scripts/feature_class_exception.py +0 -0
  258. {datachain-0.10.0 → datachain-0.11.11}/tests/scripts/feature_class_parallel.py +0 -0
  259. {datachain-0.10.0 → datachain-0.11.11}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  260. {datachain-0.10.0 → datachain-0.11.11}/tests/scripts/name_len_slow.py +0 -0
  261. {datachain-0.10.0 → datachain-0.11.11}/tests/test_atomicity.py +0 -0
  262. {datachain-0.10.0 → datachain-0.11.11}/tests/test_cli_e2e.py +0 -0
  263. {datachain-0.10.0 → datachain-0.11.11}/tests/test_cli_studio.py +0 -0
  264. {datachain-0.10.0 → datachain-0.11.11}/tests/test_query_e2e.py +0 -0
  265. {datachain-0.10.0 → datachain-0.11.11}/tests/test_telemetry.py +0 -0
  266. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/__init__.py +0 -0
  267. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/__init__.py +0 -0
  268. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/conftest.py +0 -0
  269. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  270. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_arrow.py +0 -0
  271. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_clip.py +0 -0
  272. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  273. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_datachain_merge.py +0 -0
  274. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_diff.py +0 -0
  275. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_feature.py +0 -0
  276. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_feature_utils.py +0 -0
  277. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_file.py +0 -0
  278. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_hf.py +0 -0
  279. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_image.py +0 -0
  280. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_listing_info.py +0 -0
  281. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_models.py +0 -0
  282. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_python_to_sql.py +0 -0
  283. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_schema.py +0 -0
  284. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_sql_to_python.py +0 -0
  285. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_text.py +0 -0
  286. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_udf_signature.py +0 -0
  287. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_utils.py +0 -0
  288. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_video.py +0 -0
  289. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/lib/test_webdataset.py +0 -0
  290. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/__init__.py +0 -0
  291. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/sqlite/__init__.py +0 -0
  292. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/sqlite/test_types.py +0 -0
  293. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/sqlite/test_utils.py +0 -0
  294. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_array.py +0 -0
  295. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_conditional.py +0 -0
  296. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_path.py +0 -0
  297. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_random.py +0 -0
  298. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_selectable.py +0 -0
  299. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/sql/test_string.py +0 -0
  300. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_asyn.py +0 -0
  301. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_cache.py +0 -0
  302. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_catalog.py +0 -0
  303. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_catalog_loader.py +0 -0
  304. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_cli_parsing.py +0 -0
  305. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_client.py +0 -0
  306. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_client_gcs.py +0 -0
  307. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_client_s3.py +0 -0
  308. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_config.py +0 -0
  309. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_data_storage.py +0 -0
  310. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_database_engine.py +0 -0
  311. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_dataset.py +0 -0
  312. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_dispatch.py +0 -0
  313. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_fileslice.py +0 -0
  314. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_func.py +0 -0
  315. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_listing.py +0 -0
  316. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_metastore.py +0 -0
  317. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_module_exports.py +0 -0
  318. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_pytorch.py +0 -0
  319. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_query.py +0 -0
  320. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_query_metrics.py +0 -0
  321. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_query_params.py +0 -0
  322. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_serializer.py +0 -0
  323. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_session.py +0 -0
  324. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_utils.py +0 -0
  325. {datachain-0.10.0 → datachain-0.11.11}/tests/unit/test_warehouse.py +0 -0
  326. {datachain-0.10.0 → datachain-0.11.11}/tests/utils.py +0 -0
@@ -19,10 +19,10 @@ jobs:
19
19
  runs-on: ubuntu-latest
20
20
  steps:
21
21
  - uses: actions/checkout@v4
22
- - name: Set up Python 3.12
22
+ - name: Set up Python 3.13
23
23
  uses: actions/setup-python@v5
24
24
  with:
25
- python-version: '3.12'
25
+ python-version: '3.13'
26
26
 
27
27
  - name: Setup uv
28
28
  uses: astral-sh/setup-uv@v5
@@ -21,10 +21,10 @@ jobs:
21
21
  with:
22
22
  fetch-depth: 0
23
23
 
24
- - name: Set up Python 3.12
24
+ - name: Set up Python 3.13
25
25
  uses: actions/setup-python@v5
26
26
  with:
27
- python-version: '3.12'
27
+ python-version: '3.13'
28
28
 
29
29
  - name: Setup uv
30
30
  uses: astral-sh/setup-uv@v5
@@ -60,16 +60,16 @@ jobs:
60
60
  fail-fast: false
61
61
  matrix:
62
62
  os: [ubuntu-latest-8-cores]
63
- pyv: ['3.9', '3.10', '3.11', '3.12']
63
+ pyv: ['3.9', '3.10', '3.11', '3.12', '3.13']
64
64
  include:
65
65
  - os: macos-latest
66
66
  pyv: '3.9'
67
67
  - os: macos-latest
68
- pyv: '3.12'
68
+ pyv: '3.13'
69
69
  - os: windows-latest
70
70
  pyv: '3.9'
71
71
  - os: windows-latest
72
- pyv: '3.12'
72
+ pyv: '3.13'
73
73
 
74
74
  steps:
75
75
  - name: Check out the repository
@@ -80,6 +80,14 @@ jobs:
80
80
 
81
81
  - name: Set up FFmpeg
82
82
  uses: AnimMouse/setup-ffmpeg@v1
83
+ id: ffmpeg-install
84
+ continue-on-error: ${{ runner.os == 'macOS' }}
85
+
86
+ # https://github.com/AnimMouse/setup-ffmpeg/issues/5
87
+ - if: steps.ffmpeg-install.outcome == 'failure' && runner.os == 'macOS'
88
+ run: brew install ffmpeg
89
+ env:
90
+ HOMEBREW_NO_AUTO_UPDATE: "1"
83
91
 
84
92
  - name: Set up Python ${{ matrix.pyv }}
85
93
  uses: actions/setup-python@v5
@@ -132,14 +140,14 @@ jobs:
132
140
  fail-fast: false
133
141
  matrix:
134
142
  os: [ubuntu-latest, windows-latest]
135
- pyv: ['3.9', '3.12']
143
+ pyv: ['3.9', '3.13']
136
144
  group: ['get_started', 'computer_vision', 'llm_and_nlp', 'multimodal']
137
145
  exclude:
138
146
  - {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
139
- - {os: ubuntu-latest, pyv: '3.12', group: 'multimodal'}
147
+ - {os: ubuntu-latest, pyv: '3.13', group: 'multimodal'}
140
148
  include:
141
149
  - {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
142
- - {os: ubuntu-latest-4-cores, pyv: "3.12", group: multimodal}
150
+ - {os: ubuntu-latest-4-cores, pyv: "3.13", group: multimodal}
143
151
 
144
152
  steps:
145
153
  - uses: actions/checkout@v4
@@ -163,7 +171,7 @@ jobs:
163
171
 
164
172
  # HF runs against actual API - thus run it only once
165
173
  - name: Set hf token
166
- if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.12'
174
+ if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.13'
167
175
  run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
168
176
 
169
177
  - name: Run examples
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.9.6'
27
+ rev: 'v0.9.9'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.10.0
3
+ Version: 0.11.11
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.9
12
12
  Classifier: Programming Language :: Python :: 3.10
13
13
  Classifier: Programming Language :: Python :: 3.11
14
14
  Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
15
16
  Classifier: Development Status :: 2 - Pre-Alpha
16
17
  Requires-Python: >=3.9
17
18
  Description-Content-Type: text/x-rst
@@ -49,6 +50,7 @@ Requires-Dist: platformdirs
49
50
  Requires-Dist: dvc-studio-client<1,>=0.21
50
51
  Requires-Dist: tabulate
51
52
  Requires-Dist: websockets
53
+ Requires-Dist: tomli; python_version < "3.11"
52
54
  Provides-Extra: docs
53
55
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
54
56
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -70,9 +72,8 @@ Provides-Extra: hf
70
72
  Requires-Dist: numba>=0.60.0; extra == "hf"
71
73
  Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
72
74
  Provides-Extra: video
73
- Requires-Dist: av<14; extra == "video"
74
75
  Requires-Dist: ffmpeg-python; extra == "video"
75
- Requires-Dist: imageio[ffmpeg]; extra == "video"
76
+ Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
76
77
  Requires-Dist: opencv-python; extra == "video"
77
78
  Provides-Extra: tests
78
79
  Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
@@ -102,7 +103,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
102
103
  Requires-Dist: defusedxml; extra == "examples"
103
104
  Requires-Dist: accelerate; extra == "examples"
104
105
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
105
- Requires-Dist: ultralytics==8.3.74; extra == "examples"
106
+ Requires-Dist: ultralytics==8.3.82; extra == "examples"
106
107
  Requires-Dist: open_clip_torch; extra == "examples"
107
108
 
108
109
  ================
@@ -0,0 +1,346 @@
1
+ # Interacting with remote storage
2
+
3
+ DataChain supports reading and writing data from different remote storages using methods like `DataChain.from_storage` and `DataChain.to_storage`. The supported storages include: local file system, AWS S3 storage, Google Cloud Storage, Azure Blob Storage, Hugging Face and more.
4
+
5
+ Example implementation for reading and writing data from/to different remote storages:
6
+
7
+ ```python
8
+ from datachain import DataChain
9
+
10
+ dc = DataChain.from_storage("s3://bucket-name/path/to/data")
11
+ dc.to_storage("gs://bucket-name/path/to/data")
12
+ ```
13
+
14
+ DataChain uses [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) to interact with different remote storages. You can pass the following fsspec-supported URIs to `from_storage` and `to_storage` methods.
15
+
16
+ - Local file system: `file://path/to/data`
17
+ - AWS S3 storage: `s3://bucket-name/path/to/data`
18
+ - Google Cloud Storage: `gs://bucket-name/path/to/data`
19
+ - Azure Blob Storage: `az://container-name/path/to/data`
20
+ - Hugging Face: `hf://dataset-name`
21
+
22
+ ## Extra configuration
23
+ To configure the underlying filesystem, pass key-value pairs in a `client_config` dictionary; they are forwarded to the respective filesystem.
24
+
25
+
26
+ ### AWS S3 compatible storage
27
+
28
+ DataChain uses [s3fs](https://s3fs.readthedocs.io/en/latest/) to interact with AWS S3 storage. Authentication can be configured using standard AWS credential locations, such as `~/.aws/credentials` and `~/.aws/config`. You can also pass the following configuration parameters to the s3fs filesystem as `client_config` dictionary.
29
+
30
+ - `anon`: `bool` (default: `False`)
31
+
32
+ Whether to use anonymous connection (public buckets only). If `False`,
33
+ uses the key/secret given, or boto's credential resolver (client_kwargs,
34
+ environment variables, config files, EC2 IAM server, in that order)
35
+
36
+ - `endpoint_url`: `string` (default: `None`)
37
+
38
+ Use this endpoint URL, if specified. Needed for connecting to non-AWS
39
+ S3 buckets. Takes precedence over `endpoint_url` in client_kwargs.
40
+
41
+ - `key`: `string` (default: `None`)
42
+
43
+ If not anonymous, use this access key ID, if specified. Takes precedence
44
+ over `aws_access_key_id` in client_kwargs.
45
+
46
+ - `secret`: `string` (default: `None`)
47
+
48
+ If not anonymous, use this secret access key, if specified. Takes
49
+ precedence over `aws_secret_access_key` in client_kwargs.
50
+
51
+ - `token`: `string` (default: `None`)
52
+
53
+ If not anonymous, use this security token, if specified
54
+
55
+ - `use_ssl`: `bool` (default: `True`)
56
+
57
+ Whether to use SSL in connections to S3; may be faster without, but
58
+ insecure. If `use_ssl` is also set in `client_kwargs`,
59
+ the value set in `client_kwargs` will take priority.
60
+
61
+ - `s3_additional_kwargs`: `dict` (default: `{}`)
62
+
63
+ Dict of parameters that are used when calling s3 api
64
+ methods. Typically used for things like "ServerSideEncryption".
65
+
66
+ - `client_kwargs`: `dict` (default: `{}`)
67
+
68
+ Dict of parameters for the botocore client.
69
+
70
+ - `requester_pays`: `bool` (default: `False`)
71
+
72
+ If RequesterPays buckets are supported.
73
+
74
+ - `default_block_size`: `int` (default: `None`)
75
+
76
+ If given, the default block size value used for `open()`, if no
77
+ specific value is given at call time. The built-in default is 5MB.
78
+
79
+ - `default_fill_cache`: `bool` (default: `True`)
80
+
81
+ Whether to use cache filling with open by default. Refer to `S3File.open`.
82
+
83
+ - `default_cache_type`: `string` (default: `"readahead"`)
84
+
85
+ If given, the default cache_type value used for `open()`. Set to `None`
86
+ if no caching is desired. See fsspec's documentation for other available
87
+ `cache_type` values. Default cache_type is `"readahead"`.
88
+
89
+ - `version_aware`: `bool` (default: `False`)
90
+
91
+ Whether to support bucket versioning. If enabled, this will require the
92
+ user to have the necessary IAM permissions for dealing with versioned
93
+ objects. Note that in the event that you only need to work with the
94
+ latest version of objects in a versioned bucket, and do not need the
95
+ VersionId for those objects, you should set `version_aware` to `False`
96
+ for performance reasons. When set to `True`, filesystem instances will
97
+ use the S3 `ListObjectVersions` API call to list directory contents,
98
+ which requires listing all historical object versions.
99
+
100
+ - `cache_regions`: `bool` (default: `False`)
101
+
102
+ Whether to cache bucket regions or not. Whenever a new bucket is used,
103
+ it will first find out which region it belongs to and then use the client
104
+ for that region.
105
+
106
+ - `asynchronous`: `bool` (default: `False`)
107
+
108
+ Whether this instance is to be used from inside coroutines.
109
+
110
+ - `config_kwargs`: `dict` (default: `{}`)
111
+
112
+ Dict of parameters passed to `botocore.client.Config`.
113
+
114
+ - `kwargs`: `dict` (default: `{}`)
115
+
116
+ Other parameters for core session.
117
+
118
+ - `session`: `aiobotocore.session.AioSession` (default: `None`)
119
+
120
+ Aiobotocore `AioSession` object to be used for all connections.
121
+ This session will be used in place of creating a new session inside S3FileSystem.
122
+
123
+ For example: `aiobotocore.session.AioSession(profile='test_user')`
124
+
125
+ - `max_concurrency`: `int` (default: `1`)
126
+
127
+ The maximum number of concurrent transfers to use per file for multipart
128
+ upload (`put()`) operations. Defaults to `1` (sequential). When used in
129
+ conjunction with `S3FileSystem.put(batch_size=...)` the maximum number of
130
+ simultaneous connections is `max_concurrency * batch_size`. We may extend
131
+ this parameter to affect `pipe()`, `cat()` and `get()`. Increasing this
132
+ value will result in higher memory usage during multipart upload operations (by
133
+ `max_concurrency * chunksize` bytes per file).
134
+
135
+
136
+ Example:
137
+ ```python
138
+ chain = DataChain.from_storage(
139
+ "s3://my-bucket/my-dir",
140
+ client_config = {
141
+ "endpoint_url": "<minio-endpoint-url>",
142
+ "key": "<minio-access-key>",
143
+ "secret": "<minio-secret-key>"
144
+ }
145
+ )
146
+ ```
147
+
148
+ ### Google Cloud Storage
149
+
150
+ DataChain uses [gcsfs](https://gcsfs.readthedocs.io/en/latest/) to interact with Google Cloud Storage. Authentication can be achieved by using any of the methods described in the [gcsfs documentation](https://gcsfs.readthedocs.io/en/latest/#credentials). You can also pass the following configuration parameters to the gcsfs filesystem as a `client_config` dictionary.
151
+
152
+ - `project`: `string` (default: `None`)
153
+
154
+ The project to work under. Note that this is not the same as, but often
155
+ very similar to, the project name. This is required in order to list all
156
+ the buckets you have access to within a project and to create/delete
157
+ buckets, or update their access policies. If `token='google_default'`,
158
+ the value is overridden by the default, if `token='anon'`, the value is
159
+ ignored.
160
+
161
+ - `access`: `string` (default: `None`)
162
+
163
+ One of `"read_only"`, `"read_write"`, `"full_control"`. Full control implies
164
+ read/write as well as modifying metadata, e.g., access control.
165
+
166
+ - `token`: `None`, `dict` or `string` (default: `None`)
167
+
168
+ The token to use for authentication. If `None`, the default is used. If
169
+ a string, it is interpreted as a path to a token file. If a dict, it is
170
+ interpreted as a token dictionary, such as that provided by Google Cloud
171
+ Platform. See also description of authentication methods, from link above.
172
+
173
+ - `consistency`: `string` (default: `None`)
174
+
175
+ One of `"none"`, `"size"`, `"md5"`. Check method when writing files.
176
+ Can be overridden in `open()`.
177
+
178
+ - `cache_timeout`: `float` (default: `None`)
179
+
180
+ Cache expiration time in seconds for object metadata cache. Set
181
+ `cache_timeout <= 0` for no caching, `None` for no cache expiration.
182
+
183
+ - `secure_serialize`: `bool` (default: `None`)
184
+
185
+ Whether to use secure serialization. This is a deprecated option and
186
+ will be removed in future versions.
187
+
188
+ - `requester_pays`: `bool` or `str` (default: `False`)
189
+
190
+ Whether to use requester-pays requests. This will include your
191
+ project ID `project` in requests as the `userProject`, and you'll be
192
+ billed for accessing data from requester-pays buckets. Optionally,
193
+ pass a project-id here as a string to use that as the `userProject`.
194
+
195
+ - `session_kwargs`: `dict` (default: `{}`)
196
+
197
+ Passed on to `aiohttp.ClientSession`. Can contain, for example, proxy
198
+ settings.
199
+
200
+ - `endpoint_url`: `string` (default: `None`)
201
+
202
+ If given, use this URL (format: `protocol://host:port`, *without* any
203
+ path part) for communication. If not given, defaults to the value
204
+ of environment variable `"STORAGE_EMULATOR_HOST"`; if that is not set
205
+ either, will use the standard Google endpoint.
206
+
207
+ - `default_location`: `str` (default: `None`)
208
+
209
+ Default location where buckets are created, like `"US"` or `"EUROPE-WEST3"`.
210
+ You can find a list of all available locations here:
211
+ https://cloud.google.com/storage/docs/locations#available-locations
212
+
213
+ - `version_aware`: `bool` (default: `False`)
214
+
215
+ Whether to support object versioning. If enabled this will require the
216
+ user to have the necessary permissions for dealing with versioned objects.
217
+
218
+
219
+ ### Azure Blob Storage
220
+
221
+ DataChain uses [adlfs](https://fsspec.github.io/adlfs/) to interact with Azure Blob Storage. Authentication can be achieved by using any of the methods described in the [adlfs documentation](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials). You can also pass the following configuration parameters to the adlfs filesystem as a `client_config` dictionary.
222
+
223
+ - `account_name`: `str` (default: `None`)
224
+
225
+ The storage account name. This is used to authenticate requests
226
+ signed with an account key and to construct the storage endpoint. It
227
+ is required unless a connection string is given, or if a custom
228
+ domain is used with anonymous authentication.
229
+
230
+ - `account_key`: `str` (default: `None`)
231
+
232
+ The storage account key. This is used for shared key authentication.
233
+ If none of account key, sas token or client_id is specified, anonymous access
234
+ will be used.
235
+
236
+ - `sas_token`: `str` (default: `None`)
237
+
238
+ A shared access signature token to use to authenticate requests
239
+ instead of the account key. If account key and sas token are both
240
+ specified, account key will be used to sign. If none of account key, sas token
241
+ or client_id are specified, anonymous access will be used.
242
+
243
+ - `request_session`: `requests.Session` (default: `None`)
244
+
245
+ The session object to use for http requests.
246
+
247
+ - `connection_string`: `str` (default: `None`)
248
+
249
+ If specified, this will override all other parameters besides
250
+ request session. See
251
+ http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/
252
+ for the connection string format.
253
+
254
+ - `credential`: `azure.core.credentials_async.AsyncTokenCredential` or SAS token (default: `None`)
255
+
256
+ The credentials with which to authenticate. Optional if the account URL already has a SAS token.
257
+ Can include an instance of TokenCredential class from azure.identity.aio.
258
+
259
+ - `blocksize`: `int` (default: `None`)
260
+
261
+ The block size to use for download/upload operations. Defaults to hardcoded value of
262
+ `BlockBlobService.MAX_BLOCK_SIZE`
263
+
264
+ - `client_id`: `str` (default: `None`)
265
+
266
+ Client ID to use when authenticating using an AD Service Principal client/secret.
267
+
268
+ - `client_secret`: `str` (default: `None`)
269
+
270
+ Client secret to use when authenticating using an AD Service Principal client/secret.
271
+
272
+ - `tenant_id`: `str` (default: `None`)
273
+
274
+ Tenant ID to use when authenticating using an AD Service Principal client/secret.
275
+
276
+ - `anon`: `boolean` (default: `None`)
277
+
278
+ The value to use for whether to attempt anonymous access if no other credential is
279
+ passed. By default (`None`), the `AZURE_STORAGE_ANON` environment variable is
280
+ checked. False values (`false`, `0`, `f`) will resolve to `False` and
281
+ anonymous access will not be attempted. Otherwise the value for `anon` resolves
282
+ to `True`.
283
+
284
+ - `default_fill_cache`: `bool` (default: `True`)
285
+
286
+ Whether to use cache filling with open by default
287
+
288
+ - `default_cache_type`: `string` (default: `"bytes"`)
289
+
290
+ If given, the default cache_type value used for `open()`. Set to `None` if no caching
291
+ is desired. Docs in fsspec.
292
+
293
+ - `version_aware`: `bool` (default: `False`)
294
+
295
+ Whether to support blob versioning. If enabled, this will require the user to have the
296
+ necessary permissions for dealing with versioned blobs.
297
+
298
+ - `assume_container_exists`: `bool` (default: `None`)
299
+
300
+ Set this to `True` to not check for existence of containers at all, assuming they exist.
301
+ `None` (default) means to warn in case of a failure when checking for existence of a container.
302
+ `False` throws if retrieving container properties fails, which might happen if your
303
+ authentication is only valid at the storage container level, and not the
304
+ storage account level.
305
+
306
+ - `max_concurrency`: `int` (default: `None`)
307
+
308
+ The number of concurrent connections to use when uploading or downloading a blob.
309
+ If `None` it will be inferred from `fsspec.asyn._get_batch_size()`.
310
+
311
+ - `timeout`: `int` (default: `None`)
312
+
313
+ Sets the server-side timeout when uploading or downloading a blob.
314
+
315
+ - `connection_timeout`: `int` (default: `None`)
316
+
317
+ The number of seconds the client will wait to establish a connection to the server
318
+ when uploading or downloading a blob.
319
+
320
+ - `read_timeout`: `int` (default: `None`)
321
+
322
+ The number of seconds the client will wait, between consecutive read operations,
323
+ for a response from the server while uploading or downloading a blob.
324
+
325
+ - `account_host`: `str` (default: `None`)
326
+
327
+ The storage account host. This string is the entire URL for the storage
328
+ after the `https://`, i.e. `"https://{account_host}"`. This parameter is only
329
+ required for Azure clouds where account urls do not end with `"blob.core.windows.net"`.
330
+ Note that the `account_name` parameter is still required.
331
+
332
+
333
+ ### Hugging Face
334
+
335
+ DataChain uses [huggingface_hub](https://pypi.org/project/huggingface-hub/) to interact with Hugging Face. You can pass the following parameters to client config to interact with Hugging Face.
336
+
337
+ - `token`: `str` or `bool` (default: `None`)
338
+
339
+ A valid user access token (string). Defaults to the locally saved
340
+ token, which is the recommended method for authentication (see
341
+ https://huggingface.co/docs/huggingface_hub/quick-start#authentication).
342
+ To disable authentication, pass `False`.
343
+
344
+ - `endpoint`: `str` (default: `None`)
345
+
346
+ Endpoint of the Hub. Defaults to `https://huggingface.co`.
@@ -84,6 +84,7 @@ nav:
84
84
  - Torch: references/torch.md
85
85
  - Functions: references/func.md
86
86
  - Toolkit: references/toolkit.md
87
+ - 📡 Interacting with remote storage: references/remotes.md
87
88
  - 🤝 Contributing: contributing.md
88
89
 
89
90
  - DataChain Website ↗: https://datachain.ai" target="_blank"
@@ -1,13 +1,18 @@
1
1
  """Automation using nox."""
2
+ # /// script
3
+ # dependencies = ["nox"]
4
+ # ///
2
5
 
3
6
  import glob
4
- import os
5
7
 
6
8
  import nox
7
9
 
8
10
  nox.options.default_venv_backend = "uv|virtualenv"
9
11
  nox.options.reuse_existing_virtualenvs = True
10
12
  nox.options.sessions = "lint", "tests"
13
+
14
+ project = nox.project.load_toml()
15
+ python_versions = nox.project.python_versions(project)
11
16
  locations = "src", "tests"
12
17
 
13
18
 
@@ -29,12 +34,12 @@ def bench(session: nox.Session) -> None:
29
34
  )
30
35
 
31
36
 
32
- @nox.session(python=["3.9", "3.10", "3.11", "3.12", "pypy3.9", "pypy3.10"])
37
+ @nox.session(python=python_versions)
33
38
  def tests(session: nox.Session) -> None:
34
39
  session.install(".[tests]")
35
40
  env = {"COVERAGE_FILE": f".coverage.{session.python}"}
36
- if session.python == "3.12":
37
- # improve performance of tests in Python 3.12 when used with coverage
41
+ if session.python in ("3.12", "3.13"):
42
+ # improve performance of tests in Python>=3.12 when used with coverage
38
43
  # https://github.com/nedbat/coveragepy/issues/1665
39
44
  # https://github.com/python/cpython/issues/107674
40
45
  env["COVERAGE_CORE"] = "sysmon"
@@ -68,21 +73,7 @@ def build(session: nox.Session) -> None:
68
73
  session.run("twine", "check", *dists, silent=True)
69
74
 
70
75
 
71
- @nox.session
72
- def dev(session: nox.Session) -> None:
73
- """Sets up a python development environment for the project."""
74
- args = session.posargs or ("venv",)
75
- venv_dir = os.fsdecode(os.path.abspath(args[0]))
76
-
77
- session.log(f"Setting up virtual environment in {venv_dir}")
78
- session.install("virtualenv")
79
- session.run("virtualenv", venv_dir, silent=True)
80
-
81
- python = os.path.join(venv_dir, "bin/python")
82
- session.run(python, "-m", "pip", "install", "-e", ".[dev]", external=True)
83
-
84
-
85
- @nox.session(python=["3.9", "3.10", "3.11", "3.12", "pypy3.9", "pypy3.10"])
76
+ @nox.session(python=python_versions)
86
77
  def examples(session: nox.Session) -> None:
87
78
  session.install(".[examples]")
88
79
  session.run(
@@ -93,3 +84,7 @@ def examples(session: nox.Session) -> None:
93
84
  "examples",
94
85
  *session.posargs,
95
86
  )
87
+
88
+
89
+ if __name__ == "__main__":
90
+ nox.main()
@@ -14,6 +14,7 @@ classifiers = [
14
14
  "Programming Language :: Python :: 3.10",
15
15
  "Programming Language :: Python :: 3.11",
16
16
  "Programming Language :: Python :: 3.12",
17
+ "Programming Language :: Python :: 3.13",
17
18
  "Development Status :: 2 - Pre-Alpha"
18
19
  ]
19
20
  requires-python = ">=3.9"
@@ -51,7 +52,8 @@ dependencies = [
51
52
  "platformdirs",
52
53
  "dvc-studio-client>=0.21,<1",
53
54
  "tabulate",
54
- "websockets"
55
+ "websockets",
56
+ "tomli;python_version<'3.11'"
55
57
  ]
56
58
 
57
59
  [project.optional-dependencies]
@@ -81,11 +83,8 @@ hf = [
81
83
  "datasets[audio,vision]>=2.21.0"
82
84
  ]
83
85
  video = [
84
- # Use 'av<14' because of incompatibility with imageio
85
- # See https://github.com/PyAV-Org/PyAV/discussions/1700
86
- "av<14",
87
86
  "ffmpeg-python",
88
- "imageio[ffmpeg]",
87
+ "imageio[ffmpeg,pyav]>=2.37.0",
89
88
  "opencv-python"
90
89
  ]
91
90
  tests = [
@@ -118,7 +117,7 @@ examples = [
118
117
  "defusedxml",
119
118
  "accelerate",
120
119
  "huggingface_hub[hf_transfer]",
121
- "ultralytics==8.3.74",
120
+ "ultralytics==8.3.82",
122
121
  "open_clip_torch"
123
122
  ]
124
123
 
@@ -215,6 +215,7 @@ def handle_show_command(args, catalog):
215
215
  columns=args.columns,
216
216
  no_collapse=args.no_collapse,
217
217
  schema=args.schema,
218
+ include_hidden=args.hidden,
218
219
  )
219
220
 
220
221
 
@@ -1,6 +1,8 @@
1
1
  from collections.abc import Sequence
2
2
  from typing import TYPE_CHECKING, Optional
3
3
 
4
+ from datachain.lib.signal_schema import SignalSchema
5
+
4
6
  if TYPE_CHECKING:
5
7
  from datachain.catalog import Catalog
6
8
 
@@ -14,6 +16,7 @@ def show(
14
16
  columns: Sequence[str] = (),
15
17
  no_collapse: bool = False,
16
18
  schema: bool = False,
19
+ include_hidden: bool = False,
17
20
  ) -> None:
18
21
  from datachain import Session
19
22
  from datachain.lib.dc import DataChain
@@ -23,6 +26,13 @@ def show(
23
26
  dataset = catalog.get_dataset(name)
24
27
  dataset_version = dataset.get_version(version or dataset.latest_version)
25
28
 
29
+ if include_hidden:
30
+ hidden_fields = []
31
+ else:
32
+ hidden_fields = SignalSchema.get_flatten_hidden_fields(
33
+ dataset_version.feature_schema
34
+ )
35
+
26
36
  query = (
27
37
  DatasetQuery(name=name, version=version, catalog=catalog)
28
38
  .select(*columns)
@@ -30,7 +40,8 @@ def show(
30
40
  .offset(offset)
31
41
  )
32
42
  records = query.to_db_records()
33
- show_records(records, collapse_columns=not no_collapse)
43
+ show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
44
+
34
45
  if schema and dataset_version.feature_schema:
35
46
  print("\nSchema:")
36
47
  session = Session.get(catalog=catalog)
@@ -98,3 +98,9 @@ def add_show_args(parser: ArgumentParser) -> None:
98
98
  default=False,
99
99
  help="Do not collapse the columns",
100
100
  )
101
+ parser.add_argument(
102
+ "--hidden",
103
+ action="store_true",
104
+ default=False,
105
+ help="Show hidden fields",
106
+ )
@@ -26,6 +26,7 @@ class DataModel(BaseModel):
26
26
  """Pydantic model wrapper that registers model with `DataChain`."""
27
27
 
28
28
  _version: ClassVar[int] = 1
29
+ _hidden_fields: ClassVar[list[str]] = []
29
30
 
30
31
  @classmethod
31
32
  def __pydantic_init_subclass__(cls):
@@ -41,6 +42,11 @@ class DataModel(BaseModel):
41
42
  for val in models:
42
43
  ModelStore.register(val)
43
44
 
45
+ @classmethod
46
+ def hidden_fields(cls) -> list[str]:
47
+ """Returns a list of fields that should be hidden from the user."""
48
+ return cls._hidden_fields
49
+
44
50
 
45
51
  def is_chain_type(t: type) -> bool:
46
52
  """Return true if type is supported by `DataChain`."""