datachain 0.8.13__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (325) hide show
  1. {datachain-0.8.13 → datachain-0.9.0}/.github/workflows/tests-studio.yml +3 -0
  2. {datachain-0.8.13 → datachain-0.9.0}/.github/workflows/tests.yml +3 -0
  3. {datachain-0.8.13 → datachain-0.9.0}/.pre-commit-config.yaml +2 -2
  4. {datachain-0.8.13 → datachain-0.9.0}/PKG-INFO +13 -5
  5. datachain-0.9.0/docs/references/data-types/arrowrow.md +3 -0
  6. datachain-0.9.0/docs/references/data-types/bbox.md +5 -0
  7. datachain-0.9.0/docs/references/data-types/file.md +35 -0
  8. datachain-0.9.0/docs/references/data-types/imagefile.md +15 -0
  9. datachain-0.8.13/docs/references/datatype.md → datachain-0.9.0/docs/references/data-types/index.md +1 -1
  10. datachain-0.9.0/docs/references/data-types/pose.md +5 -0
  11. datachain-0.9.0/docs/references/data-types/segment.md +3 -0
  12. datachain-0.9.0/docs/references/data-types/tarvfile.md +3 -0
  13. datachain-0.9.0/docs/references/data-types/textfile.md +13 -0
  14. datachain-0.9.0/docs/references/data-types/videofile.md +29 -0
  15. datachain-0.9.0/docs/references/index.md +23 -0
  16. datachain-0.9.0/docs/references/toolkit.md +5 -0
  17. datachain-0.9.0/examples/get_started/common_sql_functions.py +54 -0
  18. {datachain-0.8.13 → datachain-0.9.0}/mkdocs.yml +12 -2
  19. {datachain-0.8.13 → datachain-0.9.0}/noxfile.py +7 -1
  20. {datachain-0.8.13 → datachain-0.9.0}/pyproject.toml +16 -5
  21. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/__init__.py +10 -0
  22. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/catalog/catalog.py +32 -9
  23. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/__init__.py +2 -0
  24. datachain-0.9.0/src/datachain/cli/commands/datasets.py +175 -0
  25. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/parser/__init__.py +62 -12
  26. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/parser/job.py +14 -4
  27. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/parser/studio.py +8 -0
  28. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/parser/utils.py +20 -1
  29. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/dataset.py +7 -4
  30. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/diff/__init__.py +78 -128
  31. datachain-0.9.0/src/datachain/fs/reference.py +21 -0
  32. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/__init__.py +3 -1
  33. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/conditional.py +66 -2
  34. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/job.py +1 -1
  35. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/arrow.py +1 -11
  36. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/dc.py +2 -0
  37. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/file.py +292 -5
  38. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/hf.py +1 -1
  39. datachain-0.9.0/src/datachain/lib/video.py +223 -0
  40. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/dataset.py +28 -3
  41. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/remote/studio.py +13 -6
  42. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/studio.py +34 -12
  43. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/utils.py +12 -2
  44. {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/PKG-INFO +13 -5
  45. {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/SOURCES.txt +16 -3
  46. {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/requires.txt +13 -4
  47. {datachain-0.8.13 → datachain-0.9.0}/tests/conftest.py +11 -5
  48. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_catalog.py +44 -0
  49. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_pull.py +42 -4
  50. {datachain-0.8.13 → datachain-0.9.0}/tests/test_cli_e2e.py +1 -1
  51. {datachain-0.8.13 → datachain-0.9.0}/tests/test_cli_studio.py +33 -12
  52. datachain-0.9.0/tests/unit/lib/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  53. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_diff.py +16 -5
  54. datachain-0.9.0/tests/unit/lib/test_video.py +229 -0
  55. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_conditional.py +32 -0
  56. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_cli_parsing.py +2 -1
  57. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_func.py +26 -0
  58. datachain-0.8.13/docs/references/file.md +0 -22
  59. datachain-0.8.13/docs/references/index.md +0 -14
  60. datachain-0.8.13/examples/get_started/common_sql_functions.py +0 -113
  61. datachain-0.8.13/src/datachain/cli/commands/datasets.py +0 -109
  62. {datachain-0.8.13 → datachain-0.9.0}/.cruft.json +0 -0
  63. {datachain-0.8.13 → datachain-0.9.0}/.gitattributes +0 -0
  64. {datachain-0.8.13 → datachain-0.9.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  65. {datachain-0.8.13 → datachain-0.9.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  66. {datachain-0.8.13 → datachain-0.9.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  67. {datachain-0.8.13 → datachain-0.9.0}/.github/codecov.yaml +0 -0
  68. {datachain-0.8.13 → datachain-0.9.0}/.github/dependabot.yml +0 -0
  69. {datachain-0.8.13 → datachain-0.9.0}/.github/workflows/benchmarks.yml +0 -0
  70. {datachain-0.8.13 → datachain-0.9.0}/.github/workflows/release.yml +0 -0
  71. {datachain-0.8.13 → datachain-0.9.0}/.github/workflows/update-template.yaml +0 -0
  72. {datachain-0.8.13 → datachain-0.9.0}/.gitignore +0 -0
  73. {datachain-0.8.13 → datachain-0.9.0}/CODE_OF_CONDUCT.rst +0 -0
  74. {datachain-0.8.13 → datachain-0.9.0}/LICENSE +0 -0
  75. {datachain-0.8.13 → datachain-0.9.0}/README.rst +0 -0
  76. {datachain-0.8.13 → datachain-0.9.0}/docs/assets/captioned_cartoons.png +0 -0
  77. {datachain-0.8.13 → datachain-0.9.0}/docs/assets/datachain-white.svg +0 -0
  78. {datachain-0.8.13 → datachain-0.9.0}/docs/assets/datachain.svg +0 -0
  79. {datachain-0.8.13 → datachain-0.9.0}/docs/contributing.md +0 -0
  80. {datachain-0.8.13 → datachain-0.9.0}/docs/css/github-permalink-style.css +0 -0
  81. {datachain-0.8.13 → datachain-0.9.0}/docs/examples.md +0 -0
  82. {datachain-0.8.13 → datachain-0.9.0}/docs/index.md +0 -0
  83. {datachain-0.8.13 → datachain-0.9.0}/docs/overrides/main.html +0 -0
  84. {datachain-0.8.13 → datachain-0.9.0}/docs/quick-start.md +0 -0
  85. {datachain-0.8.13 → datachain-0.9.0}/docs/references/datachain.md +0 -0
  86. {datachain-0.8.13 → datachain-0.9.0}/docs/references/func.md +0 -0
  87. {datachain-0.8.13 → datachain-0.9.0}/docs/references/torch.md +0 -0
  88. {datachain-0.8.13 → datachain-0.9.0}/docs/references/udf.md +0 -0
  89. {datachain-0.8.13 → datachain-0.9.0}/docs/tutorials.md +0 -0
  90. {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  91. {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  92. {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/openimage-detect.py +0 -0
  93. {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  94. {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  95. {datachain-0.8.13 → datachain-0.9.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  96. {datachain-0.8.13 → datachain-0.9.0}/examples/get_started/json-csv-reader.py +0 -0
  97. {datachain-0.8.13 → datachain-0.9.0}/examples/get_started/torch-loader.py +0 -0
  98. {datachain-0.8.13 → datachain-0.9.0}/examples/get_started/udfs/parallel.py +0 -0
  99. {datachain-0.8.13 → datachain-0.9.0}/examples/get_started/udfs/simple.py +0 -0
  100. {datachain-0.8.13 → datachain-0.9.0}/examples/get_started/udfs/stateful.py +0 -0
  101. {datachain-0.8.13 → datachain-0.9.0}/examples/llm_and_nlp/claude-query.py +0 -0
  102. {datachain-0.8.13 → datachain-0.9.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  103. {datachain-0.8.13 → datachain-0.9.0}/examples/multimodal/clip_inference.py +0 -0
  104. {datachain-0.8.13 → datachain-0.9.0}/examples/multimodal/hf_pipeline.py +0 -0
  105. {datachain-0.8.13 → datachain-0.9.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  106. {datachain-0.8.13 → datachain-0.9.0}/examples/multimodal/wds.py +0 -0
  107. {datachain-0.8.13 → datachain-0.9.0}/examples/multimodal/wds_filtered.py +0 -0
  108. {datachain-0.8.13 → datachain-0.9.0}/setup.cfg +0 -0
  109. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/__main__.py +0 -0
  110. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/asyn.py +0 -0
  111. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cache.py +0 -0
  112. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/catalog/__init__.py +0 -0
  113. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/catalog/datasource.py +0 -0
  114. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/catalog/loader.py +0 -0
  115. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/__init__.py +0 -0
  116. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/du.py +0 -0
  117. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/index.py +0 -0
  118. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/ls.py +0 -0
  119. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/misc.py +0 -0
  120. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/query.py +0 -0
  121. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/commands/show.py +0 -0
  122. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/cli/utils.py +0 -0
  123. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/__init__.py +0 -0
  124. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/azure.py +0 -0
  125. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/fileslice.py +0 -0
  126. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/fsspec.py +0 -0
  127. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/gcs.py +0 -0
  128. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/hf.py +0 -0
  129. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/local.py +0 -0
  130. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/client/s3.py +0 -0
  131. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/config.py +0 -0
  132. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/__init__.py +0 -0
  133. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/db_engine.py +0 -0
  134. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/job.py +0 -0
  135. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/metastore.py +0 -0
  136. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/schema.py +0 -0
  137. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/serializer.py +0 -0
  138. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/sqlite.py +0 -0
  139. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/data_storage/warehouse.py +0 -0
  140. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/error.py +0 -0
  141. {datachain-0.8.13/src/datachain/lib → datachain-0.9.0/src/datachain/fs}/__init__.py +0 -0
  142. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/aggregate.py +0 -0
  143. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/array.py +0 -0
  144. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/base.py +0 -0
  145. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/func.py +0 -0
  146. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/numeric.py +0 -0
  147. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/path.py +0 -0
  148. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/random.py +0 -0
  149. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/string.py +0 -0
  150. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/func/window.py +0 -0
  151. {datachain-0.8.13/src/datachain/lib/convert → datachain-0.9.0/src/datachain/lib}/__init__.py +0 -0
  152. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/clip.py +0 -0
  153. {datachain-0.8.13/src/datachain/remote → datachain-0.9.0/src/datachain/lib/convert}/__init__.py +0 -0
  154. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/convert/flatten.py +0 -0
  155. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  156. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  157. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/convert/unflatten.py +0 -0
  158. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  159. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/data_model.py +0 -0
  160. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/dataset_info.py +0 -0
  161. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/image.py +0 -0
  162. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/listing.py +0 -0
  163. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/listing_info.py +0 -0
  164. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/meta_formats.py +0 -0
  165. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/model_store.py +0 -0
  166. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/pytorch.py +0 -0
  167. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/settings.py +0 -0
  168. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/signal_schema.py +0 -0
  169. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/tar.py +0 -0
  170. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/text.py +0 -0
  171. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/udf.py +0 -0
  172. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/udf_signature.py +0 -0
  173. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/utils.py +0 -0
  174. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/webdataset.py +0 -0
  175. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/lib/webdataset_laion.py +0 -0
  176. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/listing.py +0 -0
  177. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/__init__.py +0 -0
  178. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/bbox.py +0 -0
  179. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/pose.py +0 -0
  180. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/segment.py +0 -0
  181. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  182. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  183. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/ultralytics/pose.py +0 -0
  184. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/model/ultralytics/segment.py +0 -0
  185. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/node.py +0 -0
  186. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/nodes_fetcher.py +0 -0
  187. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/nodes_thread_pool.py +0 -0
  188. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/progress.py +0 -0
  189. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/py.typed +0 -0
  190. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/__init__.py +0 -0
  191. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/batch.py +0 -0
  192. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/dispatch.py +0 -0
  193. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/metrics.py +0 -0
  194. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/params.py +0 -0
  195. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/queue.py +0 -0
  196. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/schema.py +0 -0
  197. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/session.py +0 -0
  198. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/udf.py +0 -0
  199. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/query/utils.py +0 -0
  200. {datachain-0.8.13/src/datachain/sql/functions → datachain-0.9.0/src/datachain/remote}/__init__.py +0 -0
  201. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/__init__.py +0 -0
  202. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/default/__init__.py +0 -0
  203. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/default/base.py +0 -0
  204. {datachain-0.8.13/tests/benchmarks → datachain-0.9.0/src/datachain/sql/functions}/__init__.py +0 -0
  205. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/aggregate.py +0 -0
  206. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/array.py +0 -0
  207. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/conditional.py +0 -0
  208. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/numeric.py +0 -0
  209. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/path.py +0 -0
  210. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/random.py +0 -0
  211. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/functions/string.py +0 -0
  212. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/selectable.py +0 -0
  213. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  214. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/sqlite/base.py +0 -0
  215. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/sqlite/types.py +0 -0
  216. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/sqlite/vector.py +0 -0
  217. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/types.py +0 -0
  218. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/sql/utils.py +0 -0
  219. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/telemetry.py +0 -0
  220. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/toolkit/__init__.py +0 -0
  221. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/toolkit/split.py +0 -0
  222. {datachain-0.8.13 → datachain-0.9.0}/src/datachain/torch/__init__.py +0 -0
  223. {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  224. {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/entry_points.txt +0 -0
  225. {datachain-0.8.13 → datachain-0.9.0}/src/datachain.egg-info/top_level.txt +0 -0
  226. {datachain-0.8.13 → datachain-0.9.0}/tests/__init__.py +0 -0
  227. {datachain-0.8.13/tests/examples → datachain-0.9.0/tests/benchmarks}/__init__.py +0 -0
  228. {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/conftest.py +0 -0
  229. {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  230. {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  231. {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/datasets/.gitignore +0 -0
  232. {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  233. {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/test_datachain.py +0 -0
  234. {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/test_ls.py +0 -0
  235. {datachain-0.8.13 → datachain-0.9.0}/tests/benchmarks/test_version.py +0 -0
  236. {datachain-0.8.13 → datachain-0.9.0}/tests/data.py +0 -0
  237. {datachain-0.8.13/tests/func → datachain-0.9.0/tests/examples}/__init__.py +0 -0
  238. {datachain-0.8.13 → datachain-0.9.0}/tests/examples/test_examples.py +0 -0
  239. {datachain-0.8.13 → datachain-0.9.0}/tests/examples/test_wds_e2e.py +0 -0
  240. {datachain-0.8.13 → datachain-0.9.0}/tests/examples/wds_data.py +0 -0
  241. {datachain-0.8.13/tests/unit → datachain-0.9.0/tests/func}/__init__.py +0 -0
  242. {datachain-0.8.13 → datachain-0.9.0}/tests/func/fake-service-account-credentials.json +0 -0
  243. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_client.py +0 -0
  244. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_data_storage.py +0 -0
  245. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_datachain.py +0 -0
  246. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_datachain_merge.py +0 -0
  247. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_dataset_query.py +0 -0
  248. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_datasets.py +0 -0
  249. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_feature_pickling.py +0 -0
  250. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_file.py +0 -0
  251. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_hf.py +0 -0
  252. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_listing.py +0 -0
  253. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_ls.py +0 -0
  254. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_meta_formats.py +0 -0
  255. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_metrics.py +0 -0
  256. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_pytorch.py +0 -0
  257. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_query.py +0 -0
  258. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_session.py +0 -0
  259. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_toolkit.py +0 -0
  260. {datachain-0.8.13 → datachain-0.9.0}/tests/func/test_warehouse.py +0 -0
  261. {datachain-0.8.13 → datachain-0.9.0}/tests/scripts/feature_class.py +0 -0
  262. {datachain-0.8.13 → datachain-0.9.0}/tests/scripts/feature_class_exception.py +0 -0
  263. {datachain-0.8.13 → datachain-0.9.0}/tests/scripts/feature_class_parallel.py +0 -0
  264. {datachain-0.8.13 → datachain-0.9.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  265. {datachain-0.8.13 → datachain-0.9.0}/tests/scripts/name_len_slow.py +0 -0
  266. {datachain-0.8.13 → datachain-0.9.0}/tests/test_atomicity.py +0 -0
  267. {datachain-0.8.13 → datachain-0.9.0}/tests/test_query_e2e.py +0 -0
  268. {datachain-0.8.13 → datachain-0.9.0}/tests/test_telemetry.py +0 -0
  269. {datachain-0.8.13/tests/unit/lib → datachain-0.9.0/tests/unit}/__init__.py +0 -0
  270. {datachain-0.8.13/tests/unit/sql → datachain-0.9.0/tests/unit/lib}/__init__.py +0 -0
  271. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/conftest.py +0 -0
  272. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_arrow.py +0 -0
  273. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_clip.py +0 -0
  274. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_datachain.py +0 -0
  275. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  276. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  277. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_feature.py +0 -0
  278. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_feature_utils.py +0 -0
  279. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_file.py +0 -0
  280. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_hf.py +0 -0
  281. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_image.py +0 -0
  282. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_listing_info.py +0 -0
  283. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_models.py +0 -0
  284. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  285. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_schema.py +0 -0
  286. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_signal_schema.py +0 -0
  287. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  288. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_text.py +0 -0
  289. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_udf_signature.py +0 -0
  290. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_utils.py +0 -0
  291. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/lib/test_webdataset.py +0 -0
  292. {datachain-0.8.13/tests/unit/sql/sqlite → datachain-0.9.0/tests/unit/sql}/__init__.py +0 -0
  293. /datachain-0.8.13/src/datachain/lib/vfile.py → /datachain-0.9.0/tests/unit/sql/sqlite/__init__.py +0 -0
  294. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  295. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  296. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_array.py +0 -0
  297. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_path.py +0 -0
  298. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_random.py +0 -0
  299. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_selectable.py +0 -0
  300. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/sql/test_string.py +0 -0
  301. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_asyn.py +0 -0
  302. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_cache.py +0 -0
  303. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_catalog.py +0 -0
  304. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_catalog_loader.py +0 -0
  305. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_client.py +0 -0
  306. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_client_gcs.py +0 -0
  307. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_client_s3.py +0 -0
  308. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_config.py +0 -0
  309. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_data_storage.py +0 -0
  310. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_database_engine.py +0 -0
  311. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_dataset.py +0 -0
  312. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_dispatch.py +0 -0
  313. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_fileslice.py +0 -0
  314. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_listing.py +0 -0
  315. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_metastore.py +0 -0
  316. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_module_exports.py +0 -0
  317. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_pytorch.py +0 -0
  318. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_query.py +0 -0
  319. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_query_metrics.py +0 -0
  320. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_query_params.py +0 -0
  321. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_serializer.py +0 -0
  322. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_session.py +0 -0
  323. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_utils.py +0 -0
  324. {datachain-0.8.13 → datachain-0.9.0}/tests/unit/test_warehouse.py +0 -0
  325. {datachain-0.8.13 → datachain-0.9.0}/tests/utils.py +0 -0
@@ -75,6 +75,9 @@ jobs:
75
75
  path: './backend/datachain'
76
76
  fetch-depth: 0
77
77
 
78
+ - name: Set up FFmpeg
79
+ uses: AnimMouse/setup-ffmpeg@v1
80
+
78
81
  - name: Set up Python ${{ matrix.pyv }}
79
82
  uses: actions/setup-python@v5
80
83
  with:
@@ -78,6 +78,9 @@ jobs:
78
78
  fetch-depth: 0
79
79
  ref: ${{ github.event.pull_request.head.sha || github.ref }}
80
80
 
81
+ - name: Set up FFmpeg
82
+ uses: AnimMouse/setup-ffmpeg@v1
83
+
81
84
  - name: Set up Python ${{ matrix.pyv }}
82
85
  uses: actions/setup-python@v5
83
86
  with:
@@ -24,13 +24,13 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.9.3'
27
+ rev: 'v0.9.6'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
31
31
  - id: ruff-format
32
32
  - repo: https://github.com/codespell-project/codespell
33
- rev: v2.4.0
33
+ rev: v2.4.1
34
34
  hooks:
35
35
  - id: codespell
36
36
  additional_dependencies: ["tomli"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.8.13
3
+ Version: 0.9.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -21,10 +21,12 @@ Requires-Dist: tomlkit
21
21
  Requires-Dist: tqdm
22
22
  Requires-Dist: numpy<3,>=1
23
23
  Requires-Dist: pandas>=2.0.0
24
+ Requires-Dist: packaging
24
25
  Requires-Dist: pyarrow
25
26
  Requires-Dist: typing-extensions
26
27
  Requires-Dist: python-dateutil>=2
27
28
  Requires-Dist: attrs>=21.3.0
29
+ Requires-Dist: fsspec>=2024.2.0
28
30
  Requires-Dist: s3fs>=2024.2.0
29
31
  Requires-Dist: gcsfs>=2024.2.0
30
32
  Requires-Dist: adlfs>=2024.2.0
@@ -42,7 +44,7 @@ Requires-Dist: Pillow<12,>=10.0.0
42
44
  Requires-Dist: msgpack<2,>=1.0.4
43
45
  Requires-Dist: psutil
44
46
  Requires-Dist: huggingface_hub
45
- Requires-Dist: iterative-telemetry>=0.0.9
47
+ Requires-Dist: iterative-telemetry>=0.0.10
46
48
  Requires-Dist: platformdirs
47
49
  Requires-Dist: dvc-studio-client<1,>=0.21
48
50
  Requires-Dist: tabulate
@@ -54,6 +56,7 @@ Requires-Dist: mkdocs-material==9.5.22; extra == "docs"
54
56
  Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
55
57
  Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
56
58
  Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
59
+ Requires-Dist: eval-type-backport; extra == "docs"
57
60
  Provides-Extra: torch
58
61
  Requires-Dist: torch>=2.1.0; extra == "torch"
59
62
  Requires-Dist: torchvision; extra == "torch"
@@ -66,8 +69,13 @@ Requires-Dist: usearch; extra == "vector"
66
69
  Provides-Extra: hf
67
70
  Requires-Dist: numba>=0.60.0; extra == "hf"
68
71
  Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
72
+ Provides-Extra: video
73
+ Requires-Dist: av<14; extra == "video"
74
+ Requires-Dist: ffmpeg-python; extra == "video"
75
+ Requires-Dist: imageio[ffmpeg]; extra == "video"
76
+ Requires-Dist: opencv-python; extra == "video"
69
77
  Provides-Extra: tests
70
- Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
78
+ Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
71
79
  Requires-Dist: pytest<9,>=8; extra == "tests"
72
80
  Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
73
81
  Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
@@ -83,7 +91,7 @@ Requires-Dist: requests-mock; extra == "tests"
83
91
  Requires-Dist: scipy; extra == "tests"
84
92
  Provides-Extra: dev
85
93
  Requires-Dist: datachain[docs,tests]; extra == "dev"
86
- Requires-Dist: mypy==1.14.1; extra == "dev"
94
+ Requires-Dist: mypy==1.15.0; extra == "dev"
87
95
  Requires-Dist: types-python-dateutil; extra == "dev"
88
96
  Requires-Dist: types-pytz; extra == "dev"
89
97
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -94,7 +102,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
94
102
  Requires-Dist: defusedxml; extra == "examples"
95
103
  Requires-Dist: accelerate; extra == "examples"
96
104
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
97
- Requires-Dist: ultralytics==8.3.68; extra == "examples"
105
+ Requires-Dist: ultralytics==8.3.74; extra == "examples"
98
106
  Requires-Dist: open_clip_torch; extra == "examples"
99
107
 
100
108
  ================
@@ -0,0 +1,3 @@
1
+ # ArrowRow
2
+
3
+ ::: datachain.lib.file.ArrowRow
@@ -0,0 +1,5 @@
1
+ # Bounding Box
2
+
3
+ ::: datachain.model.bbox.BBox
4
+
5
+ ::: datachain.model.bbox.OBBox
@@ -0,0 +1,35 @@
1
+ # File
2
+
3
+ `File` is a special [`DataModel`](index.md#datachain.lib.data_model.DataModel),
4
+ which is automatically generated when a `DataChain` is created from files,
5
+ such as in [`DataChain.from_storage`](../datachain.md#datachain.lib.dc.DataChain.from_storage):
6
+
7
+ ```python
8
+ from datachain import DataChain
9
+
10
+ dc = DataChain.from_storage("gs://datachain-demo/dogs-and-cats")
11
+ dc.print_schema()
12
+ ```
13
+
14
+ Output:
15
+
16
+ ```
17
+ file: File@v1
18
+ source: str
19
+ path: str
20
+ size: int
21
+ version: str
22
+ etag: str
23
+ is_latest: bool
24
+ last_modified: datetime
25
+ location: Union[dict, list[dict], NoneType]
26
+ ```
27
+
28
+ `File` classes include various metadata fields describing the underlying file,
29
+ along with methods to read and manipulate file contents.
30
+
31
+ ::: datachain.lib.file.File
32
+
33
+ ::: datachain.lib.file.FileError
34
+
35
+ ::: datachain.lib.file.TarVFile
@@ -0,0 +1,15 @@
1
+ # ImageFile
2
+
3
+ `ImageFile` extends [`File`](file.md) and provides additional methods for working with image files.
4
+
5
+ `ImageFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.DataChain.from_storage), using `type="image"` param:
6
+
7
+ ```python
8
+ from datachain import DataChain
9
+
10
+ dc = DataChain.from_storage("s3://bucket-name/", type="image")
11
+ ```
12
+
13
+ ::: datachain.lib.file.ImageFile
14
+
15
+ ::: datachain.lib.file.Image
@@ -1,4 +1,4 @@
1
- # DataType
1
+ # Data Types
2
2
 
3
3
  Data types supported by `DataChain` must be of type
4
4
  [`DataType`](#datachain.lib.data_model.DataType). `DataType` includes most Python types
@@ -0,0 +1,5 @@
1
+ # Pose
2
+
3
+ ::: datachain.model.pose.Pose
4
+
5
+ ::: datachain.model.pose.Pose3D
@@ -0,0 +1,3 @@
1
+ # Segment
2
+
3
+ ::: datachain.model.segment.Segment
@@ -0,0 +1,3 @@
1
+ # TarVFile
2
+
3
+ ::: datachain.lib.file.TarVFile
@@ -0,0 +1,13 @@
1
+ # TextFile
2
+
3
+ `TextFile` extends [`File`](file.md) and provides additional methods for working with text files.
4
+
5
+ `TextFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.DataChain.from_storage), using `type="text"` param:
6
+
7
+ ```python
8
+ from datachain import DataChain
9
+
10
+ dc = DataChain.from_storage("s3://bucket-name/", type="text")
11
+ ```
12
+
13
+ ::: datachain.lib.file.TextFile
@@ -0,0 +1,29 @@
1
+ # VideoFile
2
+
3
+ `VideoFile` extends [`File`](file.md) and provides additional methods for working with video files.
4
+
5
+ `VideoFile` instances are created when a `DataChain` is initialized [from storage](../datachain.md#datachain.lib.dc.DataChain.from_storage) with the `type="video"` parameter:
6
+
7
+ ```python
8
+ from datachain import DataChain
9
+
10
+ dc = DataChain.from_storage("s3://bucket-name/", type="video")
11
+ ```
12
+
13
+ There are additional models for working with video files:
14
+
15
+ - `VideoFrame` - represents a single frame of a video file.
16
+ - `VideoFragment` - represents a fragment of a video file.
17
+
18
+ These are virtual models that do not create physical files.
19
+ Instead, they represent data within the `VideoFile` they refer to.
20
+ If you need to save the data, you can use the `save` method of these models,
21
+ allowing you to save data locally or upload it to a storage service.
22
+
23
+ ::: datachain.lib.file.VideoFile
24
+
25
+ ::: datachain.lib.file.VideoFrame
26
+
27
+ ::: datachain.lib.file.VideoFragment
28
+
29
+ ::: datachain.lib.file.Video
@@ -0,0 +1,23 @@
1
+ ---
2
+ title: API Reference
3
+ ---
4
+
5
+ # API Reference
6
+
7
+ DataChain's API is organized into several modules:
8
+
9
+ - [DataChain](./datachain.md) - Core chain operations and dataset management
10
+ - [Data Types](./data-types/index.md) - Supported data types and schema definitions
11
+ - [File](./data-types/file.md) - File handling and storage operations
12
+ - [TextFile](./data-types/textfile.md) - Text file
13
+ - [ImageFile](./data-types/imagefile.md) - Image file
14
+ - [VideoFile](./data-types/videofile.md) - Video file
15
+ - [TarVFile](./data-types/tarvfile.md) - Virtual file model for files extracted from tar archives
16
+ - [ArrowRow](./data-types/arrowrow.md) - Working with Arrow-supported files
17
+ - [BBox](./data-types/bbox.md) - Bounding box data type
18
+ - [Pose](./data-types/pose.md) - Pose data type
19
+ - [Segment](./data-types/segment.md) - Segment data type
20
+ - [UDF](./udf.md) - User-defined functions and transformations
21
+ - [Functions](./func.md) - Built-in functions for data manipulation and analysis
22
+ - [Torch](./torch.md) - PyTorch data loading utilities
23
+ - [Toolkit](./toolkit.md) - Functions for common DS/ML operations
@@ -0,0 +1,5 @@
1
+ # Toolkit
2
+
3
+ Here you can find the toolkit functions on top of DataChain for common DS/ML operations (e.g. train/test split). Import these functions from `datachain.toolkit`.
4
+
5
+ ::: datachain.toolkit
@@ -0,0 +1,54 @@
1
+ from datachain import C, DataChain
2
+ from datachain.func import array, greatest, least, path, string
3
+
4
+
5
+ def num_chars_udf(file):
6
+ parts = file.name.split(".")
7
+ if len(parts) > 1:
8
+ return (list(parts[1]),)
9
+ return ([],)
10
+
11
+
12
+ dc = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", anon=True)
13
+ dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
14
+ "file.path", "num_chars"
15
+ ).show(5)
16
+
17
+ (
18
+ dc.mutate(
19
+ length=string.length(path.name(C("file.path"))),
20
+ parts=string.split(path.name(C("file.path")), "."),
21
+ )
22
+ .select("file.path", "length", "parts")
23
+ .show(5)
24
+ )
25
+
26
+ (
27
+ dc.mutate(
28
+ stem=path.file_stem(C("file.path")),
29
+ ext=path.file_ext(C("file.path")),
30
+ )
31
+ .select("file.path", "stem", "ext")
32
+ .show(5)
33
+ )
34
+
35
+ parts = string.split(path.name(C("file.path")), ".")
36
+ chain = dc.mutate(
37
+ isdog=array.contains(parts, "dog"),
38
+ iscat=array.contains(parts, "cat"),
39
+ )
40
+ chain.select("file.path", "isdog", "iscat").show(5)
41
+
42
+ chain = dc.mutate(
43
+ a=array.length(string.split("file.path", "/")),
44
+ b=array.length(string.split(path.name("file.path"), "0")),
45
+ )
46
+
47
+ (
48
+ chain.mutate(
49
+ greatest=greatest(chain.column("a"), C("b")),
50
+ least=least(chain.column("a"), C("b")),
51
+ )
52
+ .select("a", "b", "greatest", "least")
53
+ .show(10)
54
+ )
@@ -69,11 +69,21 @@ nav:
69
69
  - 🐍 API Reference:
70
70
  - Overview: references/index.md
71
71
  - DataChain: references/datachain.md
72
- - DataType: references/datatype.md
73
- - File: references/file.md
72
+ - Data Types:
73
+ - Overview: references/data-types/index.md
74
+ - File: references/data-types/file.md
75
+ - TextFile: references/data-types/textfile.md
76
+ - ImageFile: references/data-types/imagefile.md
77
+ - VideoFile: references/data-types/videofile.md
78
+ - TarVFile: references/data-types/tarvfile.md
79
+ - ArrowRow: references/data-types/arrowrow.md
80
+ - BBox: references/data-types/bbox.md
81
+ - Pose: references/data-types/pose.md
82
+ - Segment: references/data-types/segment.md
74
83
  - UDF: references/udf.md
75
84
  - Torch: references/torch.md
76
85
  - Functions: references/func.md
86
+ - Toolkit: references/toolkit.md
77
87
  - 🤝 Contributing: contributing.md
78
88
 
79
89
  - DataChain Website ↗: https://datachain.ai" target="_blank"
@@ -32,6 +32,12 @@ def bench(session: nox.Session) -> None:
32
32
  @nox.session(python=["3.9", "3.10", "3.11", "3.12", "pypy3.9", "pypy3.10"])
33
33
  def tests(session: nox.Session) -> None:
34
34
  session.install(".[tests]")
35
+ env = {"COVERAGE_FILE": f".coverage.{session.python}"}
36
+ if session.python == "3.12":
37
+ # improve performance of tests in Python 3.12 when used with coverage
38
+ # https://github.com/nedbat/coveragepy/issues/1665
39
+ # https://github.com/python/cpython/issues/107674
40
+ env["COVERAGE_CORE"] = "sysmon"
35
41
  session.run(
36
42
  "pytest",
37
43
  "--cov",
@@ -41,7 +47,7 @@ def tests(session: nox.Session) -> None:
41
47
  "--numprocesses=logical",
42
48
  "--dist=loadgroup",
43
49
  *session.posargs,
44
- env={"COVERAGE_FILE": f".coverage.{session.python}"},
50
+ env=env,
45
51
  )
46
52
 
47
53
 
@@ -24,10 +24,12 @@ dependencies = [
24
24
  "tqdm",
25
25
  "numpy>=1,<3",
26
26
  "pandas>=2.0.0",
27
+ "packaging",
27
28
  "pyarrow",
28
29
  "typing-extensions",
29
30
  "python-dateutil>=2",
30
31
  "attrs>=21.3.0",
32
+ "fsspec>=2024.2.0",
31
33
  "s3fs>=2024.2.0",
32
34
  "gcsfs>=2024.2.0",
33
35
  "adlfs>=2024.2.0",
@@ -45,7 +47,7 @@ dependencies = [
45
47
  "msgpack>=1.0.4,<2",
46
48
  "psutil",
47
49
  "huggingface_hub",
48
- "iterative-telemetry>=0.0.9",
50
+ "iterative-telemetry>=0.0.10",
49
51
  "platformdirs",
50
52
  "dvc-studio-client>=0.21,<1",
51
53
  "tabulate",
@@ -59,7 +61,8 @@ docs = [
59
61
  "mkdocs-material==9.5.22",
60
62
  "mkdocs-section-index>=0.3.6",
61
63
  "mkdocstrings-python>=1.6.3",
62
- "mkdocs-literate-nav>=0.6.1"
64
+ "mkdocs-literate-nav>=0.6.1",
65
+ "eval-type-backport"
63
66
  ]
64
67
  torch = [
65
68
  "torch>=2.1.0",
@@ -77,8 +80,16 @@ hf = [
77
80
  "numba>=0.60.0",
78
81
  "datasets[audio,vision]>=2.21.0"
79
82
  ]
83
+ video = [
84
+ # Use 'av<14' because of incompatibility with imageio
85
+ # See https://github.com/PyAV-Org/PyAV/discussions/1700
86
+ "av<14",
87
+ "ffmpeg-python",
88
+ "imageio[ffmpeg]",
89
+ "opencv-python"
90
+ ]
80
91
  tests = [
81
- "datachain[torch,remote,vector,hf]",
92
+ "datachain[torch,remote,vector,hf,video]",
82
93
  "pytest>=8,<9",
83
94
  "pytest-sugar>=0.9.6",
84
95
  "pytest-cov>=4.1.0",
@@ -95,7 +106,7 @@ tests = [
95
106
  ]
96
107
  dev = [
97
108
  "datachain[docs,tests]",
98
- "mypy==1.14.1",
109
+ "mypy==1.15.0",
99
110
  "types-python-dateutil",
100
111
  "types-pytz",
101
112
  "types-PyYAML",
@@ -107,7 +118,7 @@ examples = [
107
118
  "defusedxml",
108
119
  "accelerate",
109
120
  "huggingface_hub[hf_transfer]",
110
- "ultralytics==8.3.68",
121
+ "ultralytics==8.3.74",
111
122
  "open_clip_torch"
112
123
  ]
113
124
 
@@ -4,9 +4,14 @@ from datachain.lib.file import (
4
4
  ArrowRow,
5
5
  File,
6
6
  FileError,
7
+ Image,
7
8
  ImageFile,
8
9
  TarVFile,
9
10
  TextFile,
11
+ Video,
12
+ VideoFile,
13
+ VideoFragment,
14
+ VideoFrame,
10
15
  )
11
16
  from datachain.lib.model_store import ModelStore
12
17
  from datachain.lib.udf import Aggregator, Generator, Mapper
@@ -27,6 +32,7 @@ __all__ = [
27
32
  "File",
28
33
  "FileError",
29
34
  "Generator",
35
+ "Image",
30
36
  "ImageFile",
31
37
  "Mapper",
32
38
  "ModelStore",
@@ -34,6 +40,10 @@ __all__ = [
34
40
  "Sys",
35
41
  "TarVFile",
36
42
  "TextFile",
43
+ "Video",
44
+ "VideoFile",
45
+ "VideoFragment",
46
+ "VideoFrame",
37
47
  "is_chain_type",
38
48
  "metrics",
39
49
  "param",
@@ -89,10 +89,6 @@ PULL_DATASET_SLEEP_INTERVAL = 0.1 # sleep time while waiting for chunk to be av
89
89
  PULL_DATASET_CHECK_STATUS_INTERVAL = 20 # interval to check export status in Studio
90
90
 
91
91
 
92
- def raise_remote_error(error_message: str) -> NoReturn:
93
- raise DataChainError(f"Error from server: {error_message}")
94
-
95
-
96
92
  def noop(_: str):
97
93
  pass
98
94
 
@@ -211,14 +207,14 @@ class DatasetRowsFetcher(NodesThreadPool):
211
207
  self.remote_ds_name, self.remote_ds_version
212
208
  )
213
209
  if not export_status_response.ok:
214
- raise_remote_error(export_status_response.message)
210
+ raise DataChainError(export_status_response.message)
215
211
 
216
212
  export_status = export_status_response.data["status"] # type: ignore [index]
217
213
 
218
214
  if export_status == "failed":
219
- raise_remote_error("Dataset export failed in Studio")
215
+ raise DataChainError("Dataset export failed in Studio")
220
216
  if export_status == "removed":
221
- raise_remote_error("Dataset export removed in Studio")
217
+ raise DataChainError("Dataset export removed in Studio")
222
218
 
223
219
  self.last_status_check = time.time()
224
220
 
@@ -1101,6 +1097,31 @@ class Catalog:
1101
1097
  def get_dataset(self, name: str) -> DatasetRecord:
1102
1098
  return self.metastore.get_dataset(name)
1103
1099
 
1100
+ def get_dataset_with_remote_fallback(
1101
+ self, name: str, version: Optional[int] = None
1102
+ ) -> DatasetRecord:
1103
+ try:
1104
+ ds = self.get_dataset(name)
1105
+ if version and not ds.has_version(version):
1106
+ raise DatasetVersionNotFoundError(
1107
+ f"Dataset {name} does not have version {version}"
1108
+ )
1109
+ return ds
1110
+
1111
+ except (DatasetNotFoundError, DatasetVersionNotFoundError):
1112
+ print("Dataset not found in local catalog, trying to get from studio")
1113
+
1114
+ remote_ds_uri = f"{DATASET_PREFIX}{name}"
1115
+ if version:
1116
+ remote_ds_uri += f"@v{version}"
1117
+
1118
+ self.pull_dataset(
1119
+ remote_ds_uri=remote_ds_uri,
1120
+ local_ds_name=name,
1121
+ local_ds_version=version,
1122
+ )
1123
+ return self.get_dataset(name)
1124
+
1104
1125
  def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
1105
1126
  """Returns dataset that contains version with specific uuid"""
1106
1127
  for dataset in self.ls_datasets():
@@ -1113,7 +1134,7 @@ class Catalog:
1113
1134
 
1114
1135
  info_response = studio_client.dataset_info(name)
1115
1136
  if not info_response.ok:
1116
- raise_remote_error(info_response.message)
1137
+ raise DataChainError(info_response.message)
1117
1138
 
1118
1139
  dataset_info = info_response.data
1119
1140
  assert isinstance(dataset_info, dict)
@@ -1209,6 +1230,8 @@ class Catalog:
1209
1230
  **kwargs,
1210
1231
  ) -> str:
1211
1232
  client_config = client_config or self.client_config
1233
+ if client_config.get("anon"):
1234
+ content_disposition = None
1212
1235
  client = Client.get_client(source, self.cache, **client_config)
1213
1236
  return client.url(
1214
1237
  path,
@@ -1407,7 +1430,7 @@ class Catalog:
1407
1430
  remote_ds_name, remote_ds_version.version
1408
1431
  )
1409
1432
  if not export_response.ok:
1410
- raise_remote_error(export_response.message)
1433
+ raise DataChainError(export_response.message)
1411
1434
 
1412
1435
  signed_urls = export_response.data
1413
1436
 
@@ -160,6 +160,8 @@ def handle_dataset_command(args, catalog):
160
160
  local=args.local,
161
161
  all=args.all,
162
162
  team=args.team,
163
+ latest_only=not args.versions,
164
+ name=args.name,
163
165
  ),
164
166
  "rm": lambda: rm_dataset(
165
167
  catalog,