datachain 0.8.3__tar.gz → 0.8.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (304)
  1. {datachain-0.8.3 → datachain-0.8.4}/.github/workflows/tests-studio.yml +1 -1
  2. {datachain-0.8.3 → datachain-0.8.4}/.github/workflows/tests.yml +1 -1
  3. {datachain-0.8.3 → datachain-0.8.4}/.pre-commit-config.yaml +1 -1
  4. {datachain-0.8.3 → datachain-0.8.4}/PKG-INFO +5 -5
  5. {datachain-0.8.3 → datachain-0.8.4}/mkdocs.yml +1 -0
  6. {datachain-0.8.3 → datachain-0.8.4}/pyproject.toml +4 -4
  7. datachain-0.8.4/src/datachain/cli/__init__.py +311 -0
  8. datachain-0.8.4/src/datachain/cli/commands/__init__.py +29 -0
  9. datachain-0.8.4/src/datachain/cli/commands/datasets.py +129 -0
  10. datachain-0.8.4/src/datachain/cli/commands/du.py +14 -0
  11. datachain-0.8.4/src/datachain/cli/commands/index.py +12 -0
  12. datachain-0.8.4/src/datachain/cli/commands/ls.py +169 -0
  13. datachain-0.8.4/src/datachain/cli/commands/misc.py +28 -0
  14. datachain-0.8.4/src/datachain/cli/commands/query.py +53 -0
  15. datachain-0.8.4/src/datachain/cli/commands/show.py +38 -0
  16. datachain-0.8.4/src/datachain/cli/parser/__init__.py +547 -0
  17. datachain-0.8.4/src/datachain/cli/parser/job.py +120 -0
  18. datachain-0.8.4/src/datachain/cli/parser/studio.py +126 -0
  19. datachain-0.8.4/src/datachain/cli/parser/utils.py +63 -0
  20. datachain-0.8.3/src/datachain/cli_utils.py → datachain-0.8.4/src/datachain/cli/utils.py +27 -1
  21. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/client/fsspec.py +8 -2
  22. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/func/__init__.py +2 -2
  23. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/func/conditional.py +52 -0
  24. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/func/func.py +5 -1
  25. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/arrow.py +4 -0
  26. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/dc.py +3 -0
  27. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/file.py +1 -1
  28. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/listing.py +19 -1
  29. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/signal_schema.py +89 -27
  30. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/progress.py +2 -2
  31. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/studio.py +58 -38
  32. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/utils.py +1 -1
  33. {datachain-0.8.3 → datachain-0.8.4}/src/datachain.egg-info/PKG-INFO +5 -5
  34. {datachain-0.8.3 → datachain-0.8.4}/src/datachain.egg-info/SOURCES.txt +14 -2
  35. {datachain-0.8.3 → datachain-0.8.4}/src/datachain.egg-info/requires.txt +4 -4
  36. {datachain-0.8.3 → datachain-0.8.4}/tests/conftest.py +1 -1
  37. {datachain-0.8.3 → datachain-0.8.4}/tests/test_cli_e2e.py +6 -6
  38. {datachain-0.8.3 → datachain-0.8.4}/tests/test_cli_studio.py +18 -15
  39. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_arrow.py +9 -0
  40. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_datachain.py +13 -5
  41. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_signal_schema.py +280 -32
  42. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/sql/test_conditional.py +43 -0
  43. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_cli_parsing.py +2 -17
  44. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_config.py +9 -9
  45. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_func.py +19 -1
  46. datachain-0.8.3/src/datachain/cli.py +0 -1475
  47. {datachain-0.8.3 → datachain-0.8.4}/.cruft.json +0 -0
  48. {datachain-0.8.3 → datachain-0.8.4}/.gitattributes +0 -0
  49. {datachain-0.8.3 → datachain-0.8.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  50. {datachain-0.8.3 → datachain-0.8.4}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  51. {datachain-0.8.3 → datachain-0.8.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  52. {datachain-0.8.3 → datachain-0.8.4}/.github/codecov.yaml +0 -0
  53. {datachain-0.8.3 → datachain-0.8.4}/.github/dependabot.yml +0 -0
  54. {datachain-0.8.3 → datachain-0.8.4}/.github/workflows/benchmarks.yml +0 -0
  55. {datachain-0.8.3 → datachain-0.8.4}/.github/workflows/release.yml +0 -0
  56. {datachain-0.8.3 → datachain-0.8.4}/.github/workflows/update-template.yaml +0 -0
  57. {datachain-0.8.3 → datachain-0.8.4}/.gitignore +0 -0
  58. {datachain-0.8.3 → datachain-0.8.4}/CODE_OF_CONDUCT.rst +0 -0
  59. {datachain-0.8.3 → datachain-0.8.4}/LICENSE +0 -0
  60. {datachain-0.8.3 → datachain-0.8.4}/README.rst +0 -0
  61. {datachain-0.8.3 → datachain-0.8.4}/docs/assets/captioned_cartoons.png +0 -0
  62. {datachain-0.8.3 → datachain-0.8.4}/docs/assets/datachain-white.svg +0 -0
  63. {datachain-0.8.3 → datachain-0.8.4}/docs/assets/datachain.svg +0 -0
  64. {datachain-0.8.3 → datachain-0.8.4}/docs/contributing.md +0 -0
  65. {datachain-0.8.3 → datachain-0.8.4}/docs/css/github-permalink-style.css +0 -0
  66. {datachain-0.8.3 → datachain-0.8.4}/docs/examples.md +0 -0
  67. {datachain-0.8.3 → datachain-0.8.4}/docs/index.md +0 -0
  68. {datachain-0.8.3 → datachain-0.8.4}/docs/overrides/main.html +0 -0
  69. {datachain-0.8.3 → datachain-0.8.4}/docs/quick-start.md +0 -0
  70. {datachain-0.8.3 → datachain-0.8.4}/docs/references/datachain.md +0 -0
  71. {datachain-0.8.3 → datachain-0.8.4}/docs/references/datatype.md +0 -0
  72. {datachain-0.8.3 → datachain-0.8.4}/docs/references/file.md +0 -0
  73. {datachain-0.8.3 → datachain-0.8.4}/docs/references/index.md +0 -0
  74. {datachain-0.8.3 → datachain-0.8.4}/docs/references/sql.md +0 -0
  75. {datachain-0.8.3 → datachain-0.8.4}/docs/references/torch.md +0 -0
  76. {datachain-0.8.3 → datachain-0.8.4}/docs/references/udf.md +0 -0
  77. {datachain-0.8.3 → datachain-0.8.4}/docs/tutorials.md +0 -0
  78. {datachain-0.8.3 → datachain-0.8.4}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  79. {datachain-0.8.3 → datachain-0.8.4}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  80. {datachain-0.8.3 → datachain-0.8.4}/examples/computer_vision/openimage-detect.py +0 -0
  81. {datachain-0.8.3 → datachain-0.8.4}/examples/computer_vision/ultralytics-bbox.py +0 -0
  82. {datachain-0.8.3 → datachain-0.8.4}/examples/computer_vision/ultralytics-pose.py +0 -0
  83. {datachain-0.8.3 → datachain-0.8.4}/examples/computer_vision/ultralytics-segment.py +0 -0
  84. {datachain-0.8.3 → datachain-0.8.4}/examples/get_started/common_sql_functions.py +0 -0
  85. {datachain-0.8.3 → datachain-0.8.4}/examples/get_started/json-csv-reader.py +0 -0
  86. {datachain-0.8.3 → datachain-0.8.4}/examples/get_started/torch-loader.py +0 -0
  87. {datachain-0.8.3 → datachain-0.8.4}/examples/get_started/udfs/parallel.py +0 -0
  88. {datachain-0.8.3 → datachain-0.8.4}/examples/get_started/udfs/simple.py +0 -0
  89. {datachain-0.8.3 → datachain-0.8.4}/examples/get_started/udfs/stateful.py +0 -0
  90. {datachain-0.8.3 → datachain-0.8.4}/examples/llm_and_nlp/claude-query.py +0 -0
  91. {datachain-0.8.3 → datachain-0.8.4}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  92. {datachain-0.8.3 → datachain-0.8.4}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  93. {datachain-0.8.3 → datachain-0.8.4}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  94. {datachain-0.8.3 → datachain-0.8.4}/examples/multimodal/clip_inference.py +0 -0
  95. {datachain-0.8.3 → datachain-0.8.4}/examples/multimodal/hf_pipeline.py +0 -0
  96. {datachain-0.8.3 → datachain-0.8.4}/examples/multimodal/openai_image_desc_lib.py +0 -0
  97. {datachain-0.8.3 → datachain-0.8.4}/examples/multimodal/wds.py +0 -0
  98. {datachain-0.8.3 → datachain-0.8.4}/examples/multimodal/wds_filtered.py +0 -0
  99. {datachain-0.8.3 → datachain-0.8.4}/noxfile.py +0 -0
  100. {datachain-0.8.3 → datachain-0.8.4}/setup.cfg +0 -0
  101. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/__init__.py +0 -0
  102. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/__main__.py +0 -0
  103. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/asyn.py +0 -0
  104. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/cache.py +0 -0
  105. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/catalog/__init__.py +0 -0
  106. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/catalog/catalog.py +0 -0
  107. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/catalog/datasource.py +0 -0
  108. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/catalog/loader.py +0 -0
  109. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/client/__init__.py +0 -0
  110. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/client/azure.py +0 -0
  111. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/client/fileslice.py +0 -0
  112. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/client/gcs.py +0 -0
  113. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/client/hf.py +0 -0
  114. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/client/local.py +0 -0
  115. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/client/s3.py +0 -0
  116. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/config.py +0 -0
  117. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/data_storage/__init__.py +0 -0
  118. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/data_storage/db_engine.py +0 -0
  119. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/data_storage/job.py +0 -0
  120. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/data_storage/metastore.py +0 -0
  121. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/data_storage/schema.py +0 -0
  122. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/data_storage/serializer.py +0 -0
  123. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/data_storage/sqlite.py +0 -0
  124. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/data_storage/warehouse.py +0 -0
  125. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/dataset.py +0 -0
  126. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/error.py +0 -0
  127. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/func/aggregate.py +0 -0
  128. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/func/array.py +0 -0
  129. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/func/base.py +0 -0
  130. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/func/numeric.py +0 -0
  131. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/func/path.py +0 -0
  132. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/func/random.py +0 -0
  133. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/func/string.py +0 -0
  134. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/func/window.py +0 -0
  135. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/job.py +0 -0
  136. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/__init__.py +0 -0
  137. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/clip.py +0 -0
  138. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/convert/__init__.py +0 -0
  139. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/convert/flatten.py +0 -0
  140. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/convert/python_to_sql.py +0 -0
  141. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/convert/sql_to_python.py +0 -0
  142. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/convert/unflatten.py +0 -0
  143. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  144. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/data_model.py +0 -0
  145. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/dataset_info.py +0 -0
  146. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/diff.py +0 -0
  147. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/hf.py +0 -0
  148. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/image.py +0 -0
  149. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/listing_info.py +0 -0
  150. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/meta_formats.py +0 -0
  151. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/model_store.py +0 -0
  152. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/pytorch.py +0 -0
  153. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/settings.py +0 -0
  154. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/tar.py +0 -0
  155. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/text.py +0 -0
  156. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/udf.py +0 -0
  157. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/udf_signature.py +0 -0
  158. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/utils.py +0 -0
  159. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/vfile.py +0 -0
  160. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/webdataset.py +0 -0
  161. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/lib/webdataset_laion.py +0 -0
  162. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/listing.py +0 -0
  163. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/model/__init__.py +0 -0
  164. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/model/bbox.py +0 -0
  165. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/model/pose.py +0 -0
  166. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/model/segment.py +0 -0
  167. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/model/ultralytics/__init__.py +0 -0
  168. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/model/ultralytics/bbox.py +0 -0
  169. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/model/ultralytics/pose.py +0 -0
  170. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/model/ultralytics/segment.py +0 -0
  171. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/node.py +0 -0
  172. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/nodes_fetcher.py +0 -0
  173. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/nodes_thread_pool.py +0 -0
  174. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/py.typed +0 -0
  175. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/query/__init__.py +0 -0
  176. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/query/batch.py +0 -0
  177. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/query/dataset.py +0 -0
  178. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/query/dispatch.py +0 -0
  179. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/query/metrics.py +0 -0
  180. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/query/params.py +0 -0
  181. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/query/queue.py +0 -0
  182. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/query/schema.py +0 -0
  183. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/query/session.py +0 -0
  184. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/query/udf.py +0 -0
  185. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/query/utils.py +0 -0
  186. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/remote/__init__.py +0 -0
  187. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/remote/studio.py +0 -0
  188. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/__init__.py +0 -0
  189. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/default/__init__.py +0 -0
  190. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/default/base.py +0 -0
  191. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/functions/__init__.py +0 -0
  192. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/functions/aggregate.py +0 -0
  193. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/functions/array.py +0 -0
  194. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/functions/conditional.py +0 -0
  195. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/functions/numeric.py +0 -0
  196. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/functions/path.py +0 -0
  197. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/functions/random.py +0 -0
  198. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/functions/string.py +0 -0
  199. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/selectable.py +0 -0
  200. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/sqlite/__init__.py +0 -0
  201. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/sqlite/base.py +0 -0
  202. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/sqlite/types.py +0 -0
  203. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/sqlite/vector.py +0 -0
  204. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/types.py +0 -0
  205. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/sql/utils.py +0 -0
  206. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/telemetry.py +0 -0
  207. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/toolkit/__init__.py +0 -0
  208. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/toolkit/split.py +0 -0
  209. {datachain-0.8.3 → datachain-0.8.4}/src/datachain/torch/__init__.py +0 -0
  210. {datachain-0.8.3 → datachain-0.8.4}/src/datachain.egg-info/dependency_links.txt +0 -0
  211. {datachain-0.8.3 → datachain-0.8.4}/src/datachain.egg-info/entry_points.txt +0 -0
  212. {datachain-0.8.3 → datachain-0.8.4}/src/datachain.egg-info/top_level.txt +0 -0
  213. {datachain-0.8.3 → datachain-0.8.4}/tests/__init__.py +0 -0
  214. {datachain-0.8.3 → datachain-0.8.4}/tests/benchmarks/__init__.py +0 -0
  215. {datachain-0.8.3 → datachain-0.8.4}/tests/benchmarks/conftest.py +0 -0
  216. {datachain-0.8.3 → datachain-0.8.4}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  217. {datachain-0.8.3 → datachain-0.8.4}/tests/benchmarks/datasets/.dvc/config +0 -0
  218. {datachain-0.8.3 → datachain-0.8.4}/tests/benchmarks/datasets/.gitignore +0 -0
  219. {datachain-0.8.3 → datachain-0.8.4}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  220. {datachain-0.8.3 → datachain-0.8.4}/tests/benchmarks/test_datachain.py +0 -0
  221. {datachain-0.8.3 → datachain-0.8.4}/tests/benchmarks/test_ls.py +0 -0
  222. {datachain-0.8.3 → datachain-0.8.4}/tests/benchmarks/test_version.py +0 -0
  223. {datachain-0.8.3 → datachain-0.8.4}/tests/data.py +0 -0
  224. {datachain-0.8.3 → datachain-0.8.4}/tests/examples/__init__.py +0 -0
  225. {datachain-0.8.3 → datachain-0.8.4}/tests/examples/test_examples.py +0 -0
  226. {datachain-0.8.3 → datachain-0.8.4}/tests/examples/test_wds_e2e.py +0 -0
  227. {datachain-0.8.3 → datachain-0.8.4}/tests/examples/wds_data.py +0 -0
  228. {datachain-0.8.3 → datachain-0.8.4}/tests/func/__init__.py +0 -0
  229. {datachain-0.8.3 → datachain-0.8.4}/tests/func/fake-service-account-credentials.json +0 -0
  230. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_catalog.py +0 -0
  231. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_client.py +0 -0
  232. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_datachain.py +0 -0
  233. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_dataset_query.py +0 -0
  234. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_datasets.py +0 -0
  235. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_feature_pickling.py +0 -0
  236. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_listing.py +0 -0
  237. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_ls.py +0 -0
  238. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_meta_formats.py +0 -0
  239. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_metrics.py +0 -0
  240. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_pull.py +0 -0
  241. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_pytorch.py +0 -0
  242. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_query.py +0 -0
  243. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_session.py +0 -0
  244. {datachain-0.8.3 → datachain-0.8.4}/tests/func/test_toolkit.py +0 -0
  245. {datachain-0.8.3 → datachain-0.8.4}/tests/scripts/feature_class.py +0 -0
  246. {datachain-0.8.3 → datachain-0.8.4}/tests/scripts/feature_class_exception.py +0 -0
  247. {datachain-0.8.3 → datachain-0.8.4}/tests/scripts/feature_class_parallel.py +0 -0
  248. {datachain-0.8.3 → datachain-0.8.4}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  249. {datachain-0.8.3 → datachain-0.8.4}/tests/scripts/name_len_slow.py +0 -0
  250. {datachain-0.8.3 → datachain-0.8.4}/tests/test_atomicity.py +0 -0
  251. {datachain-0.8.3 → datachain-0.8.4}/tests/test_query_e2e.py +0 -0
  252. {datachain-0.8.3 → datachain-0.8.4}/tests/test_telemetry.py +0 -0
  253. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/__init__.py +0 -0
  254. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/__init__.py +0 -0
  255. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/conftest.py +0 -0
  256. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_clip.py +0 -0
  257. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  258. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_datachain_merge.py +0 -0
  259. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_diff.py +0 -0
  260. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_feature.py +0 -0
  261. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_feature_utils.py +0 -0
  262. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_file.py +0 -0
  263. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_hf.py +0 -0
  264. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_image.py +0 -0
  265. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_listing_info.py +0 -0
  266. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_models.py +0 -0
  267. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_schema.py +0 -0
  268. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_sql_to_python.py +0 -0
  269. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_text.py +0 -0
  270. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_udf_signature.py +0 -0
  271. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_utils.py +0 -0
  272. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/lib/test_webdataset.py +0 -0
  273. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/sql/__init__.py +0 -0
  274. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/sql/sqlite/__init__.py +0 -0
  275. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/sql/sqlite/test_types.py +0 -0
  276. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/sql/sqlite/test_utils.py +0 -0
  277. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/sql/test_array.py +0 -0
  278. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/sql/test_path.py +0 -0
  279. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/sql/test_random.py +0 -0
  280. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/sql/test_selectable.py +0 -0
  281. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/sql/test_string.py +0 -0
  282. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_asyn.py +0 -0
  283. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_cache.py +0 -0
  284. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_catalog.py +0 -0
  285. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_catalog_loader.py +0 -0
  286. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_client.py +0 -0
  287. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_client_gcs.py +0 -0
  288. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_client_s3.py +0 -0
  289. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_data_storage.py +0 -0
  290. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_database_engine.py +0 -0
  291. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_dataset.py +0 -0
  292. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_dispatch.py +0 -0
  293. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_fileslice.py +0 -0
  294. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_listing.py +0 -0
  295. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_metastore.py +0 -0
  296. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_module_exports.py +0 -0
  297. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_query.py +0 -0
  298. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_query_metrics.py +0 -0
  299. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_query_params.py +0 -0
  300. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_serializer.py +0 -0
  301. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_session.py +0 -0
  302. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_utils.py +0 -0
  303. {datachain-0.8.3 → datachain-0.8.4}/tests/unit/test_warehouse.py +0 -0
  304. {datachain-0.8.3 → datachain-0.8.4}/tests/utils.py +0 -0
@@ -32,7 +32,7 @@ jobs:
32
32
  POSTGRES_DB: database
33
33
  POSTGRES_HOST_AUTH_METHOD: trust
34
34
  clickhouse:
35
- image: clickhouse/clickhouse-server:24.6
35
+ image: clickhouse/clickhouse-server:24.8
36
36
  ports:
37
37
  - 8123:8123
38
38
  - 9010:9000
@@ -138,7 +138,7 @@ jobs:
138
138
  matrix:
139
139
  os: [ubuntu-latest, windows-latest]
140
140
  pyv: ['3.9', '3.12']
141
- group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
141
+ group: ['get_started', 'computer_vision', 'llm_and_nlp', 'multimodal']
142
142
  exclude:
143
143
  - {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
144
144
  - {os: ubuntu-latest, pyv: '3.12', group: 'multimodal'}
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.8.4'
27
+ rev: 'v0.8.6'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.8.3
3
+ Version: 0.8.4
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -50,7 +50,7 @@ Requires-Dist: websockets
50
50
  Provides-Extra: docs
51
51
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
52
52
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
53
- Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
53
+ Requires-Dist: mkdocs-material==9.5.22; extra == "docs"
54
54
  Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
55
55
  Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
56
56
  Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
@@ -84,7 +84,7 @@ Requires-Dist: requests-mock; extra == "tests"
84
84
  Requires-Dist: scipy; extra == "tests"
85
85
  Provides-Extra: dev
86
86
  Requires-Dist: datachain[docs,tests]; extra == "dev"
87
- Requires-Dist: mypy==1.14.0; extra == "dev"
87
+ Requires-Dist: mypy==1.14.1; extra == "dev"
88
88
  Requires-Dist: types-python-dateutil; extra == "dev"
89
89
  Requires-Dist: types-pytz; extra == "dev"
90
90
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -95,11 +95,11 @@ Requires-Dist: datachain[tests]; extra == "examples"
95
95
  Requires-Dist: defusedxml; extra == "examples"
96
96
  Requires-Dist: accelerate; extra == "examples"
97
97
  Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
98
- Requires-Dist: unstructured[pdf]; extra == "examples"
98
+ Requires-Dist: unstructured[pdf]<0.16.12; extra == "examples"
99
99
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
100
100
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
101
101
  Requires-Dist: onnx==1.16.1; extra == "examples"
102
- Requires-Dist: ultralytics==8.3.53; extra == "examples"
102
+ Requires-Dist: ultralytics==8.3.55; extra == "examples"
103
103
 
104
104
  ================
105
105
  |logo| DataChain
@@ -136,6 +136,7 @@ plugins:
136
136
  show_root_heading: true
137
137
  show_signature_annotations: true
138
138
  show_symbol_type_heading: true
139
+ show_symbol_type_toc: true
139
140
  signature_crossrefs: true
140
141
  import:
141
142
  - https://docs.python.org/3/objects.inv
@@ -56,7 +56,7 @@ dependencies = [
56
56
  docs = [
57
57
  "mkdocs>=1.5.2",
58
58
  "mkdocs-gen-files>=0.5.0",
59
- "mkdocs-material>=9.3.1",
59
+ "mkdocs-material==9.5.22",
60
60
  "mkdocs-section-index>=0.3.6",
61
61
  "mkdocstrings-python>=1.6.3",
62
62
  "mkdocs-literate-nav>=0.6.1"
@@ -96,7 +96,7 @@ tests = [
96
96
  ]
97
97
  dev = [
98
98
  "datachain[docs,tests]",
99
- "mypy==1.14.0",
99
+ "mypy==1.14.1",
100
100
  "types-python-dateutil",
101
101
  "types-pytz",
102
102
  "types-PyYAML",
@@ -108,11 +108,11 @@ examples = [
108
108
  "defusedxml",
109
109
  "accelerate",
110
110
  "unstructured_ingest[embed-huggingface]",
111
- "unstructured[pdf]",
111
+ "unstructured[pdf]<0.16.12",
112
112
  "pdfplumber==0.11.4",
113
113
  "huggingface_hub[hf_transfer]",
114
114
  "onnx==1.16.1",
115
- "ultralytics==8.3.53"
115
+ "ultralytics==8.3.55"
116
116
  ]
117
117
 
118
118
  [project.urls]
@@ -0,0 +1,311 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+ import traceback
5
+ from multiprocessing import freeze_support
6
+ from typing import Optional
7
+
8
+ from datachain.cli.utils import get_logging_level
9
+ from datachain.telemetry import telemetry
10
+
11
+ from .commands import (
12
+ clear_cache,
13
+ completion,
14
+ dataset_stats,
15
+ du,
16
+ edit_dataset,
17
+ garbage_collect,
18
+ index,
19
+ list_datasets,
20
+ ls,
21
+ query,
22
+ rm_dataset,
23
+ show,
24
+ )
25
+ from .parser import get_parser
26
+
27
+ logger = logging.getLogger("datachain")
28
+
29
+
30
def main(argv: Optional[list[str]] = None) -> int:
    """Entry point of the ``datachain`` command line interface.

    Parses ``argv``, configures the ``datachain`` logger, runs the selected
    command and reports the invocation through ``telemetry.send_cli_call``.

    Args:
        argv: Arguments to parse; forwarded unchanged to the parser's
            ``parse_args``.

    Returns:
        The exit code produced by the command handler or by one of the
        error handlers.
    """
    from datachain.catalog import get_catalog

    # Required for Windows multiprocessing support
    freeze_support()

    args = get_parser().parse_args(argv)

    # Internal UDF entry points bypass logging setup and telemetry entirely.
    if args.command in ("internal-run-udf", "internal-run-udf-worker"):
        return handle_udf(args.command)

    logger.addHandler(logging.StreamHandler())
    level = get_logging_level(args)
    logger.setLevel(level)

    client_config = dict(
        aws_endpoint_url=args.aws_endpoint_url,
        anon=args.anon,
    )

    if args.debug_sql:
        # This also sets this environment variable for any subprocesses
        os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"

    # Stays None unless one of the handlers below records a failure.
    error = None

    try:
        catalog = get_catalog(client_config=client_config)
        exit_code = handle_command(args, catalog, client_config)
    except BrokenPipeError as exc:
        error, exit_code = handle_broken_pipe_error(exc)
    except (KeyboardInterrupt, Exception) as exc:
        error, exit_code = handle_general_exception(exc, args, level)
    finally:
        # Runs on every path, including exceptions that are not caught above.
        telemetry.send_cli_call(args.command, error=error)
    return exit_code
68
+
69
+
70
def handle_command(args, catalog, client_config) -> int:
    """Dispatch ``args.command`` to its handler and return an exit code.

    Returns:
        0 once the handler of a known command has finished (the handler's
        own return value is discarded), or 1 after printing a message to
        stderr when the command is not recognised. Exceptions raised by a
        handler propagate to the caller.
    """
    from datachain.studio import process_jobs_args, process_studio_cli_args

    def run_dataset():
        return handle_dataset_command(args, catalog)

    # Every entry is a zero-argument callable, so only the selected command
    # reads its attributes from ``args``.
    dispatch = {
        "cp": lambda: handle_cp_command(args, catalog),
        "clone": lambda: handle_clone_command(args, catalog),
        "dataset": run_dataset,
        "ds": run_dataset,
        "ls": lambda: handle_ls_command(args, client_config),
        "show": lambda: handle_show_command(args, catalog),
        "du": lambda: handle_du_command(args, catalog, client_config),
        "find": lambda: handle_find_command(args, catalog),
        "index": lambda: handle_index_command(args, catalog),
        "completion": lambda: handle_completion_command(args),
        "query": lambda: handle_query_command(args, catalog),
        "clear-cache": lambda: clear_cache(catalog),
        "gc": lambda: garbage_collect(catalog),
        "studio": lambda: process_studio_cli_args(args),
        "job": lambda: process_jobs_args(args),
    }

    command = args.command
    if command not in dispatch:
        print(f"invalid command: {command}", file=sys.stderr)
        return 1

    dispatch[command]()
    return 0
98
+
99
+
100
def handle_cp_command(args, catalog):
    """Run ``catalog.cp`` for the ``cp`` command.

    ``force``, ``update`` and ``recursive`` are coerced to ``bool``; the
    edatachain-related options are fixed (no file, not edatachain-only, and
    ``no_edatachain_file=True``).
    """
    options = {
        "force": bool(args.force),
        "update": bool(args.update),
        "recursive": bool(args.recursive),
        "edatachain_file": None,
        "edatachain_only": False,
        "no_edatachain_file": True,
        "no_glob": args.no_glob,
    }
    catalog.cp(args.sources, args.output, **options)
112
+
113
+
114
def handle_clone_command(args, catalog):
    """Run ``catalog.clone`` for the ``clone`` command.

    ``force``, ``update`` and ``recursive`` are coerced to ``bool``; the
    remaining options are forwarded from ``args`` unchanged.
    """
    options = {
        "force": bool(args.force),
        "update": bool(args.update),
        "recursive": bool(args.recursive),
        "no_glob": args.no_glob,
        "no_cp": args.no_cp,
        "edatachain": args.edatachain,
        "edatachain_file": args.edatachain_file,
    }
    catalog.clone(args.sources, args.output, **options)
126
+
127
+
128
def handle_dataset_command(args, catalog):
    """Run the dataset sub-command named by ``args.datasets_cmd``.

    Recognised sub-commands are ``pull``, ``edit``, ``ls``, ``rm`` (alias
    ``remove``) and ``stats``. Only the attributes needed by the selected
    sub-command are read from ``args``.

    Returns:
        Whatever the underlying call returns.

    Raises:
        Exception: If ``args.datasets_cmd`` is not a recognised sub-command.
    """
    subcommand = args.datasets_cmd

    if subcommand == "pull":
        return catalog.pull_dataset(
            args.dataset,
            args.output,
            local_ds_name=args.local_name,
            local_ds_version=args.local_version,
            cp=args.cp,
            force=bool(args.force),
            edatachain=args.edatachain,
            edatachain_file=args.edatachain_file,
        )

    if subcommand == "edit":
        return edit_dataset(
            catalog,
            args.name,
            new_name=args.new_name,
            description=args.description,
            labels=args.labels,
            studio=args.studio,
            local=args.local,
            all=args.all,
            team=args.team,
        )

    if subcommand == "ls":
        return list_datasets(
            catalog=catalog,
            studio=args.studio,
            local=args.local,
            all=args.all,
            team=args.team,
        )

    # "remove" is an alias of "rm".
    if subcommand in ("rm", "remove"):
        return rm_dataset(
            catalog,
            args.name,
            version=args.version,
            force=args.force,
            studio=args.studio,
            local=args.local,
            all=args.all,
            team=args.team,
        )

    if subcommand == "stats":
        return dataset_stats(
            catalog,
            args.name,
            args.version,
            show_bytes=args.bytes,
            si=args.si,
        )

    raise Exception(f"Unexpected command {args.datasets_cmd}")
191
+
192
+
193
def handle_ls_command(args, client_config):
    """Call ``ls`` for the ``ls`` command with options taken from ``args``.

    ``long`` and ``update`` are coerced to ``bool``; ``client_config`` is
    forwarded as given.
    """
    options = {
        "long": bool(args.long),
        "studio": args.studio,
        "local": args.local,
        "all": args.all,
        "team": args.team,
        "update": bool(args.update),
        "client_config": client_config,
    }
    ls(args.sources, **options)
204
+
205
+
206
def handle_show_command(args, catalog):
    """Call ``show`` for the dataset ``args.name`` / ``args.version``.

    The ``limit``, ``offset``, ``columns``, ``no_collapse`` and ``schema``
    options are forwarded from ``args`` unchanged.
    """
    options = {
        "limit": args.limit,
        "offset": args.offset,
        "columns": args.columns,
        "no_collapse": args.no_collapse,
        "schema": args.schema,
    }
    show(catalog, args.name, args.version, **options)
217
+
218
+
219
def handle_du_command(args, catalog, client_config):
    """Call ``du`` for ``args.sources`` with options taken from ``args``.

    ``update`` is coerced to ``bool``; ``client_config`` is forwarded as
    given.
    """
    options = {
        "show_bytes": args.bytes,
        "depth": args.depth,
        "si": args.si,
        "update": bool(args.update),
        "client_config": client_config,
    }
    du(catalog, args.sources, **options)
229
+
230
+
231
def handle_find_command(args, catalog):
    """Print every result of ``catalog.find``, or ``No results`` if none.

    Results are printed one per line as they are produced by the iterable
    that ``catalog.find`` returns.
    """
    matches = catalog.find(
        args.sources,
        update=bool(args.update),
        names=args.name,
        inames=args.iname,
        paths=args.path,
        ipaths=args.ipath,
        size=args.size,
        typ=args.type,
        columns=args.columns,
    )

    # ``printed`` keeps the running count; it stays 0 when nothing is yielded.
    printed = 0
    for printed, match in enumerate(matches, start=1):
        print(match)

    if printed == 0:
        print("No results")
248
+
249
+
250
def handle_index_command(args, catalog):
    """Call ``index`` for ``args.sources``; ``update`` is coerced to ``bool``."""
    refresh = bool(args.update)
    index(catalog, args.sources, update=refresh)
256
+
257
+
258
def handle_completion_command(args):
    """Print whatever ``completion`` returns for ``args.shell``."""
    script = completion(args.shell)
    print(script)
260
+
261
+
262
def handle_query_command(args, catalog):
    """Call ``query`` for ``args.script``.

    ``args.parallel`` and ``args.param`` are forwarded as the ``parallel``
    and ``params`` keyword arguments.
    """
    options = {"parallel": args.parallel, "params": args.param}
    query(catalog, args.script, **options)
269
+
270
+
271
def handle_broken_pipe_error(exc):
    """Point stdout at the null device after a broken pipe.

    Args:
        exc: The ``BrokenPipeError`` that was caught.

    Returns:
        A ``(error, return_code)`` tuple: ``str(exc)`` and 141, i.e.
        128 + 13 (SIGPIPE).
    """
    # Python flushes standard streams on exit; redirect remaining output
    # to devnull to avoid another BrokenPipeError at shutdown
    # See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
    error = str(exc)
    # Resolve the target first so that nothing is opened (and leaked) when
    # stdout has no underlying file descriptor.
    stdout_fd = sys.stdout.fileno()
    devnull = os.open(os.devnull, os.O_WRONLY)
    try:
        os.dup2(devnull, stdout_fd)
    finally:
        # After dup2 stdout owns its own duplicate, so the temporary
        # descriptor is no longer needed. Keep it only in the corner case
        # where os.open handed back the stdout descriptor itself.
        if devnull != stdout_fd:
            os.close(devnull)
    return error, 141  # 128 + 13 (SIGPIPE)
279
+
280
+
281
def handle_general_exception(exc, args, logging_level):
    """Report a failed command on stderr and optionally start a debugger.

    Args:
        exc: The exception (or ``KeyboardInterrupt``) that ended the command.
        args: Parsed CLI arguments; only ``args.pdb`` is read.
        logging_level: Effective logging level; at ``logging.DEBUG`` or
            lower the full traceback is printed as well.

    Returns:
        A ``(error, return_code)`` tuple: ``str(exc)`` and 1.
    """
    error = str(exc)
    if isinstance(exc, KeyboardInterrupt):
        msg = "Operation cancelled by the user"
    else:
        msg = error
    print("Error:", msg, file=sys.stderr)

    if logging_level <= logging.DEBUG:
        traceback.print_exception(
            type(exc),
            exc,
            exc.__traceback__,
            file=sys.stderr,
        )

    if args.pdb:
        import pdb  # noqa: T100

        # Hand over the traceback explicitly: without an argument
        # post_mortem() only works while an exception is being handled,
        # which would tie this helper to being called from an ``except``
        # block.
        pdb.post_mortem(exc.__traceback__)
    return error, 1
300
+
301
+
302
def handle_udf(command):
    """Run the internal UDF entry point that matches ``command``.

    Returns:
        The result of ``udf_entrypoint()`` for ``"internal-run-udf"``, of
        ``udf_worker_entrypoint()`` for ``"internal-run-udf-worker"``, and
        ``None`` for any other value.
    """
    entrypoint = None
    # Imports stay local so the dispatch module is only loaded when needed.
    if command == "internal-run-udf":
        from datachain.query.dispatch import udf_entrypoint as entrypoint
    elif command == "internal-run-udf-worker":
        from datachain.query.dispatch import udf_worker_entrypoint as entrypoint

    if entrypoint is None:
        return None
    return entrypoint()
@@ -0,0 +1,29 @@
1
+ from .datasets import (
2
+ dataset_stats,
3
+ edit_dataset,
4
+ list_datasets,
5
+ list_datasets_local,
6
+ rm_dataset,
7
+ )
8
+ from .du import du
9
+ from .index import index
10
+ from .ls import ls
11
+ from .misc import clear_cache, completion, garbage_collect
12
+ from .query import query
13
+ from .show import show
14
+
15
+ __all__ = [
16
+ "clear_cache",
17
+ "completion",
18
+ "dataset_stats",
19
+ "du",
20
+ "edit_dataset",
21
+ "garbage_collect",
22
+ "index",
23
+ "list_datasets",
24
+ "list_datasets_local",
25
+ "ls",
26
+ "query",
27
+ "rm_dataset",
28
+ "show",
29
+ ]
@@ -0,0 +1,129 @@
1
+ import sys
2
+ from typing import TYPE_CHECKING, Optional
3
+
4
+ from tabulate import tabulate
5
+
6
+ from datachain import utils
7
+
8
+ if TYPE_CHECKING:
9
+ from datachain.catalog import Catalog
10
+
11
+ from datachain.cli.utils import determine_flavors
12
+ from datachain.config import Config
13
+ from datachain.error import DatasetNotFoundError
14
+
15
+
16
def list_datasets(
    catalog: "Catalog",
    studio: bool = False,
    local: bool = False,
    all: bool = True,
    team: Optional[str] = None,
):
    """Print a table of dataset versions found locally and/or in Studio.

    Args:
        catalog: Catalog whose local datasets are listed.
        studio: Requested "Studio" flavor (normalised by
            ``determine_flavors``).
        local: Requested "local" flavor (normalised by ``determine_flavors``).
        all: Requested "all" flavor (normalised by ``determine_flavors``).
        team: Passed to the Studio listing call as ``team``.

    Studio is only queried when a Studio token is present in the config. The
    ``Studio``/``Local`` marker columns are shown only when both sides are in
    scope and a token is configured.
    """
    # Local import; inside this function the name refers to the Studio
    # listing helper, not to this function.
    from datachain.studio import list_datasets

    token = Config().read().get("studio", {}).get("token")
    all, local, studio = determine_flavors(studio, local, all, token)

    local_datasets = set(list_datasets_local(catalog)) if all or local else set()
    studio_datasets = (
        set(list_datasets(team=team)) if (all or studio) and token else set()
    )

    show_both = (all or (local and studio)) and token

    # Sets have no defined iteration order, so sort the (name, version) pairs
    # to keep the printed table stable from one run to the next.
    rows = [
        _datasets_tabulate_row(
            name=name,
            version=version,
            both=show_both,
            local=(name, version) in local_datasets,
            studio=(name, version) in studio_datasets,
        )
        for name, version in sorted(local_datasets | studio_datasets)
    ]

    print(tabulate(rows, headers="keys"))
45
+
46
+
47
def list_datasets_local(catalog: "Catalog"):
    """Yield ``(name, version)`` for each version of each local dataset.

    Datasets come from ``catalog.ls_datasets()``; a dataset without versions
    yields nothing.
    """
    for dataset in catalog.ls_datasets():
        yield from ((dataset.name, entry.version) for entry in dataset.versions)
51
+
52
+
53
+ def _datasets_tabulate_row(name, version, both, local, studio):
54
+ row = {
55
+ "Name": name,
56
+ "Version": version,
57
+ }
58
+ if both:
59
+ row["Studio"] = "\u2714" if studio else "\u2716"
60
+ row["Local"] = "\u2714" if local else "\u2716"
61
+ return row
62
+
63
+
64
def rm_dataset(
    catalog: "Catalog",
    name: str,
    version: Optional[int] = None,
    force: Optional[bool] = False,
    studio: bool = False,
    local: bool = False,
    all: bool = True,
    team: Optional[str] = None,
):
    """Remove a dataset locally and/or from Studio.

    The ``studio``/``local``/``all`` flags are first normalised by
    ``determine_flavors``. A dataset missing locally is reported on stderr
    and does not stop the Studio removal. Studio is only contacted when a
    Studio token is configured.
    """
    from datachain.studio import remove_studio_dataset

    studio_config = Config().read().get("studio", {})
    token = studio_config.get("token")
    all, local, studio = determine_flavors(studio, local, all, token)

    if local or all:
        try:
            catalog.remove_dataset(name, version=version, force=force)
        except DatasetNotFoundError:
            print("Dataset not found in local", file=sys.stderr)

    if not token:
        return
    if all or studio:
        remove_studio_dataset(team, name, version, force)
87
+
88
+
89
def edit_dataset(
    catalog: "Catalog",
    name: str,
    new_name: Optional[str] = None,
    description: Optional[str] = None,
    labels: Optional[list[str]] = None,
    studio: bool = False,
    local: bool = False,
    all: bool = True,
    team: Optional[str] = None,
):
    """Edit a dataset's name, description and labels locally and/or in Studio.

    The ``studio``/``local``/``all`` flags are first normalised by
    ``determine_flavors``. A dataset missing locally is reported on stderr
    and does not stop the Studio edit. Studio is only contacted when a
    Studio token is configured.
    """
    from datachain.studio import edit_studio_dataset

    studio_config = Config().read().get("studio", {})
    token = studio_config.get("token")
    all, local, studio = determine_flavors(studio, local, all, token)

    if local or all:
        try:
            catalog.edit_dataset(name, new_name, description, labels)
        except DatasetNotFoundError:
            print("Dataset not found in local", file=sys.stderr)

    if not token:
        return
    if all or studio:
        edit_studio_dataset(team, name, new_name, description, labels)
113
+
114
+
115
def dataset_stats(
    catalog: "Catalog",
    name: str,
    version: int,
    show_bytes=False,
    si=False,
):
    """Print the object count and total size of a dataset version.

    Nothing is printed when ``catalog.dataset_stats`` returns a falsy value.
    With ``show_bytes`` the raw size is printed; otherwise it is formatted
    with ``utils.sizeof_fmt`` (``si`` is passed through) and right-aligned
    to 7 characters.
    """
    stats = catalog.dataset_stats(name, version)
    if not stats:
        return

    print(f"Number of objects: {stats.num_objects}")

    if show_bytes:
        size_text = stats.size
    else:
        size_text = f"{utils.sizeof_fmt(stats.size, si=si): >7}"
    print(f"Total objects size: {size_text}")
@@ -0,0 +1,14 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ from datachain import utils
4
+
5
+ if TYPE_CHECKING:
6
+ from datachain.catalog import Catalog
7
+
8
+
9
def du(catalog: "Catalog", sources, show_bytes=False, si=False, **kwargs):
    """Print one ``<size> <path>`` line per entry from ``catalog.du``.

    With ``show_bytes`` the size is printed as returned; otherwise it is
    formatted with ``utils.sizeof_fmt`` (``si`` is passed through) and
    right-aligned to 7 characters. Extra keyword arguments go to
    ``catalog.du``.
    """
    for path, size in catalog.du(sources, **kwargs):
        if show_bytes:
            shown = size
        else:
            shown = f"{utils.sizeof_fmt(size, si=si): >7}"
        print(f"{shown} {path}")
@@ -0,0 +1,12 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from datachain.catalog import Catalog
5
+
6
+
7
def index(
    catalog: "Catalog",
    sources,
    **kwargs,
):
    """Call ``catalog.index`` for ``sources`` and discard its result.

    Extra keyword arguments are forwarded to ``catalog.index`` unchanged.
    """
    run_index = catalog.index
    run_index(sources, **kwargs)