datachain 0.7.8__tar.gz → 0.7.9__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Note: this version of datachain has been flagged as a potentially problematic release.

Files changed (280)
  1. {datachain-0.7.8/src/datachain.egg-info → datachain-0.7.9}/PKG-INFO +2 -2
  2. {datachain-0.7.8 → datachain-0.7.9}/examples/llm_and_nlp/hf-dataset-llm-eval.py +6 -3
  3. {datachain-0.7.8 → datachain-0.7.9}/pyproject.toml +1 -1
  4. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/cli.py +9 -3
  5. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/metastore.py +3 -2
  6. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/dc.py +1 -0
  7. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/pytorch.py +54 -37
  8. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/remote/studio.py +44 -25
  9. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/studio.py +2 -2
  10. {datachain-0.7.8 → datachain-0.7.9/src/datachain.egg-info}/PKG-INFO +2 -2
  11. {datachain-0.7.8 → datachain-0.7.9}/src/datachain.egg-info/requires.txt +1 -1
  12. {datachain-0.7.8 → datachain-0.7.9}/tests/conftest.py +1 -1
  13. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_catalog.py +32 -0
  14. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_ls.py +2 -2
  15. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_pull.py +13 -13
  16. {datachain-0.7.8 → datachain-0.7.9}/tests/test_cli_studio.py +4 -2
  17. {datachain-0.7.8 → datachain-0.7.9}/.cruft.json +0 -0
  18. {datachain-0.7.8 → datachain-0.7.9}/.gitattributes +0 -0
  19. {datachain-0.7.8 → datachain-0.7.9}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  20. {datachain-0.7.8 → datachain-0.7.9}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  21. {datachain-0.7.8 → datachain-0.7.9}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  22. {datachain-0.7.8 → datachain-0.7.9}/.github/codecov.yaml +0 -0
  23. {datachain-0.7.8 → datachain-0.7.9}/.github/dependabot.yml +0 -0
  24. {datachain-0.7.8 → datachain-0.7.9}/.github/workflows/benchmarks.yml +0 -0
  25. {datachain-0.7.8 → datachain-0.7.9}/.github/workflows/release.yml +0 -0
  26. {datachain-0.7.8 → datachain-0.7.9}/.github/workflows/tests-studio.yml +0 -0
  27. {datachain-0.7.8 → datachain-0.7.9}/.github/workflows/tests.yml +0 -0
  28. {datachain-0.7.8 → datachain-0.7.9}/.github/workflows/update-template.yaml +0 -0
  29. {datachain-0.7.8 → datachain-0.7.9}/.gitignore +0 -0
  30. {datachain-0.7.8 → datachain-0.7.9}/.pre-commit-config.yaml +0 -0
  31. {datachain-0.7.8 → datachain-0.7.9}/CODE_OF_CONDUCT.rst +0 -0
  32. {datachain-0.7.8 → datachain-0.7.9}/CONTRIBUTING.rst +0 -0
  33. {datachain-0.7.8 → datachain-0.7.9}/LICENSE +0 -0
  34. {datachain-0.7.8 → datachain-0.7.9}/README.rst +0 -0
  35. {datachain-0.7.8 → datachain-0.7.9}/docs/assets/captioned_cartoons.png +0 -0
  36. {datachain-0.7.8 → datachain-0.7.9}/docs/assets/datachain-white.svg +0 -0
  37. {datachain-0.7.8 → datachain-0.7.9}/docs/assets/datachain.svg +0 -0
  38. {datachain-0.7.8 → datachain-0.7.9}/docs/index.md +0 -0
  39. {datachain-0.7.8 → datachain-0.7.9}/docs/overrides/main.html +0 -0
  40. {datachain-0.7.8 → datachain-0.7.9}/docs/references/datachain.md +0 -0
  41. {datachain-0.7.8 → datachain-0.7.9}/docs/references/datatype.md +0 -0
  42. {datachain-0.7.8 → datachain-0.7.9}/docs/references/file.md +0 -0
  43. {datachain-0.7.8 → datachain-0.7.9}/docs/references/index.md +0 -0
  44. {datachain-0.7.8 → datachain-0.7.9}/docs/references/sql.md +0 -0
  45. {datachain-0.7.8 → datachain-0.7.9}/docs/references/torch.md +0 -0
  46. {datachain-0.7.8 → datachain-0.7.9}/docs/references/udf.md +0 -0
  47. {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  48. {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  49. {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/openimage-detect.py +0 -0
  50. {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/ultralytics-bbox.py +0 -0
  51. {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/ultralytics-pose.py +0 -0
  52. {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/ultralytics-segment.py +0 -0
  53. {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/common_sql_functions.py +0 -0
  54. {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/json-csv-reader.py +0 -0
  55. {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/torch-loader.py +0 -0
  56. {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/udfs/parallel.py +0 -0
  57. {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/udfs/simple.py +0 -0
  58. {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/udfs/stateful.py +0 -0
  59. {datachain-0.7.8 → datachain-0.7.9}/examples/llm_and_nlp/claude-query.py +0 -0
  60. {datachain-0.7.8 → datachain-0.7.9}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  61. {datachain-0.7.8 → datachain-0.7.9}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  62. {datachain-0.7.8 → datachain-0.7.9}/examples/multimodal/clip_inference.py +0 -0
  63. {datachain-0.7.8 → datachain-0.7.9}/examples/multimodal/hf_pipeline.py +0 -0
  64. {datachain-0.7.8 → datachain-0.7.9}/examples/multimodal/openai_image_desc_lib.py +0 -0
  65. {datachain-0.7.8 → datachain-0.7.9}/examples/multimodal/wds.py +0 -0
  66. {datachain-0.7.8 → datachain-0.7.9}/examples/multimodal/wds_filtered.py +0 -0
  67. {datachain-0.7.8 → datachain-0.7.9}/mkdocs.yml +0 -0
  68. {datachain-0.7.8 → datachain-0.7.9}/noxfile.py +0 -0
  69. {datachain-0.7.8 → datachain-0.7.9}/setup.cfg +0 -0
  70. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/__init__.py +0 -0
  71. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/__main__.py +0 -0
  72. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/asyn.py +0 -0
  73. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/cache.py +0 -0
  74. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/catalog/__init__.py +0 -0
  75. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/catalog/catalog.py +0 -0
  76. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/catalog/datasource.py +0 -0
  77. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/catalog/loader.py +0 -0
  78. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/cli_utils.py +0 -0
  79. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/__init__.py +0 -0
  80. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/azure.py +0 -0
  81. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/fileslice.py +0 -0
  82. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/fsspec.py +0 -0
  83. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/gcs.py +0 -0
  84. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/hf.py +0 -0
  85. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/local.py +0 -0
  86. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/s3.py +0 -0
  87. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/config.py +0 -0
  88. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/__init__.py +0 -0
  89. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/db_engine.py +0 -0
  90. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/job.py +0 -0
  91. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/schema.py +0 -0
  92. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/serializer.py +0 -0
  93. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/sqlite.py +0 -0
  94. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/warehouse.py +0 -0
  95. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/dataset.py +0 -0
  96. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/error.py +0 -0
  97. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/__init__.py +0 -0
  98. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/aggregate.py +0 -0
  99. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/array.py +0 -0
  100. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/base.py +0 -0
  101. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/conditional.py +0 -0
  102. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/func.py +0 -0
  103. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/numeric.py +0 -0
  104. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/path.py +0 -0
  105. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/random.py +0 -0
  106. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/string.py +0 -0
  107. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/window.py +0 -0
  108. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/job.py +0 -0
  109. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/__init__.py +0 -0
  110. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/arrow.py +0 -0
  111. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/clip.py +0 -0
  112. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/__init__.py +0 -0
  113. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/flatten.py +0 -0
  114. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/python_to_sql.py +0 -0
  115. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/sql_to_python.py +0 -0
  116. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/unflatten.py +0 -0
  117. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  118. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/data_model.py +0 -0
  119. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/dataset_info.py +0 -0
  120. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/file.py +0 -0
  121. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/hf.py +0 -0
  122. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/image.py +0 -0
  123. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/listing.py +0 -0
  124. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/listing_info.py +0 -0
  125. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/meta_formats.py +0 -0
  126. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/model_store.py +0 -0
  127. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/settings.py +0 -0
  128. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/signal_schema.py +0 -0
  129. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/tar.py +0 -0
  130. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/text.py +0 -0
  131. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/udf.py +0 -0
  132. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/udf_signature.py +0 -0
  133. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/utils.py +0 -0
  134. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/vfile.py +0 -0
  135. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/webdataset.py +0 -0
  136. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/webdataset_laion.py +0 -0
  137. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/listing.py +0 -0
  138. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/__init__.py +0 -0
  139. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/bbox.py +0 -0
  140. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/pose.py +0 -0
  141. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/segment.py +0 -0
  142. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/ultralytics/__init__.py +0 -0
  143. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/ultralytics/bbox.py +0 -0
  144. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/ultralytics/pose.py +0 -0
  145. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/ultralytics/segment.py +0 -0
  146. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/node.py +0 -0
  147. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/nodes_fetcher.py +0 -0
  148. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/nodes_thread_pool.py +0 -0
  149. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/progress.py +0 -0
  150. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/py.typed +0 -0
  151. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/__init__.py +0 -0
  152. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/batch.py +0 -0
  153. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/dataset.py +0 -0
  154. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/dispatch.py +0 -0
  155. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/metrics.py +0 -0
  156. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/params.py +0 -0
  157. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/queue.py +0 -0
  158. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/schema.py +0 -0
  159. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/session.py +0 -0
  160. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/remote/__init__.py +0 -0
  161. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/__init__.py +0 -0
  162. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/default/__init__.py +0 -0
  163. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/default/base.py +0 -0
  164. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/__init__.py +0 -0
  165. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/aggregate.py +0 -0
  166. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/array.py +0 -0
  167. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/conditional.py +0 -0
  168. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/numeric.py +0 -0
  169. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/path.py +0 -0
  170. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/random.py +0 -0
  171. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/string.py +0 -0
  172. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/selectable.py +0 -0
  173. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/sqlite/__init__.py +0 -0
  174. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/sqlite/base.py +0 -0
  175. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/sqlite/types.py +0 -0
  176. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/sqlite/vector.py +0 -0
  177. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/types.py +0 -0
  178. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/utils.py +0 -0
  179. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/telemetry.py +0 -0
  180. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/toolkit/__init__.py +0 -0
  181. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/toolkit/split.py +0 -0
  182. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/torch/__init__.py +0 -0
  183. {datachain-0.7.8 → datachain-0.7.9}/src/datachain/utils.py +0 -0
  184. {datachain-0.7.8 → datachain-0.7.9}/src/datachain.egg-info/SOURCES.txt +0 -0
  185. {datachain-0.7.8 → datachain-0.7.9}/src/datachain.egg-info/dependency_links.txt +0 -0
  186. {datachain-0.7.8 → datachain-0.7.9}/src/datachain.egg-info/entry_points.txt +0 -0
  187. {datachain-0.7.8 → datachain-0.7.9}/src/datachain.egg-info/top_level.txt +0 -0
  188. {datachain-0.7.8 → datachain-0.7.9}/tests/__init__.py +0 -0
  189. {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/__init__.py +0 -0
  190. {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/conftest.py +0 -0
  191. {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  192. {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/datasets/.dvc/config +0 -0
  193. {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/datasets/.gitignore +0 -0
  194. {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  195. {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/test_datachain.py +0 -0
  196. {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/test_ls.py +0 -0
  197. {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/test_version.py +0 -0
  198. {datachain-0.7.8 → datachain-0.7.9}/tests/data.py +0 -0
  199. {datachain-0.7.8 → datachain-0.7.9}/tests/examples/__init__.py +0 -0
  200. {datachain-0.7.8 → datachain-0.7.9}/tests/examples/test_examples.py +0 -0
  201. {datachain-0.7.8 → datachain-0.7.9}/tests/examples/test_wds_e2e.py +0 -0
  202. {datachain-0.7.8 → datachain-0.7.9}/tests/examples/wds_data.py +0 -0
  203. {datachain-0.7.8 → datachain-0.7.9}/tests/func/__init__.py +0 -0
  204. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_client.py +0 -0
  205. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_datachain.py +0 -0
  206. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_dataset_query.py +0 -0
  207. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_datasets.py +0 -0
  208. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_feature_pickling.py +0 -0
  209. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_listing.py +0 -0
  210. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_meta_formats.py +0 -0
  211. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_metrics.py +0 -0
  212. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_pytorch.py +0 -0
  213. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_query.py +0 -0
  214. {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_toolkit.py +0 -0
  215. {datachain-0.7.8 → datachain-0.7.9}/tests/scripts/feature_class.py +0 -0
  216. {datachain-0.7.8 → datachain-0.7.9}/tests/scripts/feature_class_exception.py +0 -0
  217. {datachain-0.7.8 → datachain-0.7.9}/tests/scripts/feature_class_parallel.py +0 -0
  218. {datachain-0.7.8 → datachain-0.7.9}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  219. {datachain-0.7.8 → datachain-0.7.9}/tests/scripts/name_len_slow.py +0 -0
  220. {datachain-0.7.8 → datachain-0.7.9}/tests/test_atomicity.py +0 -0
  221. {datachain-0.7.8 → datachain-0.7.9}/tests/test_cli_e2e.py +0 -0
  222. {datachain-0.7.8 → datachain-0.7.9}/tests/test_query_e2e.py +0 -0
  223. {datachain-0.7.8 → datachain-0.7.9}/tests/test_telemetry.py +0 -0
  224. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/__init__.py +0 -0
  225. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/__init__.py +0 -0
  226. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/conftest.py +0 -0
  227. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_arrow.py +0 -0
  228. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_clip.py +0 -0
  229. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_datachain.py +0 -0
  230. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  231. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_datachain_merge.py +0 -0
  232. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_feature.py +0 -0
  233. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_feature_utils.py +0 -0
  234. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_file.py +0 -0
  235. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_hf.py +0 -0
  236. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_image.py +0 -0
  237. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_listing_info.py +0 -0
  238. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_models.py +0 -0
  239. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_schema.py +0 -0
  240. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_signal_schema.py +0 -0
  241. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_sql_to_python.py +0 -0
  242. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_text.py +0 -0
  243. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_udf_signature.py +0 -0
  244. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_utils.py +0 -0
  245. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_webdataset.py +0 -0
  246. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/__init__.py +0 -0
  247. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/sqlite/__init__.py +0 -0
  248. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/sqlite/test_types.py +0 -0
  249. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/sqlite/test_utils.py +0 -0
  250. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_array.py +0 -0
  251. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_conditional.py +0 -0
  252. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_path.py +0 -0
  253. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_random.py +0 -0
  254. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_selectable.py +0 -0
  255. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_string.py +0 -0
  256. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_asyn.py +0 -0
  257. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_cache.py +0 -0
  258. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_catalog.py +0 -0
  259. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_catalog_loader.py +0 -0
  260. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_cli_parsing.py +0 -0
  261. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_client.py +0 -0
  262. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_client_s3.py +0 -0
  263. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_config.py +0 -0
  264. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_data_storage.py +0 -0
  265. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_database_engine.py +0 -0
  266. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_dataset.py +0 -0
  267. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_dispatch.py +0 -0
  268. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_fileslice.py +0 -0
  269. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_func.py +0 -0
  270. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_listing.py +0 -0
  271. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_metastore.py +0 -0
  272. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_module_exports.py +0 -0
  273. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_query.py +0 -0
  274. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_query_metrics.py +0 -0
  275. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_query_params.py +0 -0
  276. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_serializer.py +0 -0
  277. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_session.py +0 -0
  278. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_utils.py +0 -0
  279. {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_warehouse.py +0 -0
  280. {datachain-0.7.8 → datachain-0.7.9}/tests/utils.py +0 -0

{datachain-0.7.8/src/datachain.egg-info → datachain-0.7.9}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.7.8
+Version: 0.7.9
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -98,7 +98,7 @@ Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.29; extra == "examples"
+Requires-Dist: ultralytics==8.3.37; extra == "examples"
 
 ================
 |logo| DataChain

{datachain-0.7.8 → datachain-0.7.9}/examples/llm_and_nlp/hf-dataset-llm-eval.py
@@ -15,9 +15,11 @@ class DialogEval(DataModel):
 
 # DataChain function to evaluate dialog.
 # DataChain is using types for inputs, results to automatically infer schema.
-def eval_dialog(user_input: str, bot_response: str) -> DialogEval:
-    client = InferenceClient("meta-llama/Llama-3.1-70B-Instruct")
-
+def eval_dialog(
+    client: InferenceClient,
+    user_input: str,
+    bot_response: str,
+) -> DialogEval:
     completion = client.chat_completion(
         messages=[
             {
@@ -44,6 +46,7 @@ def eval_dialog(user_input: str, bot_response: str) -> DialogEval:
         "hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv"
     )
     .settings(parallel=10)
+    .setup(client=lambda: InferenceClient("meta-llama/Llama-3.1-70B-Instruct"))
     .map(response=eval_dialog)
     .to_parquet("hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet")
 )
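
The example change above moves client construction out of the UDF and into a per-worker .setup() initializer, so each parallel worker creates one InferenceClient lazily and reuses it for every row. A minimal sketch of that pattern, with the output simplified to a plain string (the CSV reader call and output path here are assumptions, not the full example script):

    from huggingface_hub import InferenceClient

    from datachain import DataChain


    def eval_dialog(client: InferenceClient, user_input: str, bot_response: str) -> str:
        # "client" is injected by .setup(); it is built once per worker, not once per row.
        completion = client.chat_completion(
            messages=[{"role": "user", "content": f"User: {user_input}\nBot: {bot_response}"}],
        )
        return completion.choices[0].message.content


    (
        DataChain.from_csv(  # assumed reader for the CSV shown in the diff
            "hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv"
        )
        .settings(parallel=10)
        .setup(client=lambda: InferenceClient("meta-llama/Llama-3.1-70B-Instruct"))
        .map(response=eval_dialog)
        .to_parquet("dialog-eval.parquet")  # illustrative output path
    )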

{datachain-0.7.8 → datachain-0.7.9}/pyproject.toml
@@ -111,7 +111,7 @@ examples = [
   "pdfplumber==0.11.4",
   "huggingface_hub[hf_transfer]",
   "onnx==1.16.1",
-  "ultralytics==8.3.29"
+  "ultralytics==8.3.37"
 ]
 
 [project.urls]

{datachain-0.7.8 → datachain-0.7.9}/src/datachain/cli.py
@@ -16,7 +16,7 @@ from tabulate import tabulate
 from datachain import Session, utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
 from datachain.config import Config
-from datachain.error import DataChainError
+from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.lib.dc import DataChain
 from datachain.studio import (
     edit_studio_dataset,
@@ -1056,7 +1056,10 @@ def rm_dataset(
     all, local, studio = _determine_flavors(studio, local, all, token)
 
     if all or local:
-        catalog.remove_dataset(name, version=version, force=force)
+        try:
+            catalog.remove_dataset(name, version=version, force=force)
+        except DatasetNotFoundError:
+            print("Dataset not found in local", file=sys.stderr)
 
     if (all or studio) and token:
         remove_studio_dataset(team, name, version, force)
@@ -1077,7 +1080,10 @@ def edit_dataset(
     all, local, studio = _determine_flavors(studio, local, all, token)
 
     if all or local:
-        catalog.edit_dataset(name, new_name, description, labels)
+        try:
+            catalog.edit_dataset(name, new_name, description, labels)
+        except DatasetNotFoundError:
+            print("Dataset not found in local", file=sys.stderr)
 
     if (all or studio) and token:
         edit_studio_dataset(team, name, new_name, description, labels)

{datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/metastore.py
@@ -725,9 +725,10 @@ class AbstractDBMetastore(AbstractMetastore):
 
     def list_datasets(self) -> Iterator["DatasetListRecord"]:
         """Lists all datasets."""
-        yield from self._parse_dataset_list(
-            self.db.execute(self._base_list_datasets_query())
+        query = self._base_list_datasets_query().order_by(
+            self._datasets.c.name, self._datasets_versions.c.version
         )
+        yield from self._parse_dataset_list(self.db.execute(query))
 
     def list_datasets_by_prefix(
         self, prefix: str, conn=None
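
The list_datasets() change above only adds a deterministic sort: datasets now come back ordered by name and then by version. A standalone sketch of the same SQLAlchemy Core pattern (the table and column definitions below are hypothetical stand-ins for the metastore's _datasets and _datasets_versions tables):

    import sqlalchemy as sa

    metadata = sa.MetaData()
    datasets = sa.Table(
        "datasets",
        metadata,
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column("name", sa.Text),
    )
    versions = sa.Table(
        "datasets_versions",
        metadata,
        sa.Column("dataset_id", sa.Integer),
        sa.Column("version", sa.Integer),
    )

    # Join datasets to their versions and return rows in a stable order:
    # by dataset name first, then by version number.
    query = (
        sa.select(datasets.c.name, versions.c.version)
        .select_from(datasets.join(versions, datasets.c.id == versions.c.dataset_id))
        .order_by(datasets.c.name, versions.c.version)
    )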

{datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/dc.py
@@ -1446,6 +1446,7 @@ class DataChain:
             tokenizer=tokenizer,
             tokenizer_kwargs=tokenizer_kwargs,
             num_samples=num_samples,
+            dc_settings=chain._settings,
         )
 
     def remove_file_signals(self) -> "Self":  # noqa: D102

{datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/pytorch.py
@@ -10,8 +10,10 @@ from torchvision.transforms import v2
 from tqdm import tqdm
 
 from datachain import Session
+from datachain.asyn import AsyncMapper
 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
+from datachain.lib.settings import Settings
 from datachain.lib.text import convert_text
 
 if TYPE_CHECKING:
@@ -30,6 +32,8 @@ def label_to_int(value: str, classes: list) -> int:
 
 
 class PytorchDataset(IterableDataset):
+    prefetch: int = 2
+
     def __init__(
         self,
         name: str,
@@ -39,6 +43,7 @@ class PytorchDataset(IterableDataset):
         tokenizer: Optional[Callable] = None,
         tokenizer_kwargs: Optional[dict[str, Any]] = None,
        num_samples: int = 0,
+        dc_settings: Optional[Settings] = None,
     ):
         """
         Pytorch IterableDataset that streams DataChain datasets.
@@ -66,6 +71,11 @@ class PytorchDataset(IterableDataset):
             catalog = get_catalog()
         self._init_catalog(catalog)
 
+        dc_settings = dc_settings or Settings()
+        self.cache = dc_settings.cache
+        if (prefetch := dc_settings.prefetch) is not None:
+            self.prefetch = prefetch
+
     def _init_catalog(self, catalog: "Catalog"):
         # For compatibility with multiprocessing,
         # we can only store params in __init__(), as Catalog isn't picklable
@@ -82,51 +92,58 @@ class PytorchDataset(IterableDataset):
         wh = wh_cls(*wh_args, **wh_kwargs)
         return Catalog(ms, wh, **self._catalog_params)
 
-    def __iter__(self) -> Iterator[Any]:
-        if self.catalog is None:
-            self.catalog = self._get_catalog()
-        session = Session.get(catalog=self.catalog)
-        total_rank, total_workers = self.get_rank_and_workers()
+    def _rows_iter(self, total_rank: int, total_workers: int):
+        catalog = self._get_catalog()
+        session = Session("PyTorch", catalog=catalog)
         ds = DataChain.from_dataset(
             name=self.name, version=self.version, session=session
-        )
+        ).settings(cache=self.cache, prefetch=self.prefetch)
         ds = ds.remove_file_signals()
 
         if self.num_samples > 0:
             ds = ds.sample(self.num_samples)
         ds = ds.chunk(total_rank, total_workers)
+        yield from ds.collect()
+
+    def __iter__(self) -> Iterator[Any]:
+        total_rank, total_workers = self.get_rank_and_workers()
+        rows = self._rows_iter(total_rank, total_workers)
+        if self.prefetch > 0:
+            from datachain.lib.udf import _prefetch_input
+
+            rows = AsyncMapper(_prefetch_input, rows, workers=self.prefetch).iterate()
+
         desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
-        with tqdm(desc=desc, unit=" rows") as pbar:
-            for row_features in ds.collect():
-                row = []
-                for fr in row_features:
-                    if hasattr(fr, "read"):
-                        row.append(fr.read())  # type: ignore[unreachable]
-                    else:
-                        row.append(fr)
-                # Apply transforms
-                if self.transform:
-                    try:
-                        if isinstance(self.transform, v2.Transform):
-                            row = self.transform(row)
-                        for i, val in enumerate(row):
-                            if isinstance(val, Image.Image):
-                                row[i] = self.transform(val)
-                    except ValueError:
-                        logger.warning(
-                            "Skipping transform due to unsupported data types."
-                        )
-                        self.transform = None
-                if self.tokenizer:
-                    for i, val in enumerate(row):
-                        if isinstance(val, str) or (
-                            isinstance(val, list) and isinstance(val[0], str)
-                        ):
-                            row[i] = convert_text(
-                                val, self.tokenizer, self.tokenizer_kwargs
-                            ).squeeze(0)  # type: ignore[union-attr]
-                yield row
-                pbar.update(1)
+        with tqdm(rows, desc=desc, unit=" rows", position=total_rank) as rows_it:
+            yield from map(self._process_row, rows_it)
+
+    def _process_row(self, row_features):
+        row = []
+        for fr in row_features:
+            if hasattr(fr, "read"):
+                row.append(fr.read())  # type: ignore[unreachable]
+            else:
+                row.append(fr)
+        # Apply transforms
+        if self.transform:
+            try:
+                if isinstance(self.transform, v2.Transform):
+                    row = self.transform(row)
+                for i, val in enumerate(row):
+                    if isinstance(val, Image.Image):
+                        row[i] = self.transform(val)
+            except ValueError:
+                logger.warning("Skipping transform due to unsupported data types.")
+                self.transform = None
+        if self.tokenizer:
+            for i, val in enumerate(row):
+                if isinstance(val, str) or (
+                    isinstance(val, list) and isinstance(val[0], str)
+                ):
+                    row[i] = convert_text(
+                        val, self.tokenizer, self.tokenizer_kwargs
+                    ).squeeze(0)  # type: ignore[union-attr]
+        return row
 
     @staticmethod
     def get_rank_and_workers() -> tuple[int, int]:
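
Together with the dc.py hunk above, cache and prefetch settings declared on a chain now reach PytorchDataset, which reads rows through AsyncMapper when prefetch is enabled. A minimal usage sketch (the dataset name, transform, and loader parameters are illustrative, and the chain-level to_pytorch() helper is assumed to forward settings as shown in the dc.py hunk):

    from torch.utils.data import DataLoader
    from torchvision.transforms import v2

    from datachain import DataChain

    transform = v2.Compose([v2.Resize((64, 64)), v2.ToTensor()])

    chain = (
        DataChain.from_dataset(name="fashion-product-images")  # hypothetical dataset name
        .settings(cache=True, prefetch=4)  # forwarded to PytorchDataset as dc_settings
    )

    loader = DataLoader(
        chain.to_pytorch(transform=transform),
        batch_size=16,
        num_workers=2,
    )

    for batch in loader:
        ...  # rows arrive already read (and prefetched) from storage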

{datachain-0.7.8 → datachain-0.7.9}/src/datachain/remote/studio.py
@@ -119,18 +119,27 @@ class StudioClient:
                "\tpip install 'datachain[remote]'"
            ) from None
 
-    def _send_request_msgpack(self, route: str, data: dict[str, Any]) -> Response[Any]:
+    def _send_request_msgpack(
+        self, route: str, data: dict[str, Any], method: Optional[str] = "POST"
+    ) -> Response[Any]:
         import msgpack
         import requests
 
-        response = requests.post(
-            f"{self.url}/{route}",
-            json={**data, "team_name": self.team},
+        kwargs = (
+            {"params": {**data, "team_name": self.team}}
+            if method == "GET"
+            else {"json": {**data, "team_name": self.team}}
+        )
+
+        response = requests.request(
+            method=method,  # type: ignore[arg-type]
+            url=f"{self.url}/{route}",
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"token {self.token}",
             },
             timeout=self.timeout,
+            **kwargs,  # type: ignore[arg-type]
         )
         ok = response.ok
         if not ok:
@@ -148,7 +157,9 @@ class StudioClient:
         return Response(response_data, ok, message)
 
     @retry_with_backoff(retries=5)
-    def _send_request(self, route: str, data: dict[str, Any]) -> Response[Any]:
+    def _send_request(
+        self, route: str, data: dict[str, Any], method: Optional[str] = "POST"
+    ) -> Response[Any]:
         """
         Function that communicate Studio API.
         It will raise an exception, and try to retry, if 5xx status code is
@@ -157,14 +168,21 @@
         """
         import requests
 
-        response = requests.post(
-            f"{self.url}/{route}",
-            json={**data, "team_name": self.team},
+        kwargs = (
+            {"params": {**data, "team_name": self.team}}
+            if method == "GET"
+            else {"json": {**data, "team_name": self.team}}
+        )
+
+        response = requests.request(
+            method=method,  # type: ignore[arg-type]
+            url=f"{self.url}/{route}",
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"token {self.token}",
             },
             timeout=self.timeout,
+            **kwargs,  # type: ignore[arg-type]
         )
         try:
             response.raise_for_status()
@@ -222,7 +240,7 @@
             yield path, response
 
     def ls_datasets(self) -> Response[LsData]:
-        return self._send_request("datachain/ls-datasets", {})
+        return self._send_request("datachain/datasets", {}, method="GET")
 
     def edit_dataset(
         self,
@@ -232,20 +250,14 @@
         labels: Optional[list[str]] = None,
     ) -> Response[DatasetInfoData]:
         body = {
+            "new_name": new_name,
             "dataset_name": name,
+            "description": description,
+            "labels": labels,
         }
 
-        if new_name is not None:
-            body["new_name"] = new_name
-
-        if description is not None:
-            body["description"] = description
-
-        if labels is not None:
-            body["labels"] = labels  # type: ignore[assignment]
-
         return self._send_request(
-            "datachain/edit-dataset",
+            "datachain/datasets",
             body,
         )
 
@@ -256,12 +268,13 @@
         force: Optional[bool] = False,
     ) -> Response[DatasetInfoData]:
         return self._send_request(
-            "datachain/rm-dataset",
+            "datachain/datasets",
             {
                 "dataset_name": name,
                 "version": version,
                 "force": force,
             },
+            method="DELETE",
         )
 
     def dataset_info(self, name: str) -> Response[DatasetInfoData]:
@@ -272,7 +285,9 @@
 
            return dataset_info
 
-        response = self._send_request("datachain/dataset-info", {"dataset_name": name})
+        response = self._send_request(
+            "datachain/datasets/info", {"dataset_name": name}, method="GET"
+        )
         if response.ok:
             response.data = _parse_dataset_info(response.data)
         return response
@@ -282,14 +297,16 @@
     ) -> Response[DatasetRowsData]:
         req_data = {"dataset_name": name, "dataset_version": version}
         return self._send_request_msgpack(
-            "datachain/dataset-rows",
+            "datachain/datasets/rows",
             {**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
+            method="GET",
         )
 
     def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
         response = self._send_request(
-            "datachain/dataset-stats",
+            "datachain/datasets/stats",
             {"dataset_name": name, "dataset_version": version},
+            method="GET",
         )
         if response.ok:
             response.data = DatasetStats(**response.data)
@@ -299,16 +316,18 @@
         self, name: str, version: int
     ) -> Response[DatasetExportSignedUrls]:
         return self._send_request(
-            "datachain/dataset-export",
+            "datachain/datasets/export",
             {"dataset_name": name, "dataset_version": version},
+            method="GET",
        )
 
     def dataset_export_status(
         self, name: str, version: int
     ) -> Response[DatasetExportStatus]:
         return self._send_request(
-            "datachain/dataset-export-status",
+            "datachain/datasets/export-status",
             {"dataset_name": name, "dataset_version": version},
+            method="GET",
        )
 
     def upload_file(self, file_name: str, content: bytes) -> Response[FileUploadData]:
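
With the endpoints consolidated under datachain/datasets and given explicit HTTP verbs, both request helpers now go through requests.request and switch between query parameters and a JSON body depending on the method. A standalone sketch of that dispatch (the base URL and token are placeholders):

    from typing import Any, Optional

    import requests


    def send(route: str, data: dict[str, Any], method: Optional[str] = "POST") -> requests.Response:
        # GET carries the payload as query parameters; POST/DELETE send a JSON body.
        kwargs = {"params": data} if method == "GET" else {"json": data}
        return requests.request(
            method=method,
            url=f"https://studio.example.com/api/{route}",  # placeholder base URL
            headers={"Authorization": "token <studio-token>"},  # placeholder token
            timeout=20,
            **kwargs,
        )


    # send("datachain/datasets", {}, method="GET") lists datasets;
    # send("datachain/datasets", {"dataset_name": "cats"}, method="DELETE") removes one.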

{datachain-0.7.8 → datachain-0.7.9}/src/datachain/studio.py
@@ -155,7 +155,7 @@ def edit_studio_dataset(
     if not response.ok:
         raise_remote_error(response.message)
 
-    print(f"Dataset {name} updated")
+    print(f"Dataset '{name}' updated in Studio")
 
 
 def remove_studio_dataset(
@@ -169,7 +169,7 @@ def remove_studio_dataset(
     if not response.ok:
         raise_remote_error(response.message)
 
-    print(f"Dataset {name} removed")
+    print(f"Dataset '{name}' removed from Studio")
 
 
 def save_config(hostname, token):

{datachain-0.7.8 → datachain-0.7.9/src/datachain.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.7.8
+Version: 0.7.9
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -98,7 +98,7 @@ Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.29; extra == "examples"
+Requires-Dist: ultralytics==8.3.37; extra == "examples"
 
 ================
 |logo| DataChain

{datachain-0.7.8 → datachain-0.7.9}/src/datachain.egg-info/requires.txt
@@ -55,7 +55,7 @@ unstructured[embed-huggingface,pdf]<0.16.0
 pdfplumber==0.11.4
 huggingface_hub[hf_transfer]
 onnx==1.16.1
-ultralytics==8.3.29
+ultralytics==8.3.37
 
 [hf]
 numba>=0.60.0

{datachain-0.7.8 → datachain-0.7.9}/tests/conftest.py
@@ -649,7 +649,7 @@ def studio_datasets(requests_mock):
         },
     ]
 
-    requests_mock.post(f"{STUDIO_URL}/api/datachain/ls-datasets", json=datasets)
+    requests_mock.get(f"{STUDIO_URL}/api/datachain/datasets", json=datasets)
 
 
 @pytest.fixture

{datachain-0.7.8 → datachain-0.7.9}/tests/func/test_catalog.py
@@ -772,6 +772,38 @@ def test_dataset_stats(test_session):
     assert dataset_version2.size == 18
 
 
+def test_ls_datasets_ordered(test_session):
+    ids = [1, 2, 3]
+    values = tuple(zip(["a", "b", "c"], ids))
+
+    assert not list(test_session.catalog.ls_datasets())
+
+    dc = DataChain.from_values(
+        ids=ids,
+        file=[File(path=name, size=size) for name, size in values],
+        session=test_session,
+    )
+    dc.save("cats")
+    dc.save("dogs")
+    dc.save("cats")
+    dc.save("cats")
+    dc.save("cats")
+    datasets = list(test_session.catalog.ls_datasets())
+
+    assert [
+        (d.name, v.version)
+        for d in datasets
+        for v in d.versions
+        if not d.name.startswith("session_")
+    ] == [
+        ("cats", 1),
+        ("cats", 2),
+        ("cats", 3),
+        ("cats", 4),
+        ("dogs", 1),
+    ]
+
+
 def test_ls_datasets_no_json(test_session):
     ids = [1, 2, 3]
     values = tuple(zip(["a", "b", "c"], [1, 2, 3]))

{datachain-0.7.8 → datachain-0.7.9}/tests/func/test_ls.py
@@ -193,7 +193,7 @@ class MockResponse:
         self.ok = ok
 
 
-def mock_post(url, data=None, json=None, **kwargs):
+def mock_post(method, url, data=None, json=None, **kwargs):
     source = json["source"]
     path = re.sub(r"\w+://[^/]+/?", "", source).rstrip("/")
     data = [
@@ -238,7 +238,7 @@ dog3
 def test_ls_remote_sources(cloud_type, capsys, monkeypatch, studio_config):
     src = f"{cloud_type}://bucket"
     with monkeypatch.context() as m:
-        m.setattr("requests.post", mock_post)
+        m.setattr("requests.request", mock_post)
         ls([src, f"{src}/dogs/others", f"{src}/dogs"], studio=True)
         captured = capsys.readouterr()
         assert captured.out == ls_remote_sources_output.format(src=src)

{datachain-0.7.8 → datachain-0.7.9}/tests/func/test_pull.py
@@ -150,28 +150,28 @@
 
 @pytest.fixture
 def remote_dataset_info(requests_mock, remote_dataset):
-    requests_mock.post(f"{STUDIO_URL}/api/datachain/dataset-info", json=remote_dataset)
+    requests_mock.get(f"{STUDIO_URL}/api/datachain/datasets/info", json=remote_dataset)
 
 
 @pytest.fixture
 def remote_dataset_stats(requests_mock):
-    requests_mock.post(
-        f"{STUDIO_URL}/api/datachain/dataset-stats",
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/stats",
         json={"num_objects": 5, "size": 1000},
     )
 
 
 @pytest.fixture
 def dataset_export(requests_mock, remote_dataset_chunk_url):
-    requests_mock.post(
-        f"{STUDIO_URL}/api/datachain/dataset-export", json=[remote_dataset_chunk_url]
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/export", json=[remote_dataset_chunk_url]
     )
 
 
 @pytest.fixture
 def dataset_export_status(requests_mock):
-    requests_mock.post(
-        f"{STUDIO_URL}/api/datachain/dataset-export-status",
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/export-status",
         json={"status": "completed"},
     )
 
@@ -303,8 +303,8 @@ def test_pull_dataset_not_found_in_remote(
     requests_mock,
     cloud_test_catalog,
 ):
-    requests_mock.post(
-        f"{STUDIO_URL}/api/datachain/dataset-info",
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/info",
         status_code=404,
         json={"message": "Dataset not found"},
     )
@@ -322,8 +322,8 @@ def test_pull_dataset_error_on_fetching_stats(
     cloud_test_catalog,
     remote_dataset_info,
 ):
-    requests_mock.post(
-        f"{STUDIO_URL}/api/datachain/dataset-stats",
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/stats",
         status_code=400,
         json={"message": "Internal error"},
     )
@@ -345,8 +345,8 @@ def test_pull_dataset_exporting_dataset_failed_in_remote(
     dataset_export,
     export_status,
 ):
-    requests_mock.post(
-        f"{STUDIO_URL}/api/datachain/dataset-export-status",
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/export-status",
         json={"status": export_status},
    )
 

{datachain-0.7.8 → datachain-0.7.9}/tests/test_cli_studio.py
@@ -169,7 +169,7 @@ def test_studio_datasets(capsys, studio_datasets, mocker):
 
 def test_studio_edit_dataset(capsys, mocker):
     with requests_mock.mock() as m:
-        m.post(f"{STUDIO_URL}/api/datachain/edit-dataset", json={})
+        m.post(f"{STUDIO_URL}/api/datachain/datasets", json={})
 
         # Studio token is required
         assert (
@@ -217,6 +217,8 @@ def test_studio_edit_dataset(capsys, mocker):
         "dataset_name": "name",
         "new_name": "new-name",
         "team_name": "team_name",
+        "description": None,
+        "labels": None,
     }
 
     # With all arguments
@@ -251,7 +253,7 @@ def test_studio_edit_dataset(capsys, mocker):
 
 def test_studio_rm_dataset(capsys, mocker):
     with requests_mock.mock() as m:
-        m.post(f"{STUDIO_URL}/api/datachain/rm-dataset", json={})
+        m.delete(f"{STUDIO_URL}/api/datachain/datasets", json={})
 
         # Studio token is required
         assert main(["datasets", "rm", "name", "--team", "team_name", "--studio"]) == 1