datachain 0.9.1__tar.gz → 0.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (322) hide show
  1. {datachain-0.9.1 → datachain-0.10.0}/PKG-INFO +3 -3
  2. {datachain-0.9.1 → datachain-0.10.0}/README.rst +2 -2
  3. {datachain-0.9.1 → datachain-0.10.0}/docs/quick-start.md +2 -2
  4. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/fsspec.py +1 -1
  5. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/dc.py +60 -4
  6. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/file.py +16 -5
  7. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/dataset.py +2 -2
  8. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/session.py +15 -3
  9. {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/PKG-INFO +3 -3
  10. {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/SOURCES.txt +1 -0
  11. {datachain-0.9.1 → datachain-0.10.0}/tests/conftest.py +2 -2
  12. datachain-0.10.0/tests/func/test_cloud_transfer.py +68 -0
  13. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_datachain.py +16 -6
  14. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_pull.py +1 -1
  15. {datachain-0.9.1 → datachain-0.10.0}/.cruft.json +0 -0
  16. {datachain-0.9.1 → datachain-0.10.0}/.gitattributes +0 -0
  17. {datachain-0.9.1 → datachain-0.10.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  18. {datachain-0.9.1 → datachain-0.10.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  19. {datachain-0.9.1 → datachain-0.10.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  20. {datachain-0.9.1 → datachain-0.10.0}/.github/codecov.yaml +0 -0
  21. {datachain-0.9.1 → datachain-0.10.0}/.github/dependabot.yml +0 -0
  22. {datachain-0.9.1 → datachain-0.10.0}/.github/workflows/benchmarks.yml +0 -0
  23. {datachain-0.9.1 → datachain-0.10.0}/.github/workflows/release.yml +0 -0
  24. {datachain-0.9.1 → datachain-0.10.0}/.github/workflows/tests-studio.yml +0 -0
  25. {datachain-0.9.1 → datachain-0.10.0}/.github/workflows/tests.yml +0 -0
  26. {datachain-0.9.1 → datachain-0.10.0}/.github/workflows/update-template.yaml +0 -0
  27. {datachain-0.9.1 → datachain-0.10.0}/.gitignore +0 -0
  28. {datachain-0.9.1 → datachain-0.10.0}/.pre-commit-config.yaml +0 -0
  29. {datachain-0.9.1 → datachain-0.10.0}/CODE_OF_CONDUCT.rst +0 -0
  30. {datachain-0.9.1 → datachain-0.10.0}/LICENSE +0 -0
  31. {datachain-0.9.1 → datachain-0.10.0}/docs/assets/captioned_cartoons.png +0 -0
  32. {datachain-0.9.1 → datachain-0.10.0}/docs/assets/datachain-white.svg +0 -0
  33. {datachain-0.9.1 → datachain-0.10.0}/docs/assets/datachain.svg +0 -0
  34. {datachain-0.9.1 → datachain-0.10.0}/docs/contributing.md +0 -0
  35. {datachain-0.9.1 → datachain-0.10.0}/docs/css/github-permalink-style.css +0 -0
  36. {datachain-0.9.1 → datachain-0.10.0}/docs/examples.md +0 -0
  37. {datachain-0.9.1 → datachain-0.10.0}/docs/index.md +0 -0
  38. {datachain-0.9.1 → datachain-0.10.0}/docs/overrides/main.html +0 -0
  39. {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/arrowrow.md +0 -0
  40. {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/bbox.md +0 -0
  41. {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/file.md +0 -0
  42. {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/imagefile.md +0 -0
  43. {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/index.md +0 -0
  44. {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/pose.md +0 -0
  45. {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/segment.md +0 -0
  46. {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/tarvfile.md +0 -0
  47. {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/textfile.md +0 -0
  48. {datachain-0.9.1 → datachain-0.10.0}/docs/references/data-types/videofile.md +0 -0
  49. {datachain-0.9.1 → datachain-0.10.0}/docs/references/datachain.md +0 -0
  50. {datachain-0.9.1 → datachain-0.10.0}/docs/references/func.md +0 -0
  51. {datachain-0.9.1 → datachain-0.10.0}/docs/references/index.md +0 -0
  52. {datachain-0.9.1 → datachain-0.10.0}/docs/references/toolkit.md +0 -0
  53. {datachain-0.9.1 → datachain-0.10.0}/docs/references/torch.md +0 -0
  54. {datachain-0.9.1 → datachain-0.10.0}/docs/references/udf.md +0 -0
  55. {datachain-0.9.1 → datachain-0.10.0}/docs/tutorials.md +0 -0
  56. {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  57. {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  58. {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/openimage-detect.py +0 -0
  59. {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  60. {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  61. {datachain-0.9.1 → datachain-0.10.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  62. {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/common_sql_functions.py +0 -0
  63. {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/json-csv-reader.py +0 -0
  64. {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/torch-loader.py +0 -0
  65. {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/udfs/parallel.py +0 -0
  66. {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/udfs/simple.py +0 -0
  67. {datachain-0.9.1 → datachain-0.10.0}/examples/get_started/udfs/stateful.py +0 -0
  68. {datachain-0.9.1 → datachain-0.10.0}/examples/llm_and_nlp/claude-query.py +0 -0
  69. {datachain-0.9.1 → datachain-0.10.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  70. {datachain-0.9.1 → datachain-0.10.0}/examples/multimodal/clip_inference.py +0 -0
  71. {datachain-0.9.1 → datachain-0.10.0}/examples/multimodal/hf_pipeline.py +0 -0
  72. {datachain-0.9.1 → datachain-0.10.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  73. {datachain-0.9.1 → datachain-0.10.0}/examples/multimodal/wds.py +0 -0
  74. {datachain-0.9.1 → datachain-0.10.0}/examples/multimodal/wds_filtered.py +0 -0
  75. {datachain-0.9.1 → datachain-0.10.0}/mkdocs.yml +0 -0
  76. {datachain-0.9.1 → datachain-0.10.0}/noxfile.py +0 -0
  77. {datachain-0.9.1 → datachain-0.10.0}/pyproject.toml +0 -0
  78. {datachain-0.9.1 → datachain-0.10.0}/setup.cfg +0 -0
  79. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/__init__.py +0 -0
  80. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/__main__.py +0 -0
  81. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/asyn.py +0 -0
  82. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cache.py +0 -0
  83. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/catalog/__init__.py +0 -0
  84. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/catalog/catalog.py +0 -0
  85. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/catalog/datasource.py +0 -0
  86. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/catalog/loader.py +0 -0
  87. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/__init__.py +0 -0
  88. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/__init__.py +0 -0
  89. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/datasets.py +0 -0
  90. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/du.py +0 -0
  91. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/index.py +0 -0
  92. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/ls.py +0 -0
  93. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/misc.py +0 -0
  94. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/query.py +0 -0
  95. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/commands/show.py +0 -0
  96. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/parser/__init__.py +0 -0
  97. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/parser/job.py +0 -0
  98. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/parser/studio.py +0 -0
  99. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/parser/utils.py +0 -0
  100. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/cli/utils.py +0 -0
  101. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/__init__.py +0 -0
  102. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/azure.py +0 -0
  103. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/fileslice.py +0 -0
  104. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/gcs.py +0 -0
  105. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/hf.py +0 -0
  106. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/local.py +0 -0
  107. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/client/s3.py +0 -0
  108. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/config.py +0 -0
  109. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/__init__.py +0 -0
  110. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/db_engine.py +0 -0
  111. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/job.py +0 -0
  112. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/metastore.py +0 -0
  113. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/schema.py +0 -0
  114. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/serializer.py +0 -0
  115. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/sqlite.py +0 -0
  116. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/data_storage/warehouse.py +0 -0
  117. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/dataset.py +0 -0
  118. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/diff/__init__.py +0 -0
  119. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/error.py +0 -0
  120. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/fs/__init__.py +0 -0
  121. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/fs/reference.py +0 -0
  122. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/__init__.py +0 -0
  123. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/aggregate.py +0 -0
  124. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/array.py +0 -0
  125. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/base.py +0 -0
  126. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/conditional.py +0 -0
  127. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/func.py +0 -0
  128. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/numeric.py +0 -0
  129. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/path.py +0 -0
  130. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/random.py +0 -0
  131. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/string.py +0 -0
  132. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/func/window.py +0 -0
  133. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/job.py +0 -0
  134. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/__init__.py +0 -0
  135. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/arrow.py +0 -0
  136. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/clip.py +0 -0
  137. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/__init__.py +0 -0
  138. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/flatten.py +0 -0
  139. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  140. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  141. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/unflatten.py +0 -0
  142. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  143. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/data_model.py +0 -0
  144. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/dataset_info.py +0 -0
  145. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/hf.py +0 -0
  146. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/image.py +0 -0
  147. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/listing.py +0 -0
  148. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/listing_info.py +0 -0
  149. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/meta_formats.py +0 -0
  150. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/model_store.py +0 -0
  151. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/pytorch.py +0 -0
  152. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/settings.py +0 -0
  153. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/signal_schema.py +0 -0
  154. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/tar.py +0 -0
  155. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/text.py +0 -0
  156. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/udf.py +0 -0
  157. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/udf_signature.py +0 -0
  158. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/utils.py +0 -0
  159. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/video.py +0 -0
  160. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/webdataset.py +0 -0
  161. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/lib/webdataset_laion.py +0 -0
  162. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/listing.py +0 -0
  163. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/__init__.py +0 -0
  164. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/bbox.py +0 -0
  165. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/pose.py +0 -0
  166. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/segment.py +0 -0
  167. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  168. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  169. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/ultralytics/pose.py +0 -0
  170. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/model/ultralytics/segment.py +0 -0
  171. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/node.py +0 -0
  172. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/nodes_fetcher.py +0 -0
  173. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/nodes_thread_pool.py +0 -0
  174. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/progress.py +0 -0
  175. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/py.typed +0 -0
  176. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/__init__.py +0 -0
  177. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/batch.py +0 -0
  178. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/dispatch.py +0 -0
  179. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/metrics.py +0 -0
  180. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/params.py +0 -0
  181. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/queue.py +0 -0
  182. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/schema.py +0 -0
  183. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/udf.py +0 -0
  184. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/query/utils.py +0 -0
  185. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/remote/__init__.py +0 -0
  186. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/remote/studio.py +0 -0
  187. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/__init__.py +0 -0
  188. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/default/__init__.py +0 -0
  189. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/default/base.py +0 -0
  190. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/__init__.py +0 -0
  191. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/aggregate.py +0 -0
  192. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/array.py +0 -0
  193. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/conditional.py +0 -0
  194. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/numeric.py +0 -0
  195. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/path.py +0 -0
  196. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/random.py +0 -0
  197. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/functions/string.py +0 -0
  198. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/selectable.py +0 -0
  199. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  200. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/sqlite/base.py +0 -0
  201. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/sqlite/types.py +0 -0
  202. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/sqlite/vector.py +0 -0
  203. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/types.py +0 -0
  204. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/sql/utils.py +0 -0
  205. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/studio.py +0 -0
  206. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/telemetry.py +0 -0
  207. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/toolkit/__init__.py +0 -0
  208. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/toolkit/split.py +0 -0
  209. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/torch/__init__.py +0 -0
  210. {datachain-0.9.1 → datachain-0.10.0}/src/datachain/utils.py +0 -0
  211. {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  212. {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/entry_points.txt +0 -0
  213. {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/requires.txt +0 -0
  214. {datachain-0.9.1 → datachain-0.10.0}/src/datachain.egg-info/top_level.txt +0 -0
  215. {datachain-0.9.1 → datachain-0.10.0}/tests/__init__.py +0 -0
  216. {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/__init__.py +0 -0
  217. {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/conftest.py +0 -0
  218. {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  219. {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  220. {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/datasets/.gitignore +0 -0
  221. {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  222. {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/test_datachain.py +0 -0
  223. {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/test_ls.py +0 -0
  224. {datachain-0.9.1 → datachain-0.10.0}/tests/benchmarks/test_version.py +0 -0
  225. {datachain-0.9.1 → datachain-0.10.0}/tests/data.py +0 -0
  226. {datachain-0.9.1 → datachain-0.10.0}/tests/examples/__init__.py +0 -0
  227. {datachain-0.9.1 → datachain-0.10.0}/tests/examples/test_examples.py +0 -0
  228. {datachain-0.9.1 → datachain-0.10.0}/tests/examples/test_wds_e2e.py +0 -0
  229. {datachain-0.9.1 → datachain-0.10.0}/tests/examples/wds_data.py +0 -0
  230. {datachain-0.9.1 → datachain-0.10.0}/tests/func/__init__.py +0 -0
  231. {datachain-0.9.1 → datachain-0.10.0}/tests/func/fake-service-account-credentials.json +0 -0
  232. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_catalog.py +0 -0
  233. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_client.py +0 -0
  234. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_data_storage.py +0 -0
  235. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_datachain_merge.py +0 -0
  236. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_dataset_query.py +0 -0
  237. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_datasets.py +0 -0
  238. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_feature_pickling.py +0 -0
  239. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_file.py +0 -0
  240. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_hf.py +0 -0
  241. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_listing.py +0 -0
  242. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_ls.py +0 -0
  243. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_meta_formats.py +0 -0
  244. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_metrics.py +0 -0
  245. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_pytorch.py +0 -0
  246. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_query.py +0 -0
  247. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_session.py +0 -0
  248. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_toolkit.py +0 -0
  249. {datachain-0.9.1 → datachain-0.10.0}/tests/func/test_warehouse.py +0 -0
  250. {datachain-0.9.1 → datachain-0.10.0}/tests/scripts/feature_class.py +0 -0
  251. {datachain-0.9.1 → datachain-0.10.0}/tests/scripts/feature_class_exception.py +0 -0
  252. {datachain-0.9.1 → datachain-0.10.0}/tests/scripts/feature_class_parallel.py +0 -0
  253. {datachain-0.9.1 → datachain-0.10.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  254. {datachain-0.9.1 → datachain-0.10.0}/tests/scripts/name_len_slow.py +0 -0
  255. {datachain-0.9.1 → datachain-0.10.0}/tests/test_atomicity.py +0 -0
  256. {datachain-0.9.1 → datachain-0.10.0}/tests/test_cli_e2e.py +0 -0
  257. {datachain-0.9.1 → datachain-0.10.0}/tests/test_cli_studio.py +0 -0
  258. {datachain-0.9.1 → datachain-0.10.0}/tests/test_query_e2e.py +0 -0
  259. {datachain-0.9.1 → datachain-0.10.0}/tests/test_telemetry.py +0 -0
  260. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/__init__.py +0 -0
  261. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/__init__.py +0 -0
  262. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/conftest.py +0 -0
  263. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  264. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_arrow.py +0 -0
  265. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_clip.py +0 -0
  266. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_datachain.py +0 -0
  267. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  268. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  269. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_diff.py +0 -0
  270. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_feature.py +0 -0
  271. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_feature_utils.py +0 -0
  272. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_file.py +0 -0
  273. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_hf.py +0 -0
  274. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_image.py +0 -0
  275. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_listing_info.py +0 -0
  276. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_models.py +0 -0
  277. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  278. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_schema.py +0 -0
  279. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_signal_schema.py +0 -0
  280. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  281. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_text.py +0 -0
  282. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_udf_signature.py +0 -0
  283. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_utils.py +0 -0
  284. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_video.py +0 -0
  285. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/lib/test_webdataset.py +0 -0
  286. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/__init__.py +0 -0
  287. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  288. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  289. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  290. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_array.py +0 -0
  291. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_conditional.py +0 -0
  292. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_path.py +0 -0
  293. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_random.py +0 -0
  294. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_selectable.py +0 -0
  295. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/sql/test_string.py +0 -0
  296. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_asyn.py +0 -0
  297. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_cache.py +0 -0
  298. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_catalog.py +0 -0
  299. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_catalog_loader.py +0 -0
  300. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_cli_parsing.py +0 -0
  301. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_client.py +0 -0
  302. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_client_gcs.py +0 -0
  303. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_client_s3.py +0 -0
  304. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_config.py +0 -0
  305. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_data_storage.py +0 -0
  306. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_database_engine.py +0 -0
  307. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_dataset.py +0 -0
  308. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_dispatch.py +0 -0
  309. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_fileslice.py +0 -0
  310. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_func.py +0 -0
  311. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_listing.py +0 -0
  312. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_metastore.py +0 -0
  313. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_module_exports.py +0 -0
  314. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_pytorch.py +0 -0
  315. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_query.py +0 -0
  316. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_query_metrics.py +0 -0
  317. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_query_params.py +0 -0
  318. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_serializer.py +0 -0
  319. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_session.py +0 -0
  320. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_utils.py +0 -0
  321. {datachain-0.9.1 → datachain-0.10.0}/tests/unit/test_warehouse.py +0 -0
  322. {datachain-0.9.1 → datachain-0.10.0}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.9.1
3
+ Version: 0.10.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -175,7 +175,7 @@ high confidence scores.
175
175
 
176
176
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
177
177
  & (Column("meta.inference.class_") == "cat"))
178
- likely_cats.export_files("high-confidence-cats/", signal="file")
178
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
179
179
 
180
180
 
181
181
  Example: LLM based text-file evaluation
@@ -216,7 +216,7 @@ Python code:
216
216
  )
217
217
 
218
218
  successful_chain = chain.filter(Column("is_success") == True)
219
- successful_chain.export_files("./output_mistral")
219
+ successful_chain.to_storage("./output_mistral")
220
220
 
221
221
  print(f"{successful_chain.count()} files were exported")
222
222
 
@@ -68,7 +68,7 @@ high confidence scores.
68
68
 
69
69
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
70
70
  & (Column("meta.inference.class_") == "cat"))
71
- likely_cats.export_files("high-confidence-cats/", signal="file")
71
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
72
72
 
73
73
 
74
74
  Example: LLM based text-file evaluation
@@ -109,7 +109,7 @@ Python code:
109
109
  )
110
110
 
111
111
  successful_chain = chain.filter(Column("is_success") == True)
112
- successful_chain.export_files("./output_mistral")
112
+ successful_chain.to_storage("./output_mistral")
113
113
 
114
114
  print(f"{successful_chain.count()} files were exported")
115
115
 
@@ -47,7 +47,7 @@ annotated = images_id.merge(meta, on="id", right_on="meta.id")
47
47
 
48
48
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
49
49
  & (Column("meta.inference.class_") == "cat"))
50
- likely_cats.export_files("high-confidence-cats/", signal="file")
50
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
51
51
  ```
52
52
 
53
53
  ## Data curation with a local AI model
@@ -85,7 +85,7 @@ chain = (
85
85
  )
86
86
 
87
87
  positive_chain = chain.filter(Column("is_positive") == True)
88
- positive_chain.export_files("./output")
88
+ positive_chain.to_storage("./output")
89
89
 
90
90
  print(f"{positive_chain.count()} files were exported")
91
91
  ```
@@ -390,7 +390,7 @@ class Client(ABC):
390
390
  ) # type: ignore[return-value]
391
391
 
392
392
  def upload(self, data: bytes, path: str) -> "File":
393
- full_path = self.get_full_path(path)
393
+ full_path = path if path.startswith(self.PREFIX) else self.get_full_path(path)
394
394
 
395
395
  parent = posixpath.dirname(full_path)
396
396
  self.fs.makedirs(parent, exist_ok=True)
@@ -411,6 +411,7 @@ class DataChain:
411
411
  object_name: str = "file",
412
412
  update: bool = False,
413
413
  anon: bool = False,
414
+ client_config: Optional[dict] = None,
414
415
  ) -> "Self":
415
416
  """Get data from a storage as a list of files with all file attributes.
416
417
  It returns the chain itself as usual.
@@ -423,15 +424,32 @@ class DataChain:
423
424
  object_name : Created object column name.
424
425
  update : force storage reindexing. Default is False.
425
426
  anon : If True, treat the cloud bucket as a public one
427
+ client_config : Optional client configuration for the storage client.
426
428
 
427
429
  Example:
430
+ Simple call from s3
428
431
  ```py
429
432
  chain = DataChain.from_storage("s3://my-bucket/my-dir")
430
433
  ```
434
+
435
+ With AWS S3-compatible storage
436
+ ```py
437
+ chain = DataChain.from_storage(
438
+ "s3://my-bucket/my-dir",
439
+ client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
440
+ )
441
+ ```
442
+
443
+ Pass existing session
444
+ ```py
445
+ session = Session.get()
446
+ chain = DataChain.from_storage("s3://my-bucket/my-dir", session=session)
447
+ ```
431
448
  """
432
449
  file_type = get_file_type(type)
433
450
 
434
- client_config = {"anon": True} if anon else None
451
+ if anon:
452
+ client_config = (client_config or {}) | {"anon": True}
435
453
  session = Session.get(session, client_config=client_config, in_memory=in_memory)
436
454
  cache = session.catalog.cache
437
455
  client_config = session.catalog.client_config
@@ -481,25 +499,56 @@ class DataChain:
481
499
  version: Optional[int] = None,
482
500
  session: Optional[Session] = None,
483
501
  settings: Optional[dict] = None,
484
- fallback_to_remote: bool = True,
502
+ fallback_to_studio: bool = True,
485
503
  ) -> "Self":
486
504
  """Get data from a saved Dataset. It returns the chain itself.
505
+ If dataset or version is not found locally, it will try to pull it from Studio.
487
506
 
488
507
  Parameters:
489
508
  name : dataset name
490
509
  version : dataset version
510
+ session : Session to use for the chain.
511
+ settings : Settings to use for the chain.
512
+ fallback_to_studio : Try to pull dataset from Studio if not found locally.
513
+ Default is True.
491
514
 
492
515
  Example:
493
516
  ```py
494
517
  chain = DataChain.from_dataset("my_cats")
495
518
  ```
519
+
520
+ ```py
521
+ chain = DataChain.from_dataset("my_cats", fallback_to_studio=False)
522
+ ```
523
+
524
+ ```py
525
+ chain = DataChain.from_dataset("my_cats", version=1)
526
+ ```
527
+
528
+ ```py
529
+ session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
530
+ settings = {
531
+ "cache": True,
532
+ "parallel": 4,
533
+ "workers": 4,
534
+ "min_task_size": 1000,
535
+ "prefetch": 10,
536
+ }
537
+ chain = DataChain.from_dataset(
538
+ name="my_cats",
539
+ version=1,
540
+ session=session,
541
+ settings=settings,
542
+ fallback_to_studio=True,
543
+ )
544
+ ```
496
545
  """
497
546
  query = DatasetQuery(
498
547
  name=name,
499
548
  version=version,
500
549
  session=session,
501
550
  indexing_column_types=File._datachain_column_types,
502
- fallback_to_remote=fallback_to_remote,
551
+ fallback_to_studio=fallback_to_studio,
503
552
  )
504
553
  telemetry.send_event_once("class", "datachain_init", name=name, version=version)
505
554
  if settings:
@@ -2444,7 +2493,7 @@ class DataChain:
2444
2493
  self._setup = self._setup | kwargs
2445
2494
  return self
2446
2495
 
2447
- def export_files(
2496
+ def to_storage(
2448
2497
  self,
2449
2498
  output: str,
2450
2499
  signal: str = "file",
@@ -2462,6 +2511,13 @@ class DataChain:
2462
2511
  use_cache: If `True`, cache the files before exporting.
2463
2512
  link_type: Method to use for exporting files.
2464
2513
  Falls back to `'copy'` if symlinking fails.
2514
+
2515
+ Example:
2516
+ Cross cloud transfer
2517
+ ```py
2518
+ ds = DataChain.from_storage("s3://mybucket")
2519
+ ds.to_storage("gs://mybucket", placement="filename")
2520
+ ```
2465
2521
  """
2466
2522
  if placement == "filename" and (
2467
2523
  self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
@@ -17,6 +17,7 @@ from urllib.parse import unquote, urlparse
17
17
  from urllib.request import url2pathname
18
18
 
19
19
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
20
+ from fsspec.utils import stringify_path
20
21
  from PIL import Image as PilImage
21
22
  from pydantic import Field, field_validator
22
23
 
@@ -270,8 +271,9 @@ class File(DataModel):
270
271
 
271
272
  def save(self, destination: str):
272
273
  """Writes its content to destination"""
273
- with open(destination, mode="wb") as f:
274
- f.write(self.read())
274
+ destination = stringify_path(destination)
275
+ client: Client = self._catalog.get_client(str(destination))
276
+ client.upload(self.read(), str(destination))
275
277
 
276
278
  def _symlink_to(self, destination: str):
277
279
  if self.location:
@@ -285,6 +287,7 @@ class File(DataModel):
285
287
  source = self.get_path()
286
288
  else:
287
289
  raise OSError(errno.EXDEV, "can't link across filesystems")
290
+
288
291
  return os.symlink(source, destination)
289
292
 
290
293
  def export(
@@ -299,7 +302,8 @@ class File(DataModel):
299
302
  self._caching_enabled = use_cache
300
303
  dst = self.get_destination_path(output, placement)
301
304
  dst_dir = os.path.dirname(dst)
302
- os.makedirs(dst_dir, exist_ok=True)
305
+ client: Client = self._catalog.get_client(dst_dir)
306
+ client.fs.makedirs(dst_dir, exist_ok=True)
303
307
 
304
308
  if link_type == "symlink":
305
309
  try:
@@ -496,7 +500,10 @@ class TextFile(File):
496
500
 
497
501
  def save(self, destination: str):
498
502
  """Writes its content to destination"""
499
- with open(destination, mode="w") as f:
503
+ destination = stringify_path(destination)
504
+
505
+ client: Client = self._catalog.get_client(destination)
506
+ with client.fs.open(destination, mode="w") as f:
500
507
  f.write(self.read_text())
501
508
 
502
509
 
@@ -510,7 +517,11 @@ class ImageFile(File):
510
517
 
511
518
  def save(self, destination: str):
512
519
  """Writes its content to destination"""
513
- self.read().save(destination)
520
+ destination = stringify_path(destination)
521
+
522
+ client: Client = self._catalog.get_client(destination)
523
+ with client.fs.open(destination, mode="wb") as f:
524
+ self.read().save(f)
514
525
 
515
526
 
516
527
  class Image(DataModel):
@@ -1085,7 +1085,7 @@ class DatasetQuery:
1085
1085
  session: Optional[Session] = None,
1086
1086
  indexing_column_types: Optional[dict[str, Any]] = None,
1087
1087
  in_memory: bool = False,
1088
- fallback_to_remote: bool = True,
1088
+ fallback_to_studio: bool = True,
1089
1089
  ) -> None:
1090
1090
  self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
1091
1091
  self.catalog = catalog or self.session.catalog
@@ -1103,7 +1103,7 @@ class DatasetQuery:
1103
1103
 
1104
1104
  self.name = name
1105
1105
 
1106
- if fallback_to_remote and is_token_set():
1106
+ if fallback_to_studio and is_token_set():
1107
1107
  ds = self.catalog.get_dataset_with_remote_fallback(name, version)
1108
1108
  else:
1109
1109
  ds = self.catalog.get_dataset(name)
@@ -139,21 +139,33 @@ class Session:
139
139
 
140
140
  # Access the active (most recent) context from the stack
141
141
  if cls.SESSION_CONTEXTS:
142
- return cls.SESSION_CONTEXTS[-1]
142
+ session = cls.SESSION_CONTEXTS[-1]
143
143
 
144
- if cls.GLOBAL_SESSION_CTX is None:
144
+ elif cls.GLOBAL_SESSION_CTX is None:
145
145
  cls.GLOBAL_SESSION_CTX = Session(
146
146
  cls.GLOBAL_SESSION_NAME,
147
147
  catalog,
148
148
  client_config=client_config,
149
149
  in_memory=in_memory,
150
150
  )
151
+ session = cls.GLOBAL_SESSION_CTX
151
152
 
152
153
  atexit.register(cls._global_cleanup)
153
154
  cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
154
155
  sys.excepthook = cls.except_hook
156
+ else:
157
+ session = cls.GLOBAL_SESSION_CTX
155
158
 
156
- return cls.GLOBAL_SESSION_CTX
159
+ if client_config and session.catalog.client_config != client_config:
160
+ session = Session(
161
+ "session" + uuid4().hex[:4],
162
+ catalog,
163
+ client_config=client_config,
164
+ in_memory=in_memory,
165
+ )
166
+ session.__enter__()
167
+
168
+ return session
157
169
 
158
170
  @staticmethod
159
171
  def except_hook(exc_type, exc_value, exc_traceback):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.9.1
3
+ Version: 0.10.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -175,7 +175,7 @@ high confidence scores.
175
175
 
176
176
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
177
177
  & (Column("meta.inference.class_") == "cat"))
178
- likely_cats.export_files("high-confidence-cats/", signal="file")
178
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
179
179
 
180
180
 
181
181
  Example: LLM based text-file evaluation
@@ -216,7 +216,7 @@ Python code:
216
216
  )
217
217
 
218
218
  successful_chain = chain.filter(Column("is_success") == True)
219
- successful_chain.export_files("./output_mistral")
219
+ successful_chain.to_storage("./output_mistral")
220
220
 
221
221
  print(f"{successful_chain.count()} files were exported")
222
222
 
@@ -232,6 +232,7 @@ tests/func/__init__.py
232
232
  tests/func/fake-service-account-credentials.json
233
233
  tests/func/test_catalog.py
234
234
  tests/func/test_client.py
235
+ tests/func/test_cloud_transfer.py
235
236
  tests/func/test_data_storage.py
236
237
  tests/func/test_datachain.py
237
238
  tests/func/test_datachain_merge.py
@@ -472,9 +472,9 @@ def cloud_server_credentials(cloud_server, monkeypatch):
472
472
 
473
473
  def get_cloud_test_catalog(cloud_server, tmp_path, metastore, warehouse):
474
474
  cache_dir = tmp_path / ".datachain" / "cache"
475
- cache_dir.mkdir(parents=True)
475
+ cache_dir.mkdir(parents=True, exist_ok=True)
476
476
  tmpfile_dir = tmp_path / ".datachain" / "tmp"
477
- tmpfile_dir.mkdir()
477
+ tmpfile_dir.mkdir(exist_ok=True)
478
478
 
479
479
  catalog = Catalog(
480
480
  metastore=metastore,
@@ -0,0 +1,68 @@
1
+ import pytest
2
+
3
+ from datachain import Session
4
+ from datachain.lib.dc import DataChain
5
+ from tests.conftest import get_cloud_test_catalog, make_cloud_server
6
+
7
+
8
+ def test_cross_cloud_transfer(
9
+ request,
10
+ tmp_upath_factory,
11
+ tree,
12
+ tmp_path,
13
+ metastore,
14
+ warehouse,
15
+ ):
16
+ disabled_remotes = request.config.getoption("--disable-remotes") or []
17
+
18
+ if any(remote in disabled_remotes for remote in ["azure", "gs", "all"]):
19
+ pytest.skip("Skipping all tests for azure, gs or all remotes")
20
+
21
+ azure_path = tmp_upath_factory.mktemp("azure", version_aware=False)
22
+ azure_server = make_cloud_server(azure_path, "azure", tree)
23
+
24
+ gcloud_path = tmp_upath_factory.mktemp("gs", version_aware=False)
25
+ gcloud_server = make_cloud_server(gcloud_path, "gs", tree)
26
+
27
+ # Initialize cloud catalogs
28
+ azure_catalog = get_cloud_test_catalog(azure_server, tmp_path, metastore, warehouse)
29
+ gcloud_catalog = get_cloud_test_catalog(
30
+ gcloud_server, tmp_path, metastore, warehouse
31
+ )
32
+
33
+ # Define test file paths
34
+ test_filename = "image_1.jpg"
35
+ test_content = b"bytes"
36
+
37
+ source_dir = f"{azure_catalog.src_uri}/source-test-images"
38
+ source_file = f"{source_dir}/{test_filename}"
39
+
40
+ dest_dir = f"{gcloud_catalog.src_uri}/destination-test-images"
41
+ dest_file = f"{dest_dir}/{test_filename}"
42
+
43
+ # Get cloud clients
44
+ azure_client = azure_catalog.catalog.get_client(source_file)
45
+ gcloud_client = gcloud_catalog.catalog.get_client(dest_file)
46
+
47
+ try:
48
+ # Create test file in Azure
49
+ with azure_client.fs.open(source_file, "wb") as f:
50
+ f.write(test_content)
51
+
52
+ # Perform cross-cloud transfer
53
+ combined_config = azure_server.client_config | gcloud_server.client_config
54
+ with Session("testSession", client_config=combined_config):
55
+ datachain = DataChain.from_storage(source_dir)
56
+ datachain.to_storage(dest_dir, placement="filename")
57
+
58
+ # Verify transfer
59
+ with gcloud_client.fs.open(dest_file, "rb") as f:
60
+ assert f.read() == test_content
61
+
62
+ finally:
63
+ # Cleanup
64
+ try:
65
+ azure_client.fs.rm(source_dir, recursive=True)
66
+ gcloud_client.fs.rm(dest_dir, recursive=True)
67
+ except FileNotFoundError:
68
+ pass
@@ -64,6 +64,16 @@ def test_catalog_anon(tmp_dir, catalog, anon):
64
64
  assert chain.session.catalog.client_config.get("anon", False) is anon
65
65
 
66
66
 
67
+ def test_from_storage_client_config(tmp_dir, catalog):
68
+ dc = DataChain.from_storage(tmp_dir.as_uri())
69
+ assert dc.session.catalog.client_config == {} # Default client config is set.
70
+
71
+ dc = DataChain.from_storage(tmp_dir.as_uri(), client_config={"anon": True})
72
+ assert dc.session.catalog.client_config == {
73
+ "anon": True
74
+ } # New client config is set.
75
+
76
+
67
77
  def test_from_storage(cloud_test_catalog):
68
78
  ctc = cloud_test_catalog
69
79
  dc = DataChain.from_storage(ctc.src_uri, session=ctc.session)
@@ -292,20 +302,20 @@ def test_read_file(cloud_test_catalog, use_cache):
292
302
  @pytest.mark.parametrize("use_cache", [True, False])
293
303
  @pytest.mark.parametrize("file_type", ["", "binary", "text"])
294
304
  @pytest.mark.parametrize("cloud_type", ["file"], indirect=True)
295
- def test_export_files(
305
+ def test_to_storage(
296
306
  tmp_dir, cloud_test_catalog, test_session, placement, use_map, use_cache, file_type
297
307
  ):
298
308
  ctc = cloud_test_catalog
299
309
  df = DataChain.from_storage(ctc.src_uri, type=file_type, session=test_session)
300
310
  if use_map:
301
- df.export_files(tmp_dir / "output", placement=placement, use_cache=use_cache)
311
+ df.to_storage(tmp_dir / "output", placement=placement, use_cache=use_cache)
302
312
  df.map(
303
313
  res=lambda file: file.export(
304
314
  tmp_dir / "output", placement=placement, use_cache=use_cache
305
315
  )
306
316
  ).exec()
307
317
  else:
308
- df.export_files(tmp_dir / "output", placement=placement)
318
+ df.to_storage(tmp_dir / "output", placement=placement)
309
319
 
310
320
  expected = {
311
321
  "description": "Cats and Dogs",
@@ -341,14 +351,14 @@ def test_export_images_files(test_session, tmp_dir, tmp_path, use_cache):
341
351
  ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images
342
352
  ],
343
353
  session=test_session,
344
- ).export_files(tmp_dir / "output", placement="filename", use_cache=use_cache)
354
+ ).to_storage(tmp_dir / "output", placement="filename", use_cache=use_cache)
345
355
 
346
356
  for img in images:
347
357
  exported_img = Image.open(tmp_dir / "output" / img["name"])
348
358
  assert images_equal(img["data"], exported_img)
349
359
 
350
360
 
351
- def test_export_files_filename_placement_not_unique_files(tmp_dir, test_session):
361
+ def test_to_storage_files_filename_placement_not_unique_files(tmp_dir, test_session):
352
362
  data = b"some\x00data\x00is\x48\x65\x6c\x57\x6f\x72\x6c\x64\xff\xffheRe"
353
363
  bucket_name = "mybucket"
354
364
  files = ["dir1/a.json", "dir1/dir2/a.json"]
@@ -364,7 +374,7 @@ def test_export_files_filename_placement_not_unique_files(tmp_dir, test_session)
364
374
 
365
375
  df = DataChain.from_storage((tmp_dir / bucket_name).as_uri(), session=test_session)
366
376
  with pytest.raises(ValueError):
367
- df.export_files(tmp_dir / "output", placement="filename")
377
+ df.to_storage(tmp_dir / "output", placement="filename")
368
378
 
369
379
 
370
380
  def test_show(capsys, test_session):
@@ -295,7 +295,7 @@ def test_datachain_from_dataset_pull(
295
295
  ds = DataChain.from_dataset(
296
296
  name="dogs",
297
297
  version=1,
298
- fallback_to_remote=True,
298
+ fallback_to_studio=True,
299
299
  )
300
300
 
301
301
  assert ds.dataset.name == "dogs"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes