datachain 0.9.0__tar.gz → 0.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (322)
  1. {datachain-0.9.0 → datachain-0.10.0}/PKG-INFO +3 -3
  2. {datachain-0.9.0 → datachain-0.10.0}/README.rst +2 -2
  3. {datachain-0.9.0 → datachain-0.10.0}/docs/quick-start.md +2 -2
  4. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/client/fsspec.py +1 -1
  5. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/dc.py +60 -4
  6. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/file.py +22 -8
  7. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/query/dataset.py +2 -2
  8. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/query/session.py +15 -3
  9. {datachain-0.9.0 → datachain-0.10.0}/src/datachain.egg-info/PKG-INFO +3 -3
  10. {datachain-0.9.0 → datachain-0.10.0}/src/datachain.egg-info/SOURCES.txt +1 -0
  11. {datachain-0.9.0 → datachain-0.10.0}/tests/conftest.py +2 -2
  12. datachain-0.10.0/tests/func/test_cloud_transfer.py +68 -0
  13. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_datachain.py +16 -6
  14. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_file.py +8 -6
  15. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_pull.py +1 -1
  16. {datachain-0.9.0 → datachain-0.10.0}/.cruft.json +0 -0
  17. {datachain-0.9.0 → datachain-0.10.0}/.gitattributes +0 -0
  18. {datachain-0.9.0 → datachain-0.10.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  19. {datachain-0.9.0 → datachain-0.10.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  20. {datachain-0.9.0 → datachain-0.10.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  21. {datachain-0.9.0 → datachain-0.10.0}/.github/codecov.yaml +0 -0
  22. {datachain-0.9.0 → datachain-0.10.0}/.github/dependabot.yml +0 -0
  23. {datachain-0.9.0 → datachain-0.10.0}/.github/workflows/benchmarks.yml +0 -0
  24. {datachain-0.9.0 → datachain-0.10.0}/.github/workflows/release.yml +0 -0
  25. {datachain-0.9.0 → datachain-0.10.0}/.github/workflows/tests-studio.yml +0 -0
  26. {datachain-0.9.0 → datachain-0.10.0}/.github/workflows/tests.yml +0 -0
  27. {datachain-0.9.0 → datachain-0.10.0}/.github/workflows/update-template.yaml +0 -0
  28. {datachain-0.9.0 → datachain-0.10.0}/.gitignore +0 -0
  29. {datachain-0.9.0 → datachain-0.10.0}/.pre-commit-config.yaml +0 -0
  30. {datachain-0.9.0 → datachain-0.10.0}/CODE_OF_CONDUCT.rst +0 -0
  31. {datachain-0.9.0 → datachain-0.10.0}/LICENSE +0 -0
  32. {datachain-0.9.0 → datachain-0.10.0}/docs/assets/captioned_cartoons.png +0 -0
  33. {datachain-0.9.0 → datachain-0.10.0}/docs/assets/datachain-white.svg +0 -0
  34. {datachain-0.9.0 → datachain-0.10.0}/docs/assets/datachain.svg +0 -0
  35. {datachain-0.9.0 → datachain-0.10.0}/docs/contributing.md +0 -0
  36. {datachain-0.9.0 → datachain-0.10.0}/docs/css/github-permalink-style.css +0 -0
  37. {datachain-0.9.0 → datachain-0.10.0}/docs/examples.md +0 -0
  38. {datachain-0.9.0 → datachain-0.10.0}/docs/index.md +0 -0
  39. {datachain-0.9.0 → datachain-0.10.0}/docs/overrides/main.html +0 -0
  40. {datachain-0.9.0 → datachain-0.10.0}/docs/references/data-types/arrowrow.md +0 -0
  41. {datachain-0.9.0 → datachain-0.10.0}/docs/references/data-types/bbox.md +0 -0
  42. {datachain-0.9.0 → datachain-0.10.0}/docs/references/data-types/file.md +0 -0
  43. {datachain-0.9.0 → datachain-0.10.0}/docs/references/data-types/imagefile.md +0 -0
  44. {datachain-0.9.0 → datachain-0.10.0}/docs/references/data-types/index.md +0 -0
  45. {datachain-0.9.0 → datachain-0.10.0}/docs/references/data-types/pose.md +0 -0
  46. {datachain-0.9.0 → datachain-0.10.0}/docs/references/data-types/segment.md +0 -0
  47. {datachain-0.9.0 → datachain-0.10.0}/docs/references/data-types/tarvfile.md +0 -0
  48. {datachain-0.9.0 → datachain-0.10.0}/docs/references/data-types/textfile.md +0 -0
  49. {datachain-0.9.0 → datachain-0.10.0}/docs/references/data-types/videofile.md +0 -0
  50. {datachain-0.9.0 → datachain-0.10.0}/docs/references/datachain.md +0 -0
  51. {datachain-0.9.0 → datachain-0.10.0}/docs/references/func.md +0 -0
  52. {datachain-0.9.0 → datachain-0.10.0}/docs/references/index.md +0 -0
  53. {datachain-0.9.0 → datachain-0.10.0}/docs/references/toolkit.md +0 -0
  54. {datachain-0.9.0 → datachain-0.10.0}/docs/references/torch.md +0 -0
  55. {datachain-0.9.0 → datachain-0.10.0}/docs/references/udf.md +0 -0
  56. {datachain-0.9.0 → datachain-0.10.0}/docs/tutorials.md +0 -0
  57. {datachain-0.9.0 → datachain-0.10.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  58. {datachain-0.9.0 → datachain-0.10.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  59. {datachain-0.9.0 → datachain-0.10.0}/examples/computer_vision/openimage-detect.py +0 -0
  60. {datachain-0.9.0 → datachain-0.10.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  61. {datachain-0.9.0 → datachain-0.10.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  62. {datachain-0.9.0 → datachain-0.10.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  63. {datachain-0.9.0 → datachain-0.10.0}/examples/get_started/common_sql_functions.py +0 -0
  64. {datachain-0.9.0 → datachain-0.10.0}/examples/get_started/json-csv-reader.py +0 -0
  65. {datachain-0.9.0 → datachain-0.10.0}/examples/get_started/torch-loader.py +0 -0
  66. {datachain-0.9.0 → datachain-0.10.0}/examples/get_started/udfs/parallel.py +0 -0
  67. {datachain-0.9.0 → datachain-0.10.0}/examples/get_started/udfs/simple.py +0 -0
  68. {datachain-0.9.0 → datachain-0.10.0}/examples/get_started/udfs/stateful.py +0 -0
  69. {datachain-0.9.0 → datachain-0.10.0}/examples/llm_and_nlp/claude-query.py +0 -0
  70. {datachain-0.9.0 → datachain-0.10.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  71. {datachain-0.9.0 → datachain-0.10.0}/examples/multimodal/clip_inference.py +0 -0
  72. {datachain-0.9.0 → datachain-0.10.0}/examples/multimodal/hf_pipeline.py +0 -0
  73. {datachain-0.9.0 → datachain-0.10.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  74. {datachain-0.9.0 → datachain-0.10.0}/examples/multimodal/wds.py +0 -0
  75. {datachain-0.9.0 → datachain-0.10.0}/examples/multimodal/wds_filtered.py +0 -0
  76. {datachain-0.9.0 → datachain-0.10.0}/mkdocs.yml +0 -0
  77. {datachain-0.9.0 → datachain-0.10.0}/noxfile.py +0 -0
  78. {datachain-0.9.0 → datachain-0.10.0}/pyproject.toml +0 -0
  79. {datachain-0.9.0 → datachain-0.10.0}/setup.cfg +0 -0
  80. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/__init__.py +0 -0
  81. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/__main__.py +0 -0
  82. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/asyn.py +0 -0
  83. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cache.py +0 -0
  84. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/catalog/__init__.py +0 -0
  85. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/catalog/catalog.py +0 -0
  86. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/catalog/datasource.py +0 -0
  87. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/catalog/loader.py +0 -0
  88. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/__init__.py +0 -0
  89. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/commands/__init__.py +0 -0
  90. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/commands/datasets.py +0 -0
  91. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/commands/du.py +0 -0
  92. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/commands/index.py +0 -0
  93. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/commands/ls.py +0 -0
  94. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/commands/misc.py +0 -0
  95. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/commands/query.py +0 -0
  96. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/commands/show.py +0 -0
  97. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/parser/__init__.py +0 -0
  98. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/parser/job.py +0 -0
  99. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/parser/studio.py +0 -0
  100. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/parser/utils.py +0 -0
  101. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/cli/utils.py +0 -0
  102. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/client/__init__.py +0 -0
  103. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/client/azure.py +0 -0
  104. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/client/fileslice.py +0 -0
  105. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/client/gcs.py +0 -0
  106. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/client/hf.py +0 -0
  107. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/client/local.py +0 -0
  108. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/client/s3.py +0 -0
  109. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/config.py +0 -0
  110. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/data_storage/__init__.py +0 -0
  111. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/data_storage/db_engine.py +0 -0
  112. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/data_storage/job.py +0 -0
  113. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/data_storage/metastore.py +0 -0
  114. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/data_storage/schema.py +0 -0
  115. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/data_storage/serializer.py +0 -0
  116. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/data_storage/sqlite.py +0 -0
  117. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/data_storage/warehouse.py +0 -0
  118. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/dataset.py +0 -0
  119. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/diff/__init__.py +0 -0
  120. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/error.py +0 -0
  121. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/fs/__init__.py +0 -0
  122. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/fs/reference.py +0 -0
  123. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/func/__init__.py +0 -0
  124. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/func/aggregate.py +0 -0
  125. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/func/array.py +0 -0
  126. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/func/base.py +0 -0
  127. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/func/conditional.py +0 -0
  128. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/func/func.py +0 -0
  129. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/func/numeric.py +0 -0
  130. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/func/path.py +0 -0
  131. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/func/random.py +0 -0
  132. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/func/string.py +0 -0
  133. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/func/window.py +0 -0
  134. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/job.py +0 -0
  135. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/__init__.py +0 -0
  136. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/arrow.py +0 -0
  137. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/clip.py +0 -0
  138. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/convert/__init__.py +0 -0
  139. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/convert/flatten.py +0 -0
  140. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  141. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  142. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/convert/unflatten.py +0 -0
  143. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  144. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/data_model.py +0 -0
  145. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/dataset_info.py +0 -0
  146. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/hf.py +0 -0
  147. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/image.py +0 -0
  148. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/listing.py +0 -0
  149. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/listing_info.py +0 -0
  150. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/meta_formats.py +0 -0
  151. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/model_store.py +0 -0
  152. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/pytorch.py +0 -0
  153. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/settings.py +0 -0
  154. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/signal_schema.py +0 -0
  155. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/tar.py +0 -0
  156. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/text.py +0 -0
  157. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/udf.py +0 -0
  158. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/udf_signature.py +0 -0
  159. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/utils.py +0 -0
  160. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/video.py +0 -0
  161. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/webdataset.py +0 -0
  162. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/lib/webdataset_laion.py +0 -0
  163. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/listing.py +0 -0
  164. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/model/__init__.py +0 -0
  165. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/model/bbox.py +0 -0
  166. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/model/pose.py +0 -0
  167. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/model/segment.py +0 -0
  168. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  169. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  170. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/model/ultralytics/pose.py +0 -0
  171. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/model/ultralytics/segment.py +0 -0
  172. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/node.py +0 -0
  173. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/nodes_fetcher.py +0 -0
  174. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/nodes_thread_pool.py +0 -0
  175. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/progress.py +0 -0
  176. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/py.typed +0 -0
  177. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/query/__init__.py +0 -0
  178. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/query/batch.py +0 -0
  179. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/query/dispatch.py +0 -0
  180. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/query/metrics.py +0 -0
  181. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/query/params.py +0 -0
  182. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/query/queue.py +0 -0
  183. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/query/schema.py +0 -0
  184. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/query/udf.py +0 -0
  185. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/query/utils.py +0 -0
  186. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/remote/__init__.py +0 -0
  187. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/remote/studio.py +0 -0
  188. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/__init__.py +0 -0
  189. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/default/__init__.py +0 -0
  190. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/default/base.py +0 -0
  191. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/functions/__init__.py +0 -0
  192. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/functions/aggregate.py +0 -0
  193. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/functions/array.py +0 -0
  194. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/functions/conditional.py +0 -0
  195. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/functions/numeric.py +0 -0
  196. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/functions/path.py +0 -0
  197. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/functions/random.py +0 -0
  198. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/functions/string.py +0 -0
  199. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/selectable.py +0 -0
  200. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  201. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/sqlite/base.py +0 -0
  202. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/sqlite/types.py +0 -0
  203. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/sqlite/vector.py +0 -0
  204. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/types.py +0 -0
  205. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/sql/utils.py +0 -0
  206. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/studio.py +0 -0
  207. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/telemetry.py +0 -0
  208. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/toolkit/__init__.py +0 -0
  209. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/toolkit/split.py +0 -0
  210. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/torch/__init__.py +0 -0
  211. {datachain-0.9.0 → datachain-0.10.0}/src/datachain/utils.py +0 -0
  212. {datachain-0.9.0 → datachain-0.10.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  213. {datachain-0.9.0 → datachain-0.10.0}/src/datachain.egg-info/entry_points.txt +0 -0
  214. {datachain-0.9.0 → datachain-0.10.0}/src/datachain.egg-info/requires.txt +0 -0
  215. {datachain-0.9.0 → datachain-0.10.0}/src/datachain.egg-info/top_level.txt +0 -0
  216. {datachain-0.9.0 → datachain-0.10.0}/tests/__init__.py +0 -0
  217. {datachain-0.9.0 → datachain-0.10.0}/tests/benchmarks/__init__.py +0 -0
  218. {datachain-0.9.0 → datachain-0.10.0}/tests/benchmarks/conftest.py +0 -0
  219. {datachain-0.9.0 → datachain-0.10.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  220. {datachain-0.9.0 → datachain-0.10.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  221. {datachain-0.9.0 → datachain-0.10.0}/tests/benchmarks/datasets/.gitignore +0 -0
  222. {datachain-0.9.0 → datachain-0.10.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  223. {datachain-0.9.0 → datachain-0.10.0}/tests/benchmarks/test_datachain.py +0 -0
  224. {datachain-0.9.0 → datachain-0.10.0}/tests/benchmarks/test_ls.py +0 -0
  225. {datachain-0.9.0 → datachain-0.10.0}/tests/benchmarks/test_version.py +0 -0
  226. {datachain-0.9.0 → datachain-0.10.0}/tests/data.py +0 -0
  227. {datachain-0.9.0 → datachain-0.10.0}/tests/examples/__init__.py +0 -0
  228. {datachain-0.9.0 → datachain-0.10.0}/tests/examples/test_examples.py +0 -0
  229. {datachain-0.9.0 → datachain-0.10.0}/tests/examples/test_wds_e2e.py +0 -0
  230. {datachain-0.9.0 → datachain-0.10.0}/tests/examples/wds_data.py +0 -0
  231. {datachain-0.9.0 → datachain-0.10.0}/tests/func/__init__.py +0 -0
  232. {datachain-0.9.0 → datachain-0.10.0}/tests/func/fake-service-account-credentials.json +0 -0
  233. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_catalog.py +0 -0
  234. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_client.py +0 -0
  235. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_data_storage.py +0 -0
  236. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_datachain_merge.py +0 -0
  237. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_dataset_query.py +0 -0
  238. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_datasets.py +0 -0
  239. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_feature_pickling.py +0 -0
  240. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_hf.py +0 -0
  241. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_listing.py +0 -0
  242. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_ls.py +0 -0
  243. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_meta_formats.py +0 -0
  244. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_metrics.py +0 -0
  245. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_pytorch.py +0 -0
  246. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_query.py +0 -0
  247. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_session.py +0 -0
  248. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_toolkit.py +0 -0
  249. {datachain-0.9.0 → datachain-0.10.0}/tests/func/test_warehouse.py +0 -0
  250. {datachain-0.9.0 → datachain-0.10.0}/tests/scripts/feature_class.py +0 -0
  251. {datachain-0.9.0 → datachain-0.10.0}/tests/scripts/feature_class_exception.py +0 -0
  252. {datachain-0.9.0 → datachain-0.10.0}/tests/scripts/feature_class_parallel.py +0 -0
  253. {datachain-0.9.0 → datachain-0.10.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  254. {datachain-0.9.0 → datachain-0.10.0}/tests/scripts/name_len_slow.py +0 -0
  255. {datachain-0.9.0 → datachain-0.10.0}/tests/test_atomicity.py +0 -0
  256. {datachain-0.9.0 → datachain-0.10.0}/tests/test_cli_e2e.py +0 -0
  257. {datachain-0.9.0 → datachain-0.10.0}/tests/test_cli_studio.py +0 -0
  258. {datachain-0.9.0 → datachain-0.10.0}/tests/test_query_e2e.py +0 -0
  259. {datachain-0.9.0 → datachain-0.10.0}/tests/test_telemetry.py +0 -0
  260. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/__init__.py +0 -0
  261. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/__init__.py +0 -0
  262. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/conftest.py +0 -0
  263. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  264. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_arrow.py +0 -0
  265. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_clip.py +0 -0
  266. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_datachain.py +0 -0
  267. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  268. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  269. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_diff.py +0 -0
  270. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_feature.py +0 -0
  271. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_feature_utils.py +0 -0
  272. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_file.py +0 -0
  273. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_hf.py +0 -0
  274. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_image.py +0 -0
  275. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_listing_info.py +0 -0
  276. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_models.py +0 -0
  277. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  278. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_schema.py +0 -0
  279. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_signal_schema.py +0 -0
  280. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  281. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_text.py +0 -0
  282. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_udf_signature.py +0 -0
  283. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_utils.py +0 -0
  284. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_video.py +0 -0
  285. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/lib/test_webdataset.py +0 -0
  286. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/sql/__init__.py +0 -0
  287. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  288. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  289. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  290. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/sql/test_array.py +0 -0
  291. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/sql/test_conditional.py +0 -0
  292. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/sql/test_path.py +0 -0
  293. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/sql/test_random.py +0 -0
  294. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/sql/test_selectable.py +0 -0
  295. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/sql/test_string.py +0 -0
  296. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_asyn.py +0 -0
  297. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_cache.py +0 -0
  298. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_catalog.py +0 -0
  299. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_catalog_loader.py +0 -0
  300. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_cli_parsing.py +0 -0
  301. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_client.py +0 -0
  302. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_client_gcs.py +0 -0
  303. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_client_s3.py +0 -0
  304. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_config.py +0 -0
  305. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_data_storage.py +0 -0
  306. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_database_engine.py +0 -0
  307. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_dataset.py +0 -0
  308. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_dispatch.py +0 -0
  309. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_fileslice.py +0 -0
  310. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_func.py +0 -0
  311. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_listing.py +0 -0
  312. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_metastore.py +0 -0
  313. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_module_exports.py +0 -0
  314. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_pytorch.py +0 -0
  315. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_query.py +0 -0
  316. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_query_metrics.py +0 -0
  317. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_query_params.py +0 -0
  318. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_serializer.py +0 -0
  319. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_session.py +0 -0
  320. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_utils.py +0 -0
  321. {datachain-0.9.0 → datachain-0.10.0}/tests/unit/test_warehouse.py +0 -0
  322. {datachain-0.9.0 → datachain-0.10.0}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.9.0
3
+ Version: 0.10.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -175,7 +175,7 @@ high confidence scores.
175
175
 
176
176
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
177
177
  & (Column("meta.inference.class_") == "cat"))
178
- likely_cats.export_files("high-confidence-cats/", signal="file")
178
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
179
179
 
180
180
 
181
181
  Example: LLM based text-file evaluation
@@ -216,7 +216,7 @@ Python code:
216
216
  )
217
217
 
218
218
  successful_chain = chain.filter(Column("is_success") == True)
219
- successful_chain.export_files("./output_mistral")
219
+ successful_chain.to_storage("./output_mistral")
220
220
 
221
221
  print(f"{successful_chain.count()} files were exported")
222
222
 
@@ -68,7 +68,7 @@ high confidence scores.
68
68
 
69
69
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
70
70
  & (Column("meta.inference.class_") == "cat"))
71
- likely_cats.export_files("high-confidence-cats/", signal="file")
71
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
72
72
 
73
73
 
74
74
  Example: LLM based text-file evaluation
@@ -109,7 +109,7 @@ Python code:
109
109
  )
110
110
 
111
111
  successful_chain = chain.filter(Column("is_success") == True)
112
- successful_chain.export_files("./output_mistral")
112
+ successful_chain.to_storage("./output_mistral")
113
113
 
114
114
  print(f"{successful_chain.count()} files were exported")
115
115
 
@@ -47,7 +47,7 @@ annotated = images_id.merge(meta, on="id", right_on="meta.id")
47
47
 
48
48
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
49
49
  & (Column("meta.inference.class_") == "cat"))
50
- likely_cats.export_files("high-confidence-cats/", signal="file")
50
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
51
51
  ```
52
52
 
53
53
  ## Data curation with a local AI model
@@ -85,7 +85,7 @@ chain = (
85
85
  )
86
86
 
87
87
  positive_chain = chain.filter(Column("is_positive") == True)
88
- positive_chain.export_files("./output")
88
+ positive_chain.to_storage("./output")
89
89
 
90
90
  print(f"{positive_chain.count()} files were exported")
91
91
  ```
@@ -390,7 +390,7 @@ class Client(ABC):
390
390
  ) # type: ignore[return-value]
391
391
 
392
392
  def upload(self, data: bytes, path: str) -> "File":
393
- full_path = self.get_full_path(path)
393
+ full_path = path if path.startswith(self.PREFIX) else self.get_full_path(path)
394
394
 
395
395
  parent = posixpath.dirname(full_path)
396
396
  self.fs.makedirs(parent, exist_ok=True)
@@ -411,6 +411,7 @@ class DataChain:
411
411
  object_name: str = "file",
412
412
  update: bool = False,
413
413
  anon: bool = False,
414
+ client_config: Optional[dict] = None,
414
415
  ) -> "Self":
415
416
  """Get data from a storage as a list of file with all file attributes.
416
417
  It returns the chain itself as usual.
@@ -423,15 +424,32 @@ class DataChain:
423
424
  object_name : Created object column name.
424
425
  update : force storage reindexing. Default is False.
425
426
  anon : If True, we will treat cloud bucket as public one
427
+ client_config : Optional client configuration for the storage client.
426
428
 
427
429
  Example:
430
+ Simple call from s3
428
431
  ```py
429
432
  chain = DataChain.from_storage("s3://my-bucket/my-dir")
430
433
  ```
434
+
435
+ With AWS S3-compatible storage
436
+ ```py
437
+ chain = DataChain.from_storage(
438
+ "s3://my-bucket/my-dir",
439
+ client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
440
+ )
441
+ ```
442
+
443
+ Pass existing session
444
+ ```py
445
+ session = Session.get()
446
+ chain = DataChain.from_storage("s3://my-bucket/my-dir", session=session)
447
+ ```
431
448
  """
432
449
  file_type = get_file_type(type)
433
450
 
434
- client_config = {"anon": True} if anon else None
451
+ if anon:
452
+ client_config = (client_config or {}) | {"anon": True}
435
453
  session = Session.get(session, client_config=client_config, in_memory=in_memory)
436
454
  cache = session.catalog.cache
437
455
  client_config = session.catalog.client_config
@@ -481,25 +499,56 @@ class DataChain:
481
499
  version: Optional[int] = None,
482
500
  session: Optional[Session] = None,
483
501
  settings: Optional[dict] = None,
484
- fallback_to_remote: bool = True,
502
+ fallback_to_studio: bool = True,
485
503
  ) -> "Self":
486
504
  """Get data from a saved Dataset. It returns the chain itself.
505
+ If dataset or version is not found locally, it will try to pull it from Studio.
487
506
 
488
507
  Parameters:
489
508
  name : dataset name
490
509
  version : dataset version
510
+ session : Session to use for the chain.
511
+ settings : Settings to use for the chain.
512
+ fallback_to_studio : Try to pull dataset from Studio if not found locally.
513
+ Default is True.
491
514
 
492
515
  Example:
493
516
  ```py
494
517
  chain = DataChain.from_dataset("my_cats")
495
518
  ```
519
+
520
+ ```py
521
+ chain = DataChain.from_dataset("my_cats", fallback_to_studio=False)
522
+ ```
523
+
524
+ ```py
525
+ chain = DataChain.from_dataset("my_cats", version=1)
526
+ ```
527
+
528
+ ```py
529
+ session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
530
+ settings = {
531
+ "cache": True,
532
+ "parallel": 4,
533
+ "workers": 4,
534
+ "min_task_size": 1000,
535
+ "prefetch": 10,
536
+ }
537
+ chain = DataChain.from_dataset(
538
+ name="my_cats",
539
+ version=1,
540
+ session=session,
541
+ settings=settings,
542
+ fallback_to_studio=True,
543
+ )
544
+ ```
496
545
  """
497
546
  query = DatasetQuery(
498
547
  name=name,
499
548
  version=version,
500
549
  session=session,
501
550
  indexing_column_types=File._datachain_column_types,
502
- fallback_to_remote=fallback_to_remote,
551
+ fallback_to_studio=fallback_to_studio,
503
552
  )
504
553
  telemetry.send_event_once("class", "datachain_init", name=name, version=version)
505
554
  if settings:
@@ -2444,7 +2493,7 @@ class DataChain:
2444
2493
  self._setup = self._setup | kwargs
2445
2494
  return self
2446
2495
 
2447
- def export_files(
2496
+ def to_storage(
2448
2497
  self,
2449
2498
  output: str,
2450
2499
  signal: str = "file",
@@ -2462,6 +2511,13 @@ class DataChain:
2462
2511
  use_cache: If `True`, cache the files before exporting.
2463
2512
  link_type: Method to use for exporting files.
2464
2513
  Falls back to `'copy'` if symlinking fails.
2514
+
2515
+ Example:
2516
+ Cross cloud transfer
2517
+ ```py
2518
+ ds = DataChain.from_storage("s3://mybucket")
2519
+ ds.to_storage("gs://mybucket", placement="filename")
2520
+ ```
2465
2521
  """
2466
2522
  if placement == "filename" and (
2467
2523
  self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
@@ -17,6 +17,7 @@ from urllib.parse import unquote, urlparse
17
17
  from urllib.request import url2pathname
18
18
 
19
19
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
20
+ from fsspec.utils import stringify_path
20
21
  from PIL import Image as PilImage
21
22
  from pydantic import Field, field_validator
22
23
 
@@ -214,10 +215,13 @@ class File(DataModel):
214
215
 
215
216
  catalog = get_catalog()
216
217
 
217
- parent, name = posixpath.split(path)
218
+ from datachain.client.fsspec import Client
218
219
 
219
- client = catalog.get_client(parent)
220
- file = client.upload(data, name)
220
+ client_cls = Client.get_implementation(path)
221
+ source, rel_path = client_cls.split_url(path)
222
+
223
+ client = catalog.get_client(client_cls.get_uri(source))
224
+ file = client.upload(data, rel_path)
221
225
  if not isinstance(file, cls):
222
226
  file = cls(**file.model_dump())
223
227
  file._set_stream(catalog)
@@ -267,8 +271,9 @@ class File(DataModel):
267
271
 
268
272
  def save(self, destination: str):
269
273
  """Writes its content to destination"""
270
- with open(destination, mode="wb") as f:
271
- f.write(self.read())
274
+ destination = stringify_path(destination)
275
+ client: Client = self._catalog.get_client(str(destination))
276
+ client.upload(self.read(), str(destination))
272
277
 
273
278
  def _symlink_to(self, destination: str):
274
279
  if self.location:
@@ -282,6 +287,7 @@ class File(DataModel):
282
287
  source = self.get_path()
283
288
  else:
284
289
  raise OSError(errno.EXDEV, "can't link across filesystems")
290
+
285
291
  return os.symlink(source, destination)
286
292
 
287
293
  def export(
@@ -296,7 +302,8 @@ class File(DataModel):
296
302
  self._caching_enabled = use_cache
297
303
  dst = self.get_destination_path(output, placement)
298
304
  dst_dir = os.path.dirname(dst)
299
- os.makedirs(dst_dir, exist_ok=True)
305
+ client: Client = self._catalog.get_client(dst_dir)
306
+ client.fs.makedirs(dst_dir, exist_ok=True)
300
307
 
301
308
  if link_type == "symlink":
302
309
  try:
@@ -493,7 +500,10 @@ class TextFile(File):
493
500
 
494
501
  def save(self, destination: str):
495
502
  """Writes it's content to destination"""
496
- with open(destination, mode="w") as f:
503
+ destination = stringify_path(destination)
504
+
505
+ client: Client = self._catalog.get_client(destination)
506
+ with client.fs.open(destination, mode="w") as f:
497
507
  f.write(self.read_text())
498
508
 
499
509
 
@@ -507,7 +517,11 @@ class ImageFile(File):
507
517
 
508
518
  def save(self, destination: str):
509
519
  """Writes it's content to destination"""
510
- self.read().save(destination)
520
+ destination = stringify_path(destination)
521
+
522
+ client: Client = self._catalog.get_client(destination)
523
+ with client.fs.open(destination, mode="wb") as f:
524
+ self.read().save(f)
511
525
 
512
526
 
513
527
  class Image(DataModel):
@@ -1085,7 +1085,7 @@ class DatasetQuery:
1085
1085
  session: Optional[Session] = None,
1086
1086
  indexing_column_types: Optional[dict[str, Any]] = None,
1087
1087
  in_memory: bool = False,
1088
- fallback_to_remote: bool = True,
1088
+ fallback_to_studio: bool = True,
1089
1089
  ) -> None:
1090
1090
  self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
1091
1091
  self.catalog = catalog or self.session.catalog
@@ -1103,7 +1103,7 @@ class DatasetQuery:
1103
1103
 
1104
1104
  self.name = name
1105
1105
 
1106
- if fallback_to_remote and is_token_set():
1106
+ if fallback_to_studio and is_token_set():
1107
1107
  ds = self.catalog.get_dataset_with_remote_fallback(name, version)
1108
1108
  else:
1109
1109
  ds = self.catalog.get_dataset(name)
@@ -139,21 +139,33 @@ class Session:
139
139
 
140
140
  # Access the active (most recent) context from the stack
141
141
  if cls.SESSION_CONTEXTS:
142
- return cls.SESSION_CONTEXTS[-1]
142
+ session = cls.SESSION_CONTEXTS[-1]
143
143
 
144
- if cls.GLOBAL_SESSION_CTX is None:
144
+ elif cls.GLOBAL_SESSION_CTX is None:
145
145
  cls.GLOBAL_SESSION_CTX = Session(
146
146
  cls.GLOBAL_SESSION_NAME,
147
147
  catalog,
148
148
  client_config=client_config,
149
149
  in_memory=in_memory,
150
150
  )
151
+ session = cls.GLOBAL_SESSION_CTX
151
152
 
152
153
  atexit.register(cls._global_cleanup)
153
154
  cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
154
155
  sys.excepthook = cls.except_hook
156
+ else:
157
+ session = cls.GLOBAL_SESSION_CTX
155
158
 
156
- return cls.GLOBAL_SESSION_CTX
159
+ if client_config and session.catalog.client_config != client_config:
160
+ session = Session(
161
+ "session" + uuid4().hex[:4],
162
+ catalog,
163
+ client_config=client_config,
164
+ in_memory=in_memory,
165
+ )
166
+ session.__enter__()
167
+
168
+ return session
157
169
 
158
170
  @staticmethod
159
171
  def except_hook(exc_type, exc_value, exc_traceback):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.9.0
3
+ Version: 0.10.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -175,7 +175,7 @@ high confidence scores.
175
175
 
176
176
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
177
177
  & (Column("meta.inference.class_") == "cat"))
178
- likely_cats.export_files("high-confidence-cats/", signal="file")
178
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
179
179
 
180
180
 
181
181
  Example: LLM based text-file evaluation
@@ -216,7 +216,7 @@ Python code:
216
216
  )
217
217
 
218
218
  successful_chain = chain.filter(Column("is_success") == True)
219
- successful_chain.export_files("./output_mistral")
219
+ successful_chain.to_storage("./output_mistral")
220
220
 
221
221
  print(f"{successful_chain.count()} files were exported")
222
222
 
@@ -232,6 +232,7 @@ tests/func/__init__.py
232
232
  tests/func/fake-service-account-credentials.json
233
233
  tests/func/test_catalog.py
234
234
  tests/func/test_client.py
235
+ tests/func/test_cloud_transfer.py
235
236
  tests/func/test_data_storage.py
236
237
  tests/func/test_datachain.py
237
238
  tests/func/test_datachain_merge.py
@@ -472,9 +472,9 @@ def cloud_server_credentials(cloud_server, monkeypatch):
472
472
 
473
473
  def get_cloud_test_catalog(cloud_server, tmp_path, metastore, warehouse):
474
474
  cache_dir = tmp_path / ".datachain" / "cache"
475
- cache_dir.mkdir(parents=True)
475
+ cache_dir.mkdir(parents=True, exist_ok=True)
476
476
  tmpfile_dir = tmp_path / ".datachain" / "tmp"
477
- tmpfile_dir.mkdir()
477
+ tmpfile_dir.mkdir(exist_ok=True)
478
478
 
479
479
  catalog = Catalog(
480
480
  metastore=metastore,
@@ -0,0 +1,68 @@
1
+ import pytest
2
+
3
+ from datachain import Session
4
+ from datachain.lib.dc import DataChain
5
+ from tests.conftest import get_cloud_test_catalog, make_cloud_server
6
+
7
+
8
+ def test_cross_cloud_transfer(
9
+ request,
10
+ tmp_upath_factory,
11
+ tree,
12
+ tmp_path,
13
+ metastore,
14
+ warehouse,
15
+ ):
16
+ disabled_remotes = request.config.getoption("--disable-remotes") or []
17
+
18
+ if any(remote in disabled_remotes for remote in ["azure", "gs", "all"]):
19
+ pytest.skip("Skipping all tests for azure, gs or all remotes")
20
+
21
+ azure_path = tmp_upath_factory.mktemp("azure", version_aware=False)
22
+ azure_server = make_cloud_server(azure_path, "azure", tree)
23
+
24
+ gcloud_path = tmp_upath_factory.mktemp("gs", version_aware=False)
25
+ gcloud_server = make_cloud_server(gcloud_path, "gs", tree)
26
+
27
+ # Initialize cloud catalogs
28
+ azure_catalog = get_cloud_test_catalog(azure_server, tmp_path, metastore, warehouse)
29
+ gcloud_catalog = get_cloud_test_catalog(
30
+ gcloud_server, tmp_path, metastore, warehouse
31
+ )
32
+
33
+ # Define test file paths
34
+ test_filename = "image_1.jpg"
35
+ test_content = b"bytes"
36
+
37
+ source_dir = f"{azure_catalog.src_uri}/source-test-images"
38
+ source_file = f"{source_dir}/{test_filename}"
39
+
40
+ dest_dir = f"{gcloud_catalog.src_uri}/destination-test-images"
41
+ dest_file = f"{dest_dir}/{test_filename}"
42
+
43
+ # Get cloud clients
44
+ azure_client = azure_catalog.catalog.get_client(source_file)
45
+ gcloud_client = gcloud_catalog.catalog.get_client(dest_file)
46
+
47
+ try:
48
+ # Create test file in Azure
49
+ with azure_client.fs.open(source_file, "wb") as f:
50
+ f.write(test_content)
51
+
52
+ # Perform cross-cloud transfer
53
+ combined_config = azure_server.client_config | gcloud_server.client_config
54
+ with Session("testSession", client_config=combined_config):
55
+ datachain = DataChain.from_storage(source_dir)
56
+ datachain.to_storage(dest_dir, placement="filename")
57
+
58
+ # Verify transfer
59
+ with gcloud_client.fs.open(dest_file, "rb") as f:
60
+ assert f.read() == test_content
61
+
62
+ finally:
63
+ # Cleanup
64
+ try:
65
+ azure_client.fs.rm(source_dir, recursive=True)
66
+ gcloud_client.fs.rm(dest_dir, recursive=True)
67
+ except FileNotFoundError:
68
+ pass
@@ -64,6 +64,16 @@ def test_catalog_anon(tmp_dir, catalog, anon):
64
64
  assert chain.session.catalog.client_config.get("anon", False) is anon
65
65
 
66
66
 
67
+ def test_from_storage_client_config(tmp_dir, catalog):
68
+ dc = DataChain.from_storage(tmp_dir.as_uri())
69
+ assert dc.session.catalog.client_config == {} # Default client config is set.
70
+
71
+ dc = DataChain.from_storage(tmp_dir.as_uri(), client_config={"anon": True})
72
+ assert dc.session.catalog.client_config == {
73
+ "anon": True
74
+ } # New client config is set.
75
+
76
+
67
77
  def test_from_storage(cloud_test_catalog):
68
78
  ctc = cloud_test_catalog
69
79
  dc = DataChain.from_storage(ctc.src_uri, session=ctc.session)
@@ -292,20 +302,20 @@ def test_read_file(cloud_test_catalog, use_cache):
292
302
  @pytest.mark.parametrize("use_cache", [True, False])
293
303
  @pytest.mark.parametrize("file_type", ["", "binary", "text"])
294
304
  @pytest.mark.parametrize("cloud_type", ["file"], indirect=True)
295
- def test_export_files(
305
+ def test_to_storage(
296
306
  tmp_dir, cloud_test_catalog, test_session, placement, use_map, use_cache, file_type
297
307
  ):
298
308
  ctc = cloud_test_catalog
299
309
  df = DataChain.from_storage(ctc.src_uri, type=file_type, session=test_session)
300
310
  if use_map:
301
- df.export_files(tmp_dir / "output", placement=placement, use_cache=use_cache)
311
+ df.to_storage(tmp_dir / "output", placement=placement, use_cache=use_cache)
302
312
  df.map(
303
313
  res=lambda file: file.export(
304
314
  tmp_dir / "output", placement=placement, use_cache=use_cache
305
315
  )
306
316
  ).exec()
307
317
  else:
308
- df.export_files(tmp_dir / "output", placement=placement)
318
+ df.to_storage(tmp_dir / "output", placement=placement)
309
319
 
310
320
  expected = {
311
321
  "description": "Cats and Dogs",
@@ -341,14 +351,14 @@ def test_export_images_files(test_session, tmp_dir, tmp_path, use_cache):
341
351
  ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images
342
352
  ],
343
353
  session=test_session,
344
- ).export_files(tmp_dir / "output", placement="filename", use_cache=use_cache)
354
+ ).to_storage(tmp_dir / "output", placement="filename", use_cache=use_cache)
345
355
 
346
356
  for img in images:
347
357
  exported_img = Image.open(tmp_dir / "output" / img["name"])
348
358
  assert images_equal(img["data"], exported_img)
349
359
 
350
360
 
351
- def test_export_files_filename_placement_not_unique_files(tmp_dir, test_session):
361
+ def test_to_storage_files_filename_placement_not_unique_files(tmp_dir, test_session):
352
362
  data = b"some\x00data\x00is\x48\x65\x6c\x57\x6f\x72\x6c\x64\xff\xffheRe"
353
363
  bucket_name = "mybucket"
354
364
  files = ["dir1/a.json", "dir1/dir2/a.json"]
@@ -364,7 +374,7 @@ def test_export_files_filename_placement_not_unique_files(tmp_dir, test_session)
364
374
 
365
375
  df = DataChain.from_storage((tmp_dir / bucket_name).as_uri(), session=test_session)
366
376
  with pytest.raises(ValueError):
367
- df.export_files(tmp_dir / "output", placement="filename")
377
+ df.to_storage(tmp_dir / "output", placement="filename")
368
378
 
369
379
 
370
380
  def test_show(capsys, test_session):
@@ -50,16 +50,18 @@ def test_upload(cloud_test_catalog):
50
50
 
51
51
  src_uri = ctc.src_uri
52
52
  filename = "image_1.jpg"
53
- source = f"{src_uri}/upload-test-images"
53
+ dest = f"{src_uri}/upload-test-images"
54
54
  catalog = ctc.catalog
55
55
 
56
56
  img_bytes = b"bytes"
57
57
 
58
- f = File.upload(img_bytes, f"{source}/{filename}", catalog)
58
+ f = File.upload(img_bytes, f"{dest}/{filename}", catalog)
59
59
 
60
- assert f.path == filename
61
- assert f.source == source
60
+ client = catalog.get_client(src_uri)
61
+ source, rel_path = client.split_url(f"{dest}/{filename}")
62
+
63
+ assert f.path == rel_path
64
+ assert f.source == client.get_uri(source)
62
65
  assert f.read() == img_bytes
63
66
 
64
- client = catalog.get_client(src_uri)
65
- client.fs.rm(source, recursive=True)
67
+ client.fs.rm(dest, recursive=True)
@@ -295,7 +295,7 @@ def test_datachain_from_dataset_pull(
295
295
  ds = DataChain.from_dataset(
296
296
  name="dogs",
297
297
  version=1,
298
- fallback_to_remote=True,
298
+ fallback_to_studio=True,
299
299
  )
300
300
 
301
301
  assert ds.dataset.name == "dogs"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes