datachain 0.9.1__tar.gz → 0.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (324)
  1. {datachain-0.9.1 → datachain-0.11.0}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.9.1 → datachain-0.11.0}/PKG-INFO +5 -4
  3. {datachain-0.9.1 → datachain-0.11.0}/README.rst +2 -2
  4. {datachain-0.9.1 → datachain-0.11.0}/docs/quick-start.md +2 -2
  5. {datachain-0.9.1 → datachain-0.11.0}/pyproject.toml +3 -2
  6. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/fsspec.py +1 -1
  7. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/dc.py +60 -4
  8. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/file.py +20 -5
  9. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/dataset.py +2 -2
  10. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/session.py +15 -3
  11. datachain-0.11.0/src/datachain/script_meta.py +147 -0
  12. {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/PKG-INFO +5 -4
  13. {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/SOURCES.txt +3 -0
  14. {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/requires.txt +4 -1
  15. {datachain-0.9.1 → datachain-0.11.0}/tests/conftest.py +2 -2
  16. datachain-0.11.0/tests/func/test_cloud_transfer.py +68 -0
  17. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_datachain.py +37 -6
  18. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_pull.py +1 -1
  19. datachain-0.11.0/tests/unit/test_script_meta.py +119 -0
  20. {datachain-0.9.1 → datachain-0.11.0}/.cruft.json +0 -0
  21. {datachain-0.9.1 → datachain-0.11.0}/.gitattributes +0 -0
  22. {datachain-0.9.1 → datachain-0.11.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  23. {datachain-0.9.1 → datachain-0.11.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  24. {datachain-0.9.1 → datachain-0.11.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  25. {datachain-0.9.1 → datachain-0.11.0}/.github/codecov.yaml +0 -0
  26. {datachain-0.9.1 → datachain-0.11.0}/.github/dependabot.yml +0 -0
  27. {datachain-0.9.1 → datachain-0.11.0}/.github/workflows/benchmarks.yml +0 -0
  28. {datachain-0.9.1 → datachain-0.11.0}/.github/workflows/release.yml +0 -0
  29. {datachain-0.9.1 → datachain-0.11.0}/.github/workflows/tests-studio.yml +0 -0
  30. {datachain-0.9.1 → datachain-0.11.0}/.github/workflows/tests.yml +0 -0
  31. {datachain-0.9.1 → datachain-0.11.0}/.github/workflows/update-template.yaml +0 -0
  32. {datachain-0.9.1 → datachain-0.11.0}/.gitignore +0 -0
  33. {datachain-0.9.1 → datachain-0.11.0}/CODE_OF_CONDUCT.rst +0 -0
  34. {datachain-0.9.1 → datachain-0.11.0}/LICENSE +0 -0
  35. {datachain-0.9.1 → datachain-0.11.0}/docs/assets/captioned_cartoons.png +0 -0
  36. {datachain-0.9.1 → datachain-0.11.0}/docs/assets/datachain-white.svg +0 -0
  37. {datachain-0.9.1 → datachain-0.11.0}/docs/assets/datachain.svg +0 -0
  38. {datachain-0.9.1 → datachain-0.11.0}/docs/contributing.md +0 -0
  39. {datachain-0.9.1 → datachain-0.11.0}/docs/css/github-permalink-style.css +0 -0
  40. {datachain-0.9.1 → datachain-0.11.0}/docs/examples.md +0 -0
  41. {datachain-0.9.1 → datachain-0.11.0}/docs/index.md +0 -0
  42. {datachain-0.9.1 → datachain-0.11.0}/docs/overrides/main.html +0 -0
  43. {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/arrowrow.md +0 -0
  44. {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/bbox.md +0 -0
  45. {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/file.md +0 -0
  46. {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/imagefile.md +0 -0
  47. {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/index.md +0 -0
  48. {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/pose.md +0 -0
  49. {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/segment.md +0 -0
  50. {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/tarvfile.md +0 -0
  51. {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/textfile.md +0 -0
  52. {datachain-0.9.1 → datachain-0.11.0}/docs/references/data-types/videofile.md +0 -0
  53. {datachain-0.9.1 → datachain-0.11.0}/docs/references/datachain.md +0 -0
  54. {datachain-0.9.1 → datachain-0.11.0}/docs/references/func.md +0 -0
  55. {datachain-0.9.1 → datachain-0.11.0}/docs/references/index.md +0 -0
  56. {datachain-0.9.1 → datachain-0.11.0}/docs/references/toolkit.md +0 -0
  57. {datachain-0.9.1 → datachain-0.11.0}/docs/references/torch.md +0 -0
  58. {datachain-0.9.1 → datachain-0.11.0}/docs/references/udf.md +0 -0
  59. {datachain-0.9.1 → datachain-0.11.0}/docs/tutorials.md +0 -0
  60. {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  61. {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  62. {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/openimage-detect.py +0 -0
  63. {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  64. {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  65. {datachain-0.9.1 → datachain-0.11.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  66. {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/common_sql_functions.py +0 -0
  67. {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/json-csv-reader.py +0 -0
  68. {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/torch-loader.py +0 -0
  69. {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/udfs/parallel.py +0 -0
  70. {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/udfs/simple.py +0 -0
  71. {datachain-0.9.1 → datachain-0.11.0}/examples/get_started/udfs/stateful.py +0 -0
  72. {datachain-0.9.1 → datachain-0.11.0}/examples/llm_and_nlp/claude-query.py +0 -0
  73. {datachain-0.9.1 → datachain-0.11.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  74. {datachain-0.9.1 → datachain-0.11.0}/examples/multimodal/clip_inference.py +0 -0
  75. {datachain-0.9.1 → datachain-0.11.0}/examples/multimodal/hf_pipeline.py +0 -0
  76. {datachain-0.9.1 → datachain-0.11.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  77. {datachain-0.9.1 → datachain-0.11.0}/examples/multimodal/wds.py +0 -0
  78. {datachain-0.9.1 → datachain-0.11.0}/examples/multimodal/wds_filtered.py +0 -0
  79. {datachain-0.9.1 → datachain-0.11.0}/mkdocs.yml +0 -0
  80. {datachain-0.9.1 → datachain-0.11.0}/noxfile.py +0 -0
  81. {datachain-0.9.1 → datachain-0.11.0}/setup.cfg +0 -0
  82. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/__init__.py +0 -0
  83. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/__main__.py +0 -0
  84. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/asyn.py +0 -0
  85. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cache.py +0 -0
  86. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/catalog/__init__.py +0 -0
  87. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/catalog/catalog.py +0 -0
  88. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/catalog/datasource.py +0 -0
  89. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/catalog/loader.py +0 -0
  90. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/__init__.py +0 -0
  91. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/__init__.py +0 -0
  92. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/datasets.py +0 -0
  93. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/du.py +0 -0
  94. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/index.py +0 -0
  95. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/ls.py +0 -0
  96. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/misc.py +0 -0
  97. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/query.py +0 -0
  98. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/commands/show.py +0 -0
  99. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/parser/__init__.py +0 -0
  100. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/parser/job.py +0 -0
  101. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/parser/studio.py +0 -0
  102. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/parser/utils.py +0 -0
  103. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/cli/utils.py +0 -0
  104. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/__init__.py +0 -0
  105. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/azure.py +0 -0
  106. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/fileslice.py +0 -0
  107. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/gcs.py +0 -0
  108. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/hf.py +0 -0
  109. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/local.py +0 -0
  110. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/client/s3.py +0 -0
  111. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/config.py +0 -0
  112. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/__init__.py +0 -0
  113. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/db_engine.py +0 -0
  114. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/job.py +0 -0
  115. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/metastore.py +0 -0
  116. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/schema.py +0 -0
  117. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/serializer.py +0 -0
  118. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/sqlite.py +0 -0
  119. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/data_storage/warehouse.py +0 -0
  120. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/dataset.py +0 -0
  121. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/diff/__init__.py +0 -0
  122. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/error.py +0 -0
  123. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/fs/__init__.py +0 -0
  124. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/fs/reference.py +0 -0
  125. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/__init__.py +0 -0
  126. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/aggregate.py +0 -0
  127. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/array.py +0 -0
  128. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/base.py +0 -0
  129. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/conditional.py +0 -0
  130. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/func.py +0 -0
  131. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/numeric.py +0 -0
  132. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/path.py +0 -0
  133. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/random.py +0 -0
  134. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/string.py +0 -0
  135. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/func/window.py +0 -0
  136. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/job.py +0 -0
  137. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/__init__.py +0 -0
  138. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/arrow.py +0 -0
  139. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/clip.py +0 -0
  140. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/__init__.py +0 -0
  141. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/flatten.py +0 -0
  142. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  143. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  144. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/unflatten.py +0 -0
  145. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  146. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/data_model.py +0 -0
  147. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/dataset_info.py +0 -0
  148. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/hf.py +0 -0
  149. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/image.py +0 -0
  150. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/listing.py +0 -0
  151. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/listing_info.py +0 -0
  152. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/meta_formats.py +0 -0
  153. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/model_store.py +0 -0
  154. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/pytorch.py +0 -0
  155. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/settings.py +0 -0
  156. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/signal_schema.py +0 -0
  157. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/tar.py +0 -0
  158. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/text.py +0 -0
  159. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/udf.py +0 -0
  160. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/udf_signature.py +0 -0
  161. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/utils.py +0 -0
  162. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/video.py +0 -0
  163. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/webdataset.py +0 -0
  164. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/lib/webdataset_laion.py +0 -0
  165. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/listing.py +0 -0
  166. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/__init__.py +0 -0
  167. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/bbox.py +0 -0
  168. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/pose.py +0 -0
  169. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/segment.py +0 -0
  170. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  171. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  172. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/ultralytics/pose.py +0 -0
  173. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/model/ultralytics/segment.py +0 -0
  174. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/node.py +0 -0
  175. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/nodes_fetcher.py +0 -0
  176. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/nodes_thread_pool.py +0 -0
  177. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/progress.py +0 -0
  178. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/py.typed +0 -0
  179. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/__init__.py +0 -0
  180. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/batch.py +0 -0
  181. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/dispatch.py +0 -0
  182. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/metrics.py +0 -0
  183. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/params.py +0 -0
  184. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/queue.py +0 -0
  185. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/schema.py +0 -0
  186. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/udf.py +0 -0
  187. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/query/utils.py +0 -0
  188. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/remote/__init__.py +0 -0
  189. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/remote/studio.py +0 -0
  190. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/__init__.py +0 -0
  191. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/default/__init__.py +0 -0
  192. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/default/base.py +0 -0
  193. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/__init__.py +0 -0
  194. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/aggregate.py +0 -0
  195. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/array.py +0 -0
  196. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/conditional.py +0 -0
  197. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/numeric.py +0 -0
  198. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/path.py +0 -0
  199. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/random.py +0 -0
  200. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/functions/string.py +0 -0
  201. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/selectable.py +0 -0
  202. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  203. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/sqlite/base.py +0 -0
  204. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/sqlite/types.py +0 -0
  205. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/sqlite/vector.py +0 -0
  206. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/types.py +0 -0
  207. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/sql/utils.py +0 -0
  208. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/studio.py +0 -0
  209. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/telemetry.py +0 -0
  210. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/toolkit/__init__.py +0 -0
  211. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/toolkit/split.py +0 -0
  212. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/torch/__init__.py +0 -0
  213. {datachain-0.9.1 → datachain-0.11.0}/src/datachain/utils.py +0 -0
  214. {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  215. {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/entry_points.txt +0 -0
  216. {datachain-0.9.1 → datachain-0.11.0}/src/datachain.egg-info/top_level.txt +0 -0
  217. {datachain-0.9.1 → datachain-0.11.0}/tests/__init__.py +0 -0
  218. {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/__init__.py +0 -0
  219. {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/conftest.py +0 -0
  220. {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  221. {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  222. {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/datasets/.gitignore +0 -0
  223. {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  224. {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/test_datachain.py +0 -0
  225. {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/test_ls.py +0 -0
  226. {datachain-0.9.1 → datachain-0.11.0}/tests/benchmarks/test_version.py +0 -0
  227. {datachain-0.9.1 → datachain-0.11.0}/tests/data.py +0 -0
  228. {datachain-0.9.1 → datachain-0.11.0}/tests/examples/__init__.py +0 -0
  229. {datachain-0.9.1 → datachain-0.11.0}/tests/examples/test_examples.py +0 -0
  230. {datachain-0.9.1 → datachain-0.11.0}/tests/examples/test_wds_e2e.py +0 -0
  231. {datachain-0.9.1 → datachain-0.11.0}/tests/examples/wds_data.py +0 -0
  232. {datachain-0.9.1 → datachain-0.11.0}/tests/func/__init__.py +0 -0
  233. {datachain-0.9.1 → datachain-0.11.0}/tests/func/fake-service-account-credentials.json +0 -0
  234. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_catalog.py +0 -0
  235. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_client.py +0 -0
  236. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_data_storage.py +0 -0
  237. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_datachain_merge.py +0 -0
  238. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_dataset_query.py +0 -0
  239. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_datasets.py +0 -0
  240. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_feature_pickling.py +0 -0
  241. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_file.py +0 -0
  242. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_hf.py +0 -0
  243. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_listing.py +0 -0
  244. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_ls.py +0 -0
  245. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_meta_formats.py +0 -0
  246. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_metrics.py +0 -0
  247. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_pytorch.py +0 -0
  248. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_query.py +0 -0
  249. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_session.py +0 -0
  250. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_toolkit.py +0 -0
  251. {datachain-0.9.1 → datachain-0.11.0}/tests/func/test_warehouse.py +0 -0
  252. {datachain-0.9.1 → datachain-0.11.0}/tests/scripts/feature_class.py +0 -0
  253. {datachain-0.9.1 → datachain-0.11.0}/tests/scripts/feature_class_exception.py +0 -0
  254. {datachain-0.9.1 → datachain-0.11.0}/tests/scripts/feature_class_parallel.py +0 -0
  255. {datachain-0.9.1 → datachain-0.11.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  256. {datachain-0.9.1 → datachain-0.11.0}/tests/scripts/name_len_slow.py +0 -0
  257. {datachain-0.9.1 → datachain-0.11.0}/tests/test_atomicity.py +0 -0
  258. {datachain-0.9.1 → datachain-0.11.0}/tests/test_cli_e2e.py +0 -0
  259. {datachain-0.9.1 → datachain-0.11.0}/tests/test_cli_studio.py +0 -0
  260. {datachain-0.9.1 → datachain-0.11.0}/tests/test_query_e2e.py +0 -0
  261. {datachain-0.9.1 → datachain-0.11.0}/tests/test_telemetry.py +0 -0
  262. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/__init__.py +0 -0
  263. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/__init__.py +0 -0
  264. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/conftest.py +0 -0
  265. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  266. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_arrow.py +0 -0
  267. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_clip.py +0 -0
  268. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_datachain.py +0 -0
  269. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  270. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  271. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_diff.py +0 -0
  272. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_feature.py +0 -0
  273. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_feature_utils.py +0 -0
  274. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_file.py +0 -0
  275. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_hf.py +0 -0
  276. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_image.py +0 -0
  277. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_listing_info.py +0 -0
  278. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_models.py +0 -0
  279. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  280. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_schema.py +0 -0
  281. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_signal_schema.py +0 -0
  282. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  283. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_text.py +0 -0
  284. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_udf_signature.py +0 -0
  285. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_utils.py +0 -0
  286. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_video.py +0 -0
  287. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/lib/test_webdataset.py +0 -0
  288. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/__init__.py +0 -0
  289. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  290. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  291. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  292. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_array.py +0 -0
  293. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_conditional.py +0 -0
  294. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_path.py +0 -0
  295. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_random.py +0 -0
  296. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_selectable.py +0 -0
  297. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/sql/test_string.py +0 -0
  298. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_asyn.py +0 -0
  299. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_cache.py +0 -0
  300. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_catalog.py +0 -0
  301. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_catalog_loader.py +0 -0
  302. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_cli_parsing.py +0 -0
  303. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_client.py +0 -0
  304. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_client_gcs.py +0 -0
  305. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_client_s3.py +0 -0
  306. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_config.py +0 -0
  307. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_data_storage.py +0 -0
  308. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_database_engine.py +0 -0
  309. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_dataset.py +0 -0
  310. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_dispatch.py +0 -0
  311. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_fileslice.py +0 -0
  312. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_func.py +0 -0
  313. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_listing.py +0 -0
  314. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_metastore.py +0 -0
  315. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_module_exports.py +0 -0
  316. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_pytorch.py +0 -0
  317. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_query.py +0 -0
  318. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_query_metrics.py +0 -0
  319. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_query_params.py +0 -0
  320. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_serializer.py +0 -0
  321. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_session.py +0 -0
  322. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_utils.py +0 -0
  323. {datachain-0.9.1 → datachain-0.11.0}/tests/unit/test_warehouse.py +0 -0
  324. {datachain-0.9.1 → datachain-0.11.0}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.9.6'
27
+ rev: 'v0.9.7'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.9.1
3
+ Version: 0.11.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -49,6 +49,7 @@ Requires-Dist: platformdirs
49
49
  Requires-Dist: dvc-studio-client<1,>=0.21
50
50
  Requires-Dist: tabulate
51
51
  Requires-Dist: websockets
52
+ Requires-Dist: tomli; python_version < "3.11"
52
53
  Provides-Extra: docs
53
54
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
54
55
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -102,7 +103,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
102
103
  Requires-Dist: defusedxml; extra == "examples"
103
104
  Requires-Dist: accelerate; extra == "examples"
104
105
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
105
- Requires-Dist: ultralytics==8.3.74; extra == "examples"
106
+ Requires-Dist: ultralytics==8.3.78; extra == "examples"
106
107
  Requires-Dist: open_clip_torch; extra == "examples"
107
108
 
108
109
  ================
@@ -175,7 +176,7 @@ high confidence scores.
175
176
 
176
177
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
177
178
  & (Column("meta.inference.class_") == "cat"))
178
- likely_cats.export_files("high-confidence-cats/", signal="file")
179
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
179
180
 
180
181
 
181
182
  Example: LLM based text-file evaluation
@@ -216,7 +217,7 @@ Python code:
216
217
  )
217
218
 
218
219
  successful_chain = chain.filter(Column("is_success") == True)
219
- successful_chain.export_files("./output_mistral")
220
+ successful_chain.to_storage("./output_mistral")
220
221
 
221
222
  print(f"{successful_chain.count()} files were exported")
222
223
 
@@ -68,7 +68,7 @@ high confidence scores.
68
68
 
69
69
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
70
70
  & (Column("meta.inference.class_") == "cat"))
71
- likely_cats.export_files("high-confidence-cats/", signal="file")
71
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
72
72
 
73
73
 
74
74
  Example: LLM based text-file evaluation
@@ -109,7 +109,7 @@ Python code:
109
109
  )
110
110
 
111
111
  successful_chain = chain.filter(Column("is_success") == True)
112
- successful_chain.export_files("./output_mistral")
112
+ successful_chain.to_storage("./output_mistral")
113
113
 
114
114
  print(f"{successful_chain.count()} files were exported")
115
115
 
@@ -47,7 +47,7 @@ annotated = images_id.merge(meta, on="id", right_on="meta.id")
47
47
 
48
48
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
49
49
  & (Column("meta.inference.class_") == "cat"))
50
- likely_cats.export_files("high-confidence-cats/", signal="file")
50
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
51
51
  ```
52
52
 
53
53
  ## Data curation with a local AI model
@@ -85,7 +85,7 @@ chain = (
85
85
  )
86
86
 
87
87
  positive_chain = chain.filter(Column("is_positive") == True)
88
- positive_chain.export_files("./output")
88
+ positive_chain.to_storage("./output")
89
89
 
90
90
  print(f"{positive_chain.count()} files were exported")
91
91
  ```
@@ -51,7 +51,8 @@ dependencies = [
51
51
  "platformdirs",
52
52
  "dvc-studio-client>=0.21,<1",
53
53
  "tabulate",
54
- "websockets"
54
+ "websockets",
55
+ "tomli;python_version<'3.11'"
55
56
  ]
56
57
 
57
58
  [project.optional-dependencies]
@@ -118,7 +119,7 @@ examples = [
118
119
  "defusedxml",
119
120
  "accelerate",
120
121
  "huggingface_hub[hf_transfer]",
121
- "ultralytics==8.3.74",
122
+ "ultralytics==8.3.78",
122
123
  "open_clip_torch"
123
124
  ]
124
125
 
@@ -390,7 +390,7 @@ class Client(ABC):
390
390
  ) # type: ignore[return-value]
391
391
 
392
392
  def upload(self, data: bytes, path: str) -> "File":
393
- full_path = self.get_full_path(path)
393
+ full_path = path if path.startswith(self.PREFIX) else self.get_full_path(path)
394
394
 
395
395
  parent = posixpath.dirname(full_path)
396
396
  self.fs.makedirs(parent, exist_ok=True)
@@ -411,6 +411,7 @@ class DataChain:
411
411
  object_name: str = "file",
412
412
  update: bool = False,
413
413
  anon: bool = False,
414
+ client_config: Optional[dict] = None,
414
415
  ) -> "Self":
415
416
  """Get data from a storage as a list of file with all file attributes.
416
417
  It returns the chain itself as usual.
@@ -423,15 +424,32 @@ class DataChain:
423
424
  object_name : Created object column name.
424
425
  update : force storage reindexing. Default is False.
425
426
  anon : If True, we will treat cloud bucket as public one
427
+ client_config : Optional client configuration for the storage client.
426
428
 
427
429
  Example:
430
+ Simple call from s3
428
431
  ```py
429
432
  chain = DataChain.from_storage("s3://my-bucket/my-dir")
430
433
  ```
434
+
435
+ With AWS S3-compatible storage
436
+ ```py
437
+ chain = DataChain.from_storage(
438
+ "s3://my-bucket/my-dir",
439
+ client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
440
+ )
441
+ ```
442
+
443
+ Pass existing session
444
+ ```py
445
+ session = Session.get()
446
+ chain = DataChain.from_storage("s3://my-bucket/my-dir", session=session)
447
+ ```
431
448
  """
432
449
  file_type = get_file_type(type)
433
450
 
434
- client_config = {"anon": True} if anon else None
451
+ if anon:
452
+ client_config = (client_config or {}) | {"anon": True}
435
453
  session = Session.get(session, client_config=client_config, in_memory=in_memory)
436
454
  cache = session.catalog.cache
437
455
  client_config = session.catalog.client_config
@@ -481,25 +499,56 @@ class DataChain:
481
499
  version: Optional[int] = None,
482
500
  session: Optional[Session] = None,
483
501
  settings: Optional[dict] = None,
484
- fallback_to_remote: bool = True,
502
+ fallback_to_studio: bool = True,
485
503
  ) -> "Self":
486
504
  """Get data from a saved Dataset. It returns the chain itself.
505
+ If dataset or version is not found locally, it will try to pull it from Studio.
487
506
 
488
507
  Parameters:
489
508
  name : dataset name
490
509
  version : dataset version
510
+ session : Session to use for the chain.
511
+ settings : Settings to use for the chain.
512
+ fallback_to_studio : Try to pull dataset from Studio if not found locally.
513
+ Default is True.
491
514
 
492
515
  Example:
493
516
  ```py
494
517
  chain = DataChain.from_dataset("my_cats")
495
518
  ```
519
+
520
+ ```py
521
+ chain = DataChain.from_dataset("my_cats", fallback_to_studio=False)
522
+ ```
523
+
524
+ ```py
525
+ chain = DataChain.from_dataset("my_cats", version=1)
526
+ ```
527
+
528
+ ```py
529
+ session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
530
+ settings = {
531
+ "cache": True,
532
+ "parallel": 4,
533
+ "workers": 4,
534
+ "min_task_size": 1000,
535
+ "prefetch": 10,
536
+ }
537
+ chain = DataChain.from_dataset(
538
+ name="my_cats",
539
+ version=1,
540
+ session=session,
541
+ settings=settings,
542
+ fallback_to_studio=True,
543
+ )
544
+ ```
496
545
  """
497
546
  query = DatasetQuery(
498
547
  name=name,
499
548
  version=version,
500
549
  session=session,
501
550
  indexing_column_types=File._datachain_column_types,
502
- fallback_to_remote=fallback_to_remote,
551
+ fallback_to_studio=fallback_to_studio,
503
552
  )
504
553
  telemetry.send_event_once("class", "datachain_init", name=name, version=version)
505
554
  if settings:
@@ -2444,7 +2493,7 @@ class DataChain:
2444
2493
  self._setup = self._setup | kwargs
2445
2494
  return self
2446
2495
 
2447
- def export_files(
2496
+ def to_storage(
2448
2497
  self,
2449
2498
  output: str,
2450
2499
  signal: str = "file",
@@ -2462,6 +2511,13 @@ class DataChain:
2462
2511
  use_cache: If `True`, cache the files before exporting.
2463
2512
  link_type: Method to use for exporting files.
2464
2513
  Falls back to `'copy'` if symlinking fails.
2514
+
2515
+ Example:
2516
+ Cross cloud transfer
2517
+ ```py
2518
+ ds = DataChain.from_storage("s3://mybucket")
2519
+ ds.to_storage("gs://mybucket", placement="filename")
2520
+ ```
2465
2521
  """
2466
2522
  if placement == "filename" and (
2467
2523
  self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
@@ -17,6 +17,7 @@ from urllib.parse import unquote, urlparse
17
17
  from urllib.request import url2pathname
18
18
 
19
19
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
20
+ from fsspec.utils import stringify_path
20
21
  from PIL import Image as PilImage
21
22
  from pydantic import Field, field_validator
22
23
 
@@ -270,8 +271,13 @@ class File(DataModel):
270
271
 
271
272
  def save(self, destination: str):
272
273
  """Writes it's content to destination"""
273
- with open(destination, mode="wb") as f:
274
- f.write(self.read())
274
+ destination = stringify_path(destination)
275
+ client: Client = self._catalog.get_client(destination)
276
+
277
+ if client.PREFIX == "file://" and not destination.startswith(client.PREFIX):
278
+ destination = Path(destination).absolute().as_uri()
279
+
280
+ client.upload(self.read(), destination)
275
281
 
276
282
  def _symlink_to(self, destination: str):
277
283
  if self.location:
@@ -285,6 +291,7 @@ class File(DataModel):
285
291
  source = self.get_path()
286
292
  else:
287
293
  raise OSError(errno.EXDEV, "can't link across filesystems")
294
+
288
295
  return os.symlink(source, destination)
289
296
 
290
297
  def export(
@@ -299,7 +306,8 @@ class File(DataModel):
299
306
  self._caching_enabled = use_cache
300
307
  dst = self.get_destination_path(output, placement)
301
308
  dst_dir = os.path.dirname(dst)
302
- os.makedirs(dst_dir, exist_ok=True)
309
+ client: Client = self._catalog.get_client(dst_dir)
310
+ client.fs.makedirs(dst_dir, exist_ok=True)
303
311
 
304
312
  if link_type == "symlink":
305
313
  try:
@@ -496,7 +504,10 @@ class TextFile(File):
496
504
 
497
505
  def save(self, destination: str):
498
506
  """Writes it's content to destination"""
499
- with open(destination, mode="w") as f:
507
+ destination = stringify_path(destination)
508
+
509
+ client: Client = self._catalog.get_client(destination)
510
+ with client.fs.open(destination, mode="w") as f:
500
511
  f.write(self.read_text())
501
512
 
502
513
 
@@ -510,7 +521,11 @@ class ImageFile(File):
510
521
 
511
522
  def save(self, destination: str):
512
523
  """Writes it's content to destination"""
513
- self.read().save(destination)
524
+ destination = stringify_path(destination)
525
+
526
+ client: Client = self._catalog.get_client(destination)
527
+ with client.fs.open(destination, mode="wb") as f:
528
+ self.read().save(f)
514
529
 
515
530
 
516
531
  class Image(DataModel):
@@ -1085,7 +1085,7 @@ class DatasetQuery:
1085
1085
  session: Optional[Session] = None,
1086
1086
  indexing_column_types: Optional[dict[str, Any]] = None,
1087
1087
  in_memory: bool = False,
1088
- fallback_to_remote: bool = True,
1088
+ fallback_to_studio: bool = True,
1089
1089
  ) -> None:
1090
1090
  self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
1091
1091
  self.catalog = catalog or self.session.catalog
@@ -1103,7 +1103,7 @@ class DatasetQuery:
1103
1103
 
1104
1104
  self.name = name
1105
1105
 
1106
- if fallback_to_remote and is_token_set():
1106
+ if fallback_to_studio and is_token_set():
1107
1107
  ds = self.catalog.get_dataset_with_remote_fallback(name, version)
1108
1108
  else:
1109
1109
  ds = self.catalog.get_dataset(name)
@@ -139,21 +139,33 @@ class Session:
139
139
 
140
140
  # Access the active (most recent) context from the stack
141
141
  if cls.SESSION_CONTEXTS:
142
- return cls.SESSION_CONTEXTS[-1]
142
+ session = cls.SESSION_CONTEXTS[-1]
143
143
 
144
- if cls.GLOBAL_SESSION_CTX is None:
144
+ elif cls.GLOBAL_SESSION_CTX is None:
145
145
  cls.GLOBAL_SESSION_CTX = Session(
146
146
  cls.GLOBAL_SESSION_NAME,
147
147
  catalog,
148
148
  client_config=client_config,
149
149
  in_memory=in_memory,
150
150
  )
151
+ session = cls.GLOBAL_SESSION_CTX
151
152
 
152
153
  atexit.register(cls._global_cleanup)
153
154
  cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
154
155
  sys.excepthook = cls.except_hook
156
+ else:
157
+ session = cls.GLOBAL_SESSION_CTX
155
158
 
156
- return cls.GLOBAL_SESSION_CTX
159
+ if client_config and session.catalog.client_config != client_config:
160
+ session = Session(
161
+ "session" + uuid4().hex[:4],
162
+ catalog,
163
+ client_config=client_config,
164
+ in_memory=in_memory,
165
+ )
166
+ session.__enter__()
167
+
168
+ return session
157
169
 
158
170
  @staticmethod
159
171
  def except_hook(exc_type, exc_value, exc_traceback):
@@ -0,0 +1,147 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+ from typing import Any, Optional
4
+
5
+ try:
6
+ import tomllib
7
+ except ModuleNotFoundError:
8
+ # tomllib is in standard library from python 3.11 so for earlier versions
9
+ # we need tomli
10
+ import tomli as tomllib # type: ignore[no-redef]
11
+
12
+
13
class ScriptConfigParsingError(Exception):
    """Error raised when inline script metadata cannot be parsed."""

    def __init__(self, message: str) -> None:
        # Single-message constructor; everything else is plain Exception.
        super().__init__(message)
16
+
17
+
18
+ @dataclass
19
+ class ScriptConfig:
20
+ """
21
+ Class that is parsing inline script metadata to get some basic information for
22
+ running datachain script like python version, dependencies, attachments etc.
23
+ Inline script metadata must follow the format described in https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata.
24
+ Example of script with inline metadata:
25
+ # /// script
26
+ # requires-python = ">=3.12"
27
+ #
28
+ # dependencies = [
29
+ # "pandas < 2.1.0",
30
+ # "numpy == 1.26.4"
31
+ # ]
32
+ #
33
+ # [tools.datachain.workers]
34
+ # num_workers = 3
35
+ #
36
+ # [tools.datachain.attachments]
37
+ # image1 = "s3://ldb-public/image1.jpg"
38
+ # file1 = "s3://ldb-public/file.pdf"
39
+ #
40
+ # [tools.datachain.params]
41
+ # min_length_sec = 1
42
+ # cache = false
43
+ #
44
+ # [tools.datachain.inputs]
45
+ # threshold = 0.5
46
+ # start_ds_name = "ds://start"
47
+ #
48
+ # [tools.datachain.outputs]
49
+ # result_dataset = "ds://res"
50
+ # result_dir = "/temp"
51
+ #
52
+ # ///
53
+
54
+ import sys
55
+ import pandas as pd
56
+
57
+ print(f"Python version: {sys.version_info}")
58
+ print(f"Pandas version: {pd.__version__}")
59
+
60
+ """
61
+
62
+ python_version: Optional[str]
63
+ dependencies: list[str]
64
+ attachments: dict[str, str]
65
+ params: dict[str, Any]
66
+ inputs: dict[str, Any]
67
+ outputs: dict[str, Any]
68
+ num_workers: Optional[int] = None
69
+
70
+ def __init__(
71
+ self,
72
+ python_version: Optional[str] = None,
73
+ dependencies: Optional[list[str]] = None,
74
+ attachments: Optional[dict[str, str]] = None,
75
+ params: Optional[dict[str, Any]] = None,
76
+ inputs: Optional[dict[str, Any]] = None,
77
+ outputs: Optional[dict[str, Any]] = None,
78
+ num_workers: Optional[int] = None,
79
+ ):
80
+ self.python_version = python_version
81
+ self.dependencies = dependencies or []
82
+ self.attachments = attachments or {}
83
+ self.params = params or {}
84
+ self.inputs = inputs or {}
85
+ self.outputs = outputs or {}
86
+ self.num_workers = num_workers
87
+
88
+ def get_param(self, name: str, default: Any) -> Any:
89
+ return self.params.get(name, default)
90
+
91
+ def get_input(self, name: str, default: Any) -> Any:
92
+ return self.inputs.get(name, default)
93
+
94
+ def get_output(self, name: str, default: Any) -> Any:
95
+ return self.outputs.get(name, default)
96
+
97
+ def get_attachment(self, name: str, default: Any) -> Any:
98
+ return self.attachments.get(name, default)
99
+
100
+ @staticmethod
101
+ def read(script: str) -> Optional[dict]:
102
+ """Converts inline script metadata to dict with all found data"""
103
+ regex = (
104
+ r"(?m)^# \/\/\/ (?P<type>[a-zA-Z0-9-]+)[ \t]*$[\r\n|\r|\n]"
105
+ "(?P<content>(?:^#(?:| .*)$[\r\n|\r|\n])+)^# \\/\\/\\/[ \t]*$"
106
+ )
107
+ name = "script"
108
+ matches = list(
109
+ filter(lambda m: m.group("type") == name, re.finditer(regex, script))
110
+ )
111
+ if len(matches) > 1:
112
+ raise ValueError(f"Multiple {name} blocks found")
113
+ if len(matches) == 1:
114
+ content = "".join(
115
+ line[2:] if line.startswith("# ") else line[1:]
116
+ for line in matches[0].group("content").splitlines(keepends=True)
117
+ )
118
+ return tomllib.loads(content)
119
+ return None
120
+
121
+ @staticmethod
122
+ def parse(script: str) -> Optional["ScriptConfig"]:
123
+ """
124
+ Method that is parsing inline script metadata from datachain script and
125
+ instantiating ScriptConfig class with found data. If no inline metadata is
126
+ found, it returns None
127
+ """
128
+ try:
129
+ meta = ScriptConfig.read(script)
130
+ if not meta:
131
+ return None
132
+ custom = meta.get("tools", {}).get("datachain", {})
133
+ return ScriptConfig(
134
+ python_version=meta.get("requires-python"),
135
+ dependencies=meta.get("dependencies"),
136
+ num_workers=custom.get("workers", {}).get("num_workers"),
137
+ attachments=custom.get("attachments"),
138
+ params={k: str(v) for k, v in custom.get("params").items()}
139
+ if custom.get("params")
140
+ else None,
141
+ inputs=custom.get("inputs"),
142
+ outputs=custom.get("outputs"),
143
+ )
144
+ except Exception as e:
145
+ raise ScriptConfigParsingError(
146
+ f"Error when parsing script meta: {e}"
147
+ ) from e
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.9.1
3
+ Version: 0.11.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -49,6 +49,7 @@ Requires-Dist: platformdirs
49
49
  Requires-Dist: dvc-studio-client<1,>=0.21
50
50
  Requires-Dist: tabulate
51
51
  Requires-Dist: websockets
52
+ Requires-Dist: tomli; python_version < "3.11"
52
53
  Provides-Extra: docs
53
54
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
54
55
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -102,7 +103,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
102
103
  Requires-Dist: defusedxml; extra == "examples"
103
104
  Requires-Dist: accelerate; extra == "examples"
104
105
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
105
- Requires-Dist: ultralytics==8.3.74; extra == "examples"
106
+ Requires-Dist: ultralytics==8.3.78; extra == "examples"
106
107
  Requires-Dist: open_clip_torch; extra == "examples"
107
108
 
108
109
  ================
@@ -175,7 +176,7 @@ high confidence scores.
175
176
 
176
177
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
177
178
  & (Column("meta.inference.class_") == "cat"))
178
- likely_cats.export_files("high-confidence-cats/", signal="file")
179
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
179
180
 
180
181
 
181
182
  Example: LLM based text-file evaluation
@@ -216,7 +217,7 @@ Python code:
216
217
  )
217
218
 
218
219
  successful_chain = chain.filter(Column("is_success") == True)
219
- successful_chain.export_files("./output_mistral")
220
+ successful_chain.to_storage("./output_mistral")
220
221
 
221
222
  print(f"{successful_chain.count()} files were exported")
222
223
 
@@ -77,6 +77,7 @@ src/datachain/nodes_fetcher.py
77
77
  src/datachain/nodes_thread_pool.py
78
78
  src/datachain/progress.py
79
79
  src/datachain/py.typed
80
+ src/datachain/script_meta.py
80
81
  src/datachain/studio.py
81
82
  src/datachain/telemetry.py
82
83
  src/datachain/utils.py
@@ -232,6 +233,7 @@ tests/func/__init__.py
232
233
  tests/func/fake-service-account-credentials.json
233
234
  tests/func/test_catalog.py
234
235
  tests/func/test_client.py
236
+ tests/func/test_cloud_transfer.py
235
237
  tests/func/test_data_storage.py
236
238
  tests/func/test_datachain.py
237
239
  tests/func/test_datachain_merge.py
@@ -278,6 +280,7 @@ tests/unit/test_pytorch.py
278
280
  tests/unit/test_query.py
279
281
  tests/unit/test_query_metrics.py
280
282
  tests/unit/test_query_params.py
283
+ tests/unit/test_script_meta.py
281
284
  tests/unit/test_serializer.py
282
285
  tests/unit/test_session.py
283
286
  tests/unit/test_utils.py
@@ -32,6 +32,9 @@ dvc-studio-client<1,>=0.21
32
32
  tabulate
33
33
  websockets
34
34
 
35
+ [:python_version < "3.11"]
36
+ tomli
37
+
35
38
  [dev]
36
39
  datachain[docs,tests]
37
40
  mypy==1.15.0
@@ -55,7 +58,7 @@ datachain[tests]
55
58
  defusedxml
56
59
  accelerate
57
60
  huggingface_hub[hf_transfer]
58
- ultralytics==8.3.74
61
+ ultralytics==8.3.78
59
62
  open_clip_torch
60
63
 
61
64
  [hf]
@@ -472,9 +472,9 @@ def cloud_server_credentials(cloud_server, monkeypatch):
472
472
 
473
473
  def get_cloud_test_catalog(cloud_server, tmp_path, metastore, warehouse):
474
474
  cache_dir = tmp_path / ".datachain" / "cache"
475
- cache_dir.mkdir(parents=True)
475
+ cache_dir.mkdir(parents=True, exist_ok=True)
476
476
  tmpfile_dir = tmp_path / ".datachain" / "tmp"
477
- tmpfile_dir.mkdir()
477
+ tmpfile_dir.mkdir(exist_ok=True)
478
478
 
479
479
  catalog = Catalog(
480
480
  metastore=metastore,
@@ -0,0 +1,68 @@
1
+ import pytest
2
+
3
+ from datachain import Session
4
+ from datachain.lib.dc import DataChain
5
+ from tests.conftest import get_cloud_test_catalog, make_cloud_server
6
+
7
+
8
def test_cross_cloud_transfer(
    request,
    tmp_upath_factory,
    tree,
    tmp_path,
    metastore,
    warehouse,
):
    """Copy a file from an Azure test server to a GCS test server via `to_storage`.

    The test is skipped when "azure", "gs" or "all" appears in the
    `--disable-remotes` option.
    """
    disabled_remotes = request.config.getoption("--disable-remotes") or []

    if any(remote in disabled_remotes for remote in ["azure", "gs", "all"]):
        pytest.skip("Skipping all tests for azure, gs or all remotes")

    azure_path = tmp_upath_factory.mktemp("azure", version_aware=False)
    azure_server = make_cloud_server(azure_path, "azure", tree)

    gcloud_path = tmp_upath_factory.mktemp("gs", version_aware=False)
    gcloud_server = make_cloud_server(gcloud_path, "gs", tree)

    # Initialize cloud catalogs (both are built from the same tmp_path)
    azure_catalog = get_cloud_test_catalog(azure_server, tmp_path, metastore, warehouse)
    gcloud_catalog = get_cloud_test_catalog(
        gcloud_server, tmp_path, metastore, warehouse
    )

    # Define test file paths
    test_filename = "image_1.jpg"
    test_content = b"bytes"

    source_dir = f"{azure_catalog.src_uri}/source-test-images"
    source_file = f"{source_dir}/{test_filename}"

    dest_dir = f"{gcloud_catalog.src_uri}/destination-test-images"
    dest_file = f"{dest_dir}/{test_filename}"

    # Get cloud clients
    azure_client = azure_catalog.catalog.get_client(source_file)
    gcloud_client = gcloud_catalog.catalog.get_client(dest_file)

    try:
        # Create test file in Azure
        with azure_client.fs.open(source_file, "wb") as f:
            f.write(test_content)

        # Perform cross-cloud transfer; the session needs the client
        # configuration of both servers.
        combined_config = azure_server.client_config | gcloud_server.client_config
        with Session("testSession", client_config=combined_config):
            datachain = DataChain.from_storage(source_dir)
            datachain.to_storage(dest_dir, placement="filename")

        # Verify transfer
        with gcloud_client.fs.open(dest_file, "rb") as f:
            assert f.read() == test_content

    finally:
        # Cleanup: handle each location independently so that a missing
        # source directory does not prevent removal of the destination.
        for client, path in ((azure_client, source_dir), (gcloud_client, dest_dir)):
            try:
                client.fs.rm(path, recursive=True)
            except FileNotFoundError:
                pass