datachain 0.8.4__tar.gz → 0.8.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (305)
  1. {datachain-0.8.4 → datachain-0.8.6}/.gitignore +3 -0
  2. {datachain-0.8.4 → datachain-0.8.6}/PKG-INFO +6 -6
  3. {datachain-0.8.4 → datachain-0.8.6}/README.rst +2 -2
  4. {datachain-0.8.4 → datachain-0.8.6}/docs/index.md +1 -1
  5. {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/ultralytics-bbox.py +5 -0
  6. {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/ultralytics-pose.py +5 -0
  7. {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/ultralytics-segment.py +5 -0
  8. {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/torch-loader.py +2 -2
  9. {datachain-0.8.4 → datachain-0.8.6}/pyproject.toml +2 -2
  10. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/asyn.py +16 -6
  11. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cache.py +32 -10
  12. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/catalog/catalog.py +17 -1
  13. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/azure.py +6 -2
  14. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/fsspec.py +1 -1
  15. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/gcs.py +6 -2
  16. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/s3.py +22 -4
  17. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/db_engine.py +9 -0
  18. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/schema.py +4 -10
  19. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/sqlite.py +7 -1
  20. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/warehouse.py +6 -4
  21. datachain-0.8.4/src/datachain/lib/diff.py → datachain-0.8.6/src/datachain/diff/__init__.py +116 -12
  22. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/__init__.py +2 -1
  23. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/conditional.py +31 -9
  24. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/arrow.py +3 -1
  25. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/dc.py +5 -3
  26. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/file.py +15 -4
  27. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/hf.py +1 -1
  28. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/pytorch.py +57 -13
  29. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/udf.py +82 -40
  30. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/listing.py +1 -0
  31. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/progress.py +18 -1
  32. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/dataset.py +122 -93
  33. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/dispatch.py +22 -16
  34. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/utils.py +13 -2
  35. {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/PKG-INFO +6 -6
  36. {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/SOURCES.txt +3 -1
  37. {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/requires.txt +2 -2
  38. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_datachain.py +83 -1
  39. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_pytorch.py +41 -0
  40. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_datachain.py +15 -0
  41. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_diff.py +49 -43
  42. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_conditional.py +21 -4
  43. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_asyn.py +33 -0
  44. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_cache.py +27 -1
  45. datachain-0.8.6/tests/unit/test_diff.py +70 -0
  46. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_func.py +18 -0
  47. datachain-0.8.6/tests/unit/test_pytorch.py +58 -0
  48. {datachain-0.8.4 → datachain-0.8.6}/.cruft.json +0 -0
  49. {datachain-0.8.4 → datachain-0.8.6}/.gitattributes +0 -0
  50. {datachain-0.8.4 → datachain-0.8.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  51. {datachain-0.8.4 → datachain-0.8.6}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  52. {datachain-0.8.4 → datachain-0.8.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  53. {datachain-0.8.4 → datachain-0.8.6}/.github/codecov.yaml +0 -0
  54. {datachain-0.8.4 → datachain-0.8.6}/.github/dependabot.yml +0 -0
  55. {datachain-0.8.4 → datachain-0.8.6}/.github/workflows/benchmarks.yml +0 -0
  56. {datachain-0.8.4 → datachain-0.8.6}/.github/workflows/release.yml +0 -0
  57. {datachain-0.8.4 → datachain-0.8.6}/.github/workflows/tests-studio.yml +0 -0
  58. {datachain-0.8.4 → datachain-0.8.6}/.github/workflows/tests.yml +0 -0
  59. {datachain-0.8.4 → datachain-0.8.6}/.github/workflows/update-template.yaml +0 -0
  60. {datachain-0.8.4 → datachain-0.8.6}/.pre-commit-config.yaml +0 -0
  61. {datachain-0.8.4 → datachain-0.8.6}/CODE_OF_CONDUCT.rst +0 -0
  62. {datachain-0.8.4 → datachain-0.8.6}/LICENSE +0 -0
  63. {datachain-0.8.4 → datachain-0.8.6}/docs/assets/captioned_cartoons.png +0 -0
  64. {datachain-0.8.4 → datachain-0.8.6}/docs/assets/datachain-white.svg +0 -0
  65. {datachain-0.8.4 → datachain-0.8.6}/docs/assets/datachain.svg +0 -0
  66. {datachain-0.8.4 → datachain-0.8.6}/docs/contributing.md +0 -0
  67. {datachain-0.8.4 → datachain-0.8.6}/docs/css/github-permalink-style.css +0 -0
  68. {datachain-0.8.4 → datachain-0.8.6}/docs/examples.md +0 -0
  69. {datachain-0.8.4 → datachain-0.8.6}/docs/overrides/main.html +0 -0
  70. {datachain-0.8.4 → datachain-0.8.6}/docs/quick-start.md +0 -0
  71. {datachain-0.8.4 → datachain-0.8.6}/docs/references/datachain.md +0 -0
  72. {datachain-0.8.4 → datachain-0.8.6}/docs/references/datatype.md +0 -0
  73. {datachain-0.8.4 → datachain-0.8.6}/docs/references/file.md +0 -0
  74. {datachain-0.8.4 → datachain-0.8.6}/docs/references/index.md +0 -0
  75. {datachain-0.8.4 → datachain-0.8.6}/docs/references/sql.md +0 -0
  76. {datachain-0.8.4 → datachain-0.8.6}/docs/references/torch.md +0 -0
  77. {datachain-0.8.4 → datachain-0.8.6}/docs/references/udf.md +0 -0
  78. {datachain-0.8.4 → datachain-0.8.6}/docs/tutorials.md +0 -0
  79. {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  80. {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  81. {datachain-0.8.4 → datachain-0.8.6}/examples/computer_vision/openimage-detect.py +0 -0
  82. {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/common_sql_functions.py +0 -0
  83. {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/json-csv-reader.py +0 -0
  84. {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/udfs/parallel.py +0 -0
  85. {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/udfs/simple.py +0 -0
  86. {datachain-0.8.4 → datachain-0.8.6}/examples/get_started/udfs/stateful.py +0 -0
  87. {datachain-0.8.4 → datachain-0.8.6}/examples/llm_and_nlp/claude-query.py +0 -0
  88. {datachain-0.8.4 → datachain-0.8.6}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  89. {datachain-0.8.4 → datachain-0.8.6}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  90. {datachain-0.8.4 → datachain-0.8.6}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  91. {datachain-0.8.4 → datachain-0.8.6}/examples/multimodal/clip_inference.py +0 -0
  92. {datachain-0.8.4 → datachain-0.8.6}/examples/multimodal/hf_pipeline.py +0 -0
  93. {datachain-0.8.4 → datachain-0.8.6}/examples/multimodal/openai_image_desc_lib.py +0 -0
  94. {datachain-0.8.4 → datachain-0.8.6}/examples/multimodal/wds.py +0 -0
  95. {datachain-0.8.4 → datachain-0.8.6}/examples/multimodal/wds_filtered.py +0 -0
  96. {datachain-0.8.4 → datachain-0.8.6}/mkdocs.yml +0 -0
  97. {datachain-0.8.4 → datachain-0.8.6}/noxfile.py +0 -0
  98. {datachain-0.8.4 → datachain-0.8.6}/setup.cfg +0 -0
  99. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/__init__.py +0 -0
  100. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/__main__.py +0 -0
  101. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/catalog/__init__.py +0 -0
  102. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/catalog/datasource.py +0 -0
  103. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/catalog/loader.py +0 -0
  104. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/__init__.py +0 -0
  105. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/__init__.py +0 -0
  106. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/datasets.py +0 -0
  107. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/du.py +0 -0
  108. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/index.py +0 -0
  109. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/ls.py +0 -0
  110. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/misc.py +0 -0
  111. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/query.py +0 -0
  112. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/commands/show.py +0 -0
  113. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/parser/__init__.py +0 -0
  114. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/parser/job.py +0 -0
  115. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/parser/studio.py +0 -0
  116. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/parser/utils.py +0 -0
  117. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/cli/utils.py +0 -0
  118. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/__init__.py +0 -0
  119. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/fileslice.py +0 -0
  120. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/hf.py +0 -0
  121. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/client/local.py +0 -0
  122. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/config.py +0 -0
  123. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/__init__.py +0 -0
  124. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/job.py +0 -0
  125. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/metastore.py +0 -0
  126. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/data_storage/serializer.py +0 -0
  127. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/dataset.py +0 -0
  128. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/error.py +0 -0
  129. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/aggregate.py +0 -0
  130. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/array.py +0 -0
  131. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/base.py +0 -0
  132. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/func.py +0 -0
  133. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/numeric.py +0 -0
  134. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/path.py +0 -0
  135. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/random.py +0 -0
  136. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/string.py +0 -0
  137. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/func/window.py +0 -0
  138. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/job.py +0 -0
  139. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/__init__.py +0 -0
  140. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/clip.py +0 -0
  141. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/__init__.py +0 -0
  142. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/flatten.py +0 -0
  143. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/python_to_sql.py +0 -0
  144. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/sql_to_python.py +0 -0
  145. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/unflatten.py +0 -0
  146. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  147. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/data_model.py +0 -0
  148. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/dataset_info.py +0 -0
  149. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/image.py +0 -0
  150. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/listing.py +0 -0
  151. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/listing_info.py +0 -0
  152. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/meta_formats.py +0 -0
  153. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/model_store.py +0 -0
  154. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/settings.py +0 -0
  155. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/signal_schema.py +0 -0
  156. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/tar.py +0 -0
  157. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/text.py +0 -0
  158. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/udf_signature.py +0 -0
  159. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/utils.py +0 -0
  160. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/vfile.py +0 -0
  161. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/webdataset.py +0 -0
  162. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/lib/webdataset_laion.py +0 -0
  163. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/__init__.py +0 -0
  164. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/bbox.py +0 -0
  165. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/pose.py +0 -0
  166. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/segment.py +0 -0
  167. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/ultralytics/__init__.py +0 -0
  168. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/ultralytics/bbox.py +0 -0
  169. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/ultralytics/pose.py +0 -0
  170. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/model/ultralytics/segment.py +0 -0
  171. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/node.py +0 -0
  172. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/nodes_fetcher.py +0 -0
  173. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/nodes_thread_pool.py +0 -0
  174. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/py.typed +0 -0
  175. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/__init__.py +0 -0
  176. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/batch.py +0 -0
  177. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/metrics.py +0 -0
  178. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/params.py +0 -0
  179. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/queue.py +0 -0
  180. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/schema.py +0 -0
  181. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/session.py +0 -0
  182. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/udf.py +0 -0
  183. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/query/utils.py +0 -0
  184. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/remote/__init__.py +0 -0
  185. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/remote/studio.py +0 -0
  186. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/__init__.py +0 -0
  187. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/default/__init__.py +0 -0
  188. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/default/base.py +0 -0
  189. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/__init__.py +0 -0
  190. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/aggregate.py +0 -0
  191. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/array.py +0 -0
  192. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/conditional.py +0 -0
  193. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/numeric.py +0 -0
  194. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/path.py +0 -0
  195. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/random.py +0 -0
  196. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/functions/string.py +0 -0
  197. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/selectable.py +0 -0
  198. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/sqlite/__init__.py +0 -0
  199. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/sqlite/base.py +0 -0
  200. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/sqlite/types.py +0 -0
  201. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/sqlite/vector.py +0 -0
  202. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/types.py +0 -0
  203. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/sql/utils.py +0 -0
  204. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/studio.py +0 -0
  205. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/telemetry.py +0 -0
  206. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/toolkit/__init__.py +0 -0
  207. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/toolkit/split.py +0 -0
  208. {datachain-0.8.4 → datachain-0.8.6}/src/datachain/torch/__init__.py +0 -0
  209. {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/dependency_links.txt +0 -0
  210. {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/entry_points.txt +0 -0
  211. {datachain-0.8.4 → datachain-0.8.6}/src/datachain.egg-info/top_level.txt +0 -0
  212. {datachain-0.8.4 → datachain-0.8.6}/tests/__init__.py +0 -0
  213. {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/__init__.py +0 -0
  214. {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/conftest.py +0 -0
  215. {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  216. {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/datasets/.dvc/config +0 -0
  217. {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/datasets/.gitignore +0 -0
  218. {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  219. {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/test_datachain.py +0 -0
  220. {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/test_ls.py +0 -0
  221. {datachain-0.8.4 → datachain-0.8.6}/tests/benchmarks/test_version.py +0 -0
  222. {datachain-0.8.4 → datachain-0.8.6}/tests/conftest.py +0 -0
  223. {datachain-0.8.4 → datachain-0.8.6}/tests/data.py +0 -0
  224. {datachain-0.8.4 → datachain-0.8.6}/tests/examples/__init__.py +0 -0
  225. {datachain-0.8.4 → datachain-0.8.6}/tests/examples/test_examples.py +0 -0
  226. {datachain-0.8.4 → datachain-0.8.6}/tests/examples/test_wds_e2e.py +0 -0
  227. {datachain-0.8.4 → datachain-0.8.6}/tests/examples/wds_data.py +0 -0
  228. {datachain-0.8.4 → datachain-0.8.6}/tests/func/__init__.py +0 -0
  229. {datachain-0.8.4 → datachain-0.8.6}/tests/func/fake-service-account-credentials.json +0 -0
  230. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_catalog.py +0 -0
  231. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_client.py +0 -0
  232. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_dataset_query.py +0 -0
  233. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_datasets.py +0 -0
  234. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_feature_pickling.py +0 -0
  235. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_listing.py +0 -0
  236. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_ls.py +0 -0
  237. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_meta_formats.py +0 -0
  238. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_metrics.py +0 -0
  239. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_pull.py +0 -0
  240. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_query.py +0 -0
  241. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_session.py +0 -0
  242. {datachain-0.8.4 → datachain-0.8.6}/tests/func/test_toolkit.py +0 -0
  243. {datachain-0.8.4 → datachain-0.8.6}/tests/scripts/feature_class.py +0 -0
  244. {datachain-0.8.4 → datachain-0.8.6}/tests/scripts/feature_class_exception.py +0 -0
  245. {datachain-0.8.4 → datachain-0.8.6}/tests/scripts/feature_class_parallel.py +0 -0
  246. {datachain-0.8.4 → datachain-0.8.6}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  247. {datachain-0.8.4 → datachain-0.8.6}/tests/scripts/name_len_slow.py +0 -0
  248. {datachain-0.8.4 → datachain-0.8.6}/tests/test_atomicity.py +0 -0
  249. {datachain-0.8.4 → datachain-0.8.6}/tests/test_cli_e2e.py +0 -0
  250. {datachain-0.8.4 → datachain-0.8.6}/tests/test_cli_studio.py +0 -0
  251. {datachain-0.8.4 → datachain-0.8.6}/tests/test_query_e2e.py +0 -0
  252. {datachain-0.8.4 → datachain-0.8.6}/tests/test_telemetry.py +0 -0
  253. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/__init__.py +0 -0
  254. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/__init__.py +0 -0
  255. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/conftest.py +0 -0
  256. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_arrow.py +0 -0
  257. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_clip.py +0 -0
  258. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  259. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_datachain_merge.py +0 -0
  260. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_feature.py +0 -0
  261. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_feature_utils.py +0 -0
  262. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_file.py +0 -0
  263. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_hf.py +0 -0
  264. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_image.py +0 -0
  265. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_listing_info.py +0 -0
  266. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_models.py +0 -0
  267. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_schema.py +0 -0
  268. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_signal_schema.py +0 -0
  269. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_sql_to_python.py +0 -0
  270. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_text.py +0 -0
  271. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_udf_signature.py +0 -0
  272. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_utils.py +0 -0
  273. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/lib/test_webdataset.py +0 -0
  274. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/__init__.py +0 -0
  275. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/sqlite/__init__.py +0 -0
  276. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/sqlite/test_types.py +0 -0
  277. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/sqlite/test_utils.py +0 -0
  278. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_array.py +0 -0
  279. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_path.py +0 -0
  280. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_random.py +0 -0
  281. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_selectable.py +0 -0
  282. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/sql/test_string.py +0 -0
  283. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_catalog.py +0 -0
  284. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_catalog_loader.py +0 -0
  285. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_cli_parsing.py +0 -0
  286. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_client.py +0 -0
  287. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_client_gcs.py +0 -0
  288. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_client_s3.py +0 -0
  289. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_config.py +0 -0
  290. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_data_storage.py +0 -0
  291. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_database_engine.py +0 -0
  292. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_dataset.py +0 -0
  293. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_dispatch.py +0 -0
  294. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_fileslice.py +0 -0
  295. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_listing.py +0 -0
  296. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_metastore.py +0 -0
  297. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_module_exports.py +0 -0
  298. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_query.py +0 -0
  299. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_query_metrics.py +0 -0
  300. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_query_params.py +0 -0
  301. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_serializer.py +0 -0
  302. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_session.py +0 -0
  303. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_utils.py +0 -0
  304. {datachain-0.8.4 → datachain-0.8.6}/tests/unit/test_warehouse.py +0 -0
  305. {datachain-0.8.4 → datachain-0.8.6}/tests/utils.py +0 -0
@@ -140,3 +140,6 @@ cython_debug/
140
140
  .vscode/
141
141
  .datachain/
142
142
  .dvcx/
143
+
144
+ # pt files produced by ultralytics examples
145
+ *.pt
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.8.4
3
+ Version: 0.8.6
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -96,10 +96,10 @@ Requires-Dist: defusedxml; extra == "examples"
96
96
  Requires-Dist: accelerate; extra == "examples"
97
97
  Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
98
98
  Requires-Dist: unstructured[pdf]<0.16.12; extra == "examples"
99
- Requires-Dist: pdfplumber==0.11.4; extra == "examples"
99
+ Requires-Dist: pdfplumber==0.11.5; extra == "examples"
100
100
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
101
101
  Requires-Dist: onnx==1.16.1; extra == "examples"
102
- Requires-Dist: ultralytics==8.3.55; extra == "examples"
102
+ Requires-Dist: ultralytics==8.3.58; extra == "examples"
103
103
 
104
104
  ================
105
105
  |logo| DataChain
@@ -134,7 +134,7 @@ Use Cases
134
134
  1. **ETL.** Pythonic framework for describing and running unstructured data transformations
135
135
  and enrichments, applying models to data, including LLMs.
136
136
  2. **Analytics.** DataChain dataset is a table that combines all the information about data
137
- objects in one place + it provides dataframe-like API and vecrorized engine to do analytics
137
+ objects in one place + it provides dataframe-like API and vectorized engine to do analytics
138
138
  on these tables at scale.
139
139
  3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
140
140
  Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
@@ -270,7 +270,7 @@ DataChain Studio Platform
270
270
 
271
271
  `DataChain Studio`_ is a proprietary solution for teams that offers:
272
272
 
273
- - **Centralized dataset registry** to manage data, code and dependency
273
+ - **Centralized dataset registry** to manage data, code and
274
274
  dependencies in one place.
275
275
  - **Data Lineage** for data sources as well as derivative dataset.
276
276
  - **UI for Multimodal Data** like images, videos, and PDFs.
@@ -31,7 +31,7 @@ Use Cases
31
31
  1. **ETL.** Pythonic framework for describing and running unstructured data transformations
32
32
  and enrichments, applying models to data, including LLMs.
33
33
  2. **Analytics.** DataChain dataset is a table that combines all the information about data
34
- objects in one place + it provides dataframe-like API and vecrorized engine to do analytics
34
+ objects in one place + it provides dataframe-like API and vectorized engine to do analytics
35
35
  on these tables at scale.
36
36
  3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
37
37
  Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
@@ -167,7 +167,7 @@ DataChain Studio Platform
167
167
 
168
168
  `DataChain Studio`_ is a proprietary solution for teams that offers:
169
169
 
170
- - **Centralized dataset registry** to manage data, code and dependency
170
+ - **Centralized dataset registry** to manage data, code and
171
171
  dependencies in one place.
172
172
  - **Data Lineage** for data sources as well as derivative dataset.
173
173
  - **UI for Multimodal Data** like images, videos, and PDFs.
@@ -42,7 +42,7 @@ database for easy and efficient querying.
42
42
  including LLMs.
43
43
  2. **Analytics.** DataChain dataset is a table that combines all the
44
44
  information about data objects in one place + it provides
45
- dataframe-like API and vecrorized engine to do analytics on these
45
+ dataframe-like API and vectorized engine to do analytics on these
46
46
  tables at scale.
47
47
  3. **Versioning.** DataChain doesn't store, require moving or copying
48
48
  data (unlike DVC). Perfect use case is a bucket with thousands or
@@ -1,3 +1,8 @@
1
+ import os
2
+
3
+ os.environ["YOLO_VERBOSE"] = "false"
4
+
5
+
1
6
  from io import BytesIO
2
7
 
3
8
  from PIL import Image
@@ -1,3 +1,8 @@
1
+ import os
2
+
3
+ os.environ["YOLO_VERBOSE"] = "false"
4
+
5
+
1
6
  from io import BytesIO
2
7
 
3
8
  from PIL import Image
@@ -1,3 +1,8 @@
1
+ import os
2
+
3
+ os.environ["YOLO_VERBOSE"] = "false"
4
+
5
+
1
6
  from io import BytesIO
2
7
 
3
8
  from PIL import Image
@@ -56,7 +56,7 @@ class CNN(nn.Module):
56
56
  if __name__ == "__main__":
57
57
  ds = (
58
58
  DataChain.from_storage(STORAGE, type="image")
59
- .settings(cache=True, prefetch=25)
59
+ .settings(prefetch=25)
60
60
  .filter(C("file.path").glob("*.jpg"))
61
61
  .map(
62
62
  label=lambda path: label_to_int(basename(path)[:3], CLASSES),
@@ -68,7 +68,7 @@ if __name__ == "__main__":
68
68
  train_loader = DataLoader(
69
69
  ds.to_pytorch(transform=transform),
70
70
  batch_size=25,
71
- num_workers=max(4, os.cpu_count() or 2),
71
+ num_workers=min(4, os.cpu_count() or 2),
72
72
  persistent_workers=True,
73
73
  multiprocessing_context=multiprocessing.get_context("spawn"),
74
74
  )
@@ -109,10 +109,10 @@ examples = [
109
109
  "accelerate",
110
110
  "unstructured_ingest[embed-huggingface]",
111
111
  "unstructured[pdf]<0.16.12",
112
- "pdfplumber==0.11.4",
112
+ "pdfplumber==0.11.5",
113
113
  "huggingface_hub[hf_transfer]",
114
114
  "onnx==1.16.1",
115
- "ultralytics==8.3.55"
115
+ "ultralytics==8.3.58"
116
116
  ]
117
117
 
118
118
  [project.urls]
@@ -8,12 +8,14 @@ from collections.abc import (
8
8
  Iterable,
9
9
  Iterator,
10
10
  )
11
- from concurrent.futures import ThreadPoolExecutor
11
+ from concurrent.futures import ThreadPoolExecutor, wait
12
12
  from heapq import heappop, heappush
13
13
  from typing import Any, Callable, Generic, Optional, TypeVar
14
14
 
15
15
  from fsspec.asyn import get_loop
16
16
 
17
+ from datachain.utils import safe_closing
18
+
17
19
  ASYNC_WORKERS = 20
18
20
 
19
21
  InputT = TypeVar("InputT", contravariant=True) # noqa: PLC0105
@@ -56,6 +58,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
56
58
  self.pool = ThreadPoolExecutor(workers)
57
59
  self._tasks: set[asyncio.Task] = set()
58
60
  self._shutdown_producer = threading.Event()
61
+ self._producer_is_shutdown = threading.Event()
59
62
 
60
63
  def start_task(self, coro: Coroutine) -> asyncio.Task:
61
64
  task = self.loop.create_task(coro)
@@ -64,11 +67,16 @@ class AsyncMapper(Generic[InputT, ResultT]):
64
67
  return task
65
68
 
66
69
  def _produce(self) -> None:
67
- for item in self.iterable:
68
- if self._shutdown_producer.is_set():
69
- return
70
- fut = asyncio.run_coroutine_threadsafe(self.work_queue.put(item), self.loop)
71
- fut.result() # wait until the item is in the queue
70
+ try:
71
+ with safe_closing(self.iterable):
72
+ for item in self.iterable:
73
+ if self._shutdown_producer.is_set():
74
+ return
75
+ coro = self.work_queue.put(item)
76
+ fut = asyncio.run_coroutine_threadsafe(coro, self.loop)
77
+ fut.result() # wait until the item is in the queue
78
+ finally:
79
+ self._producer_is_shutdown.set()
72
80
 
73
81
  async def produce(self) -> None:
74
82
  await self.to_thread(self._produce)
@@ -179,6 +187,8 @@ class AsyncMapper(Generic[InputT, ResultT]):
179
187
  self.shutdown_producer()
180
188
  if not async_run.done():
181
189
  async_run.cancel()
190
+ wait([async_run])
191
+ self._producer_is_shutdown.wait()
182
192
 
183
193
  def __iter__(self):
184
194
  return self.iterate()
@@ -1,8 +1,12 @@
1
1
  import os
2
+ from collections.abc import Iterator
3
+ from contextlib import contextmanager
4
+ from tempfile import mkdtemp
2
5
  from typing import TYPE_CHECKING, Optional
3
6
 
4
7
  from dvc_data.hashfile.db.local import LocalHashFileDB
5
8
  from dvc_objects.fs.local import LocalFileSystem
9
+ from dvc_objects.fs.utils import remove
6
10
  from fsspec.callbacks import Callback, TqdmCallback
7
11
 
8
12
  from .progress import Tqdm
@@ -20,6 +24,23 @@ def try_scandir(path):
20
24
  pass
21
25
 
22
26
 
27
+ def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "DataChainCache":
28
+ cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
29
+ return DataChainCache(cache_dir, tmp_dir=tmp_dir)
30
+
31
+
32
+ @contextmanager
33
+ def temporary_cache(
34
+ tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
35
+ ) -> Iterator["DataChainCache"]:
36
+ cache = get_temp_cache(tmp_dir, prefix=prefix)
37
+ try:
38
+ yield cache
39
+ finally:
40
+ if delete:
41
+ cache.destroy()
42
+
43
+
23
44
  class DataChainCache:
24
45
  def __init__(self, cache_dir: str, tmp_dir: str):
25
46
  self.odb = LocalHashFileDB(
@@ -28,6 +49,9 @@ class DataChainCache:
28
49
  tmp_dir=tmp_dir,
29
50
  )
30
51
 
52
+ def __eq__(self, other) -> bool:
53
+ return self.odb == other.odb
54
+
31
55
  @property
32
56
  def cache_dir(self):
33
57
  return self.odb.path
@@ -63,7 +87,7 @@ class DataChainCache:
63
87
  if size < 0:
64
88
  size = await client.get_size(from_path, version_id=file.version)
65
89
  cb = callback or TqdmCallback(
66
- tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True},
90
+ tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True, "leave": False},
67
91
  tqdm_cls=Tqdm,
68
92
  size=size,
69
93
  )
@@ -82,20 +106,18 @@ class DataChainCache:
82
106
  os.unlink(tmp_info)
83
107
 
84
108
  def store_data(self, file: "File", contents: bytes) -> None:
85
- checksum = file.get_hash()
86
- dst = self.path_from_checksum(checksum)
87
- if not os.path.exists(dst):
88
- # Create the file only if it's not already in cache
89
- os.makedirs(os.path.dirname(dst), exist_ok=True)
90
- with open(dst, mode="wb") as f:
91
- f.write(contents)
92
-
93
- def clear(self):
109
+ self.odb.add_bytes(file.get_hash(), contents)
110
+
111
+ def clear(self) -> None:
94
112
  """
95
113
  Completely clear the cache.
96
114
  """
97
115
  self.odb.clear()
98
116
 
117
+ def destroy(self) -> None:
118
+ # `clear` leaves the prefix directory structure intact.
119
+ remove(self.cache_dir)
120
+
99
121
  def get_total_size(self) -> int:
100
122
  total = 0
101
123
  for subdir in try_scandir(self.odb.path):
@@ -405,6 +405,7 @@ def get_download_bar(bar_format: str, total_size: int):
405
405
  unit_scale=True,
406
406
  unit_divisor=1000,
407
407
  total=total_size,
408
+ leave=False,
408
409
  )
409
410
 
410
411
 
@@ -429,6 +430,7 @@ def instantiate_node_groups(
429
430
  unit_scale=True,
430
431
  unit_divisor=1000,
431
432
  total=total_files,
433
+ leave=False,
432
434
  )
433
435
  )
434
436
 
@@ -534,6 +536,12 @@ def find_column_to_str( # noqa: PLR0911
534
536
  return ""
535
537
 
536
538
 
539
+ def clone_catalog_with_cache(catalog: "Catalog", cache: "DataChainCache") -> "Catalog":
540
+ clone = catalog.copy()
541
+ clone.cache = cache
542
+ return clone
543
+
544
+
537
545
  class Catalog:
538
546
  def __init__(
539
547
  self,
@@ -1242,10 +1250,17 @@ class Catalog:
1242
1250
  path: str,
1243
1251
  version_id: Optional[str] = None,
1244
1252
  client_config=None,
1253
+ content_disposition: Optional[str] = None,
1254
+ **kwargs,
1245
1255
  ) -> str:
1246
1256
  client_config = client_config or self.client_config
1247
1257
  client = Client.get_client(source, self.cache, **client_config)
1248
- return client.url(path, version_id=version_id)
1258
+ return client.url(
1259
+ path,
1260
+ version_id=version_id,
1261
+ content_disposition=content_disposition,
1262
+ **kwargs,
1263
+ )
1249
1264
 
1250
1265
  def export_dataset_table(
1251
1266
  self,
@@ -1437,6 +1452,7 @@ class Catalog:
1437
1452
  unit_scale=True,
1438
1453
  unit_divisor=1000,
1439
1454
  total=ds_stats.num_objects, # type: ignore [union-attr]
1455
+ leave=False,
1440
1456
  )
1441
1457
 
1442
1458
  schema = DatasetRecord.parse_schema(remote_ds_version.schema)
@@ -31,8 +31,12 @@ class AzureClient(Client):
31
31
  Generate a signed URL for the given path.
32
32
  """
33
33
  version_id = kwargs.pop("version_id", None)
34
+ content_disposition = kwargs.pop("content_disposition", None)
34
35
  result = self.fs.sign(
35
- self.get_full_path(path, version_id), expiration=expires, **kwargs
36
+ self.get_full_path(path, version_id),
37
+ expiration=expires,
38
+ content_disposition=content_disposition,
39
+ **kwargs,
36
40
  )
37
41
  return result + (f"&versionid={version_id}" if version_id else "")
38
42
 
@@ -42,7 +46,7 @@ class AzureClient(Client):
42
46
  prefix = prefix.lstrip(DELIMITER) + DELIMITER
43
47
  found = False
44
48
  try:
45
- with tqdm(desc=f"Listing {self.uri}", unit=" objects") as pbar:
49
+ with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
46
50
  async with self.fs.service_client.get_container_client(
47
51
  container=self.name
48
52
  ) as container_client:
@@ -249,7 +249,7 @@ class Client(ABC):
249
249
  await main_task
250
250
 
251
251
  async def _fetch_nested(self, start_prefix: str, result_queue: ResultQueue) -> None:
252
- progress_bar = tqdm(desc=f"Listing {self.uri}", unit=" objects")
252
+ progress_bar = tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False)
253
253
  loop = get_loop()
254
254
 
255
255
  queue: asyncio.Queue[str] = asyncio.Queue()
@@ -39,11 +39,15 @@ class GCSClient(Client):
39
39
  (see https://cloud.google.com/storage/docs/access-public-data#api-link).
40
40
  """
41
41
  version_id = kwargs.pop("version_id", None)
42
+ content_disposition = kwargs.pop("content_disposition", None)
42
43
  if self.fs.storage_options.get("token") == "anon":
43
44
  query = f"?generation={version_id}" if version_id else ""
44
45
  return f"https://storage.googleapis.com/{self.name}/{path}{query}"
45
46
  return self.fs.sign(
46
- self.get_full_path(path, version_id), expiration=expires, **kwargs
47
+ self.get_full_path(path, version_id),
48
+ expiration=expires,
49
+ response_disposition=content_disposition,
50
+ **kwargs,
47
51
  )
48
52
 
49
53
  @staticmethod
@@ -83,7 +87,7 @@ class GCSClient(Client):
83
87
  self, page_queue: PageQueue, result_queue: ResultQueue
84
88
  ) -> bool:
85
89
  found = False
86
- with tqdm(desc=f"Listing {self.uri}", unit=" objects") as pbar:
90
+ with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
87
91
  while (page := await page_queue.get()) is not None:
88
92
  if page:
89
93
  found = True
@@ -1,4 +1,5 @@
1
1
  import asyncio
2
+ import os
2
3
  from typing import Any, Optional, cast
3
4
  from urllib.parse import parse_qs, urlsplit, urlunsplit
4
5
 
@@ -31,9 +32,11 @@ class ClientS3(Client):
31
32
  if "aws_token" in kwargs:
32
33
  kwargs.setdefault("token", kwargs.pop("aws_token"))
33
34
 
34
- # caching bucket regions to use the right one in signed urls, otherwise
35
- # it tries to randomly guess and creates wrong signature
36
- kwargs.setdefault("cache_regions", True)
35
+ # remove this `if` when https://github.com/fsspec/s3fs/pull/929 lands
36
+ if not os.environ.get("AWS_REGION") and not os.environ.get("AWS_ENDPOINT_URL"):
37
+ # caching bucket regions to use the right one in signed urls, otherwise
38
+ # it tries to randomly guess and creates wrong signature
39
+ kwargs.setdefault("cache_regions", True)
37
40
 
38
41
  # We want to use newer v4 signature version since regions added after
39
42
  # 2014 are not going to support v2 which is the older one.
@@ -51,6 +54,21 @@ class ClientS3(Client):
51
54
 
52
55
  return cast(S3FileSystem, super().create_fs(**kwargs))
53
56
 
57
+ def url(self, path: str, expires: int = 3600, **kwargs) -> str:
58
+ """
59
+ Generate a signed URL for the given path.
60
+ """
61
+ version_id = kwargs.pop("version_id", None)
62
+ content_disposition = kwargs.pop("content_disposition", None)
63
+ if content_disposition:
64
+ kwargs["ResponseContentDisposition"] = content_disposition
65
+
66
+ return self.fs.sign(
67
+ self.get_full_path(path, version_id),
68
+ expiration=expires,
69
+ **kwargs,
70
+ )
71
+
54
72
  async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
55
73
  async def get_pages(it, page_queue):
56
74
  try:
@@ -61,7 +79,7 @@ class ClientS3(Client):
61
79
 
62
80
  async def process_pages(page_queue, result_queue):
63
81
  found = False
64
- with tqdm(desc=f"Listing {self.uri}", unit=" objects") as pbar:
82
+ with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
65
83
  while (res := await page_queue.get()) is not None:
66
84
  if res:
67
85
  found = True
@@ -79,6 +79,15 @@ class DatabaseEngine(ABC, Serializable):
79
79
  conn: Optional[Any] = None,
80
80
  ) -> Iterator[tuple[Any, ...]]: ...
81
81
 
82
+ def get_table(self, name: str) -> "Table":
83
+ table = self.metadata.tables.get(name)
84
+ if table is None:
85
+ sa.Table(name, self.metadata, autoload_with=self.engine)
86
+ # ^^^ This table may not be correctly initialised on some dialects
87
+ # Grab it from metadata instead.
88
+ table = self.metadata.tables[name]
89
+ return table
90
+
82
91
  @abstractmethod
83
92
  def executemany(
84
93
  self, query, params, cursor: Optional[Any] = None
@@ -16,7 +16,6 @@ from datachain.sql.functions import path as pathfunc
16
16
  from datachain.sql.types import Int, SQLType, UInt64
17
17
 
18
18
  if TYPE_CHECKING:
19
- from sqlalchemy import Engine
20
19
  from sqlalchemy.engine.interfaces import Dialect
21
20
  from sqlalchemy.sql.base import (
22
21
  ColumnCollection,
@@ -25,6 +24,8 @@ if TYPE_CHECKING:
25
24
  )
26
25
  from sqlalchemy.sql.elements import ColumnElement
27
26
 
27
+ from datachain.data_storage.db_engine import DatabaseEngine
28
+
28
29
 
29
30
  DEFAULT_DELIMITER = "__"
30
31
 
@@ -150,14 +151,12 @@ class DataTable:
150
151
  def __init__(
151
152
  self,
152
153
  name: str,
153
- engine: "Engine",
154
- metadata: Optional["sa.MetaData"] = None,
154
+ engine: "DatabaseEngine",
155
155
  column_types: Optional[dict[str, SQLType]] = None,
156
156
  object_name: str = "file",
157
157
  ):
158
158
  self.name: str = name
159
159
  self.engine = engine
160
- self.metadata: sa.MetaData = metadata if metadata is not None else sa.MetaData()
161
160
  self.column_types: dict[str, SQLType] = column_types or {}
162
161
  self.object_name = object_name
163
162
 
@@ -211,12 +210,7 @@ class DataTable:
211
210
  return sa.Table(name, metadata, *columns)
212
211
 
213
212
  def get_table(self) -> "sa.Table":
214
- table = self.metadata.tables.get(self.name)
215
- if table is None:
216
- sa.Table(self.name, self.metadata, autoload_with=self.engine)
217
- # ^^^ This table may not be correctly initialised on some dialects
218
- # Grab it from metadata instead.
219
- table = self.metadata.tables[self.name]
213
+ table = self.engine.get_table(self.name)
220
214
 
221
215
  column_types = self.column_types | {c.name: c.type for c in self.sys_columns()}
222
216
  # adjusting types for custom columns to be instances of SQLType if possible
@@ -186,6 +186,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
186
186
  self.db_file = db_file
187
187
  self.is_closed = False
188
188
 
189
+ def get_table(self, name: str) -> Table:
190
+ if self.is_closed:
191
+ # Reconnect in case of being closed previously.
192
+ self._reconnect()
193
+ return super().get_table(name)
194
+
189
195
  @retry_sqlite_locks
190
196
  def execute(
191
197
  self,
@@ -670,7 +676,7 @@ class SQLiteWarehouse(AbstractWarehouse):
670
676
  ]
671
677
  table = self.create_udf_table(columns)
672
678
 
673
- with tqdm(desc="Preparing", unit=" rows") as pbar:
679
+ with tqdm(desc="Preparing", unit=" rows", leave=False) as pbar:
674
680
  self.copy_table(table, query, progress_cb=pbar.update)
675
681
 
676
682
  return table
@@ -191,8 +191,7 @@ class AbstractWarehouse(ABC, Serializable):
191
191
  table_name = self.dataset_table_name(dataset.name, version)
192
192
  return self.schema.dataset_row_cls(
193
193
  table_name,
194
- self.db.engine,
195
- self.db.metadata,
194
+ self.db,
196
195
  dataset.get_schema(version),
197
196
  object_name=object_name,
198
197
  )
@@ -904,8 +903,11 @@ class AbstractWarehouse(ABC, Serializable):
904
903
  This should be implemented to ensure that the provided tables
905
904
  are cleaned up as soon as they are no longer needed.
906
905
  """
907
- with tqdm(desc="Cleanup", unit=" tables") as pbar:
908
- for name in set(names):
906
+ to_drop = set(names)
907
+ with tqdm(
908
+ desc="Cleanup", unit=" tables", total=len(to_drop), leave=False
909
+ ) as pbar:
910
+ for name in to_drop:
909
911
  self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
910
912
  pbar.update(1)
911
913