datachain 0.3.9__tar.gz → 0.3.10__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (245)
  1. {datachain-0.3.9 → datachain-0.3.10}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.3.9/src/datachain.egg-info → datachain-0.3.10}/PKG-INFO +3 -2
  3. {datachain-0.3.9 → datachain-0.3.10}/examples/get_started/udfs/parallel.py +1 -1
  4. {datachain-0.3.9 → datachain-0.3.10}/examples/get_started/udfs/simple.py +1 -1
  5. {datachain-0.3.9 → datachain-0.3.10}/pyproject.toml +3 -2
  6. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/catalog/catalog.py +11 -80
  7. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/cli.py +6 -38
  8. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/client/fsspec.py +3 -0
  9. datachain-0.3.10/src/datachain/client/hf.py +47 -0
  10. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/data_storage/metastore.py +2 -29
  11. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/data_storage/sqlite.py +3 -12
  12. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/data_storage/warehouse.py +20 -29
  13. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/dataset.py +44 -32
  14. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/arrow.py +21 -5
  15. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/dataset_info.py +4 -0
  16. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/dc.py +108 -25
  17. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/file.py +10 -33
  18. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/hf.py +2 -1
  19. datachain-0.3.10/src/datachain/lib/listing.py +119 -0
  20. datachain-0.3.10/src/datachain/lib/listing_info.py +32 -0
  21. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/meta_formats.py +4 -4
  22. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/signal_schema.py +5 -2
  23. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/node.py +13 -0
  24. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/query/dataset.py +11 -81
  25. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/query/metrics.py +8 -0
  26. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/utils.py +5 -0
  27. {datachain-0.3.9 → datachain-0.3.10/src/datachain.egg-info}/PKG-INFO +3 -2
  28. {datachain-0.3.9 → datachain-0.3.10}/src/datachain.egg-info/SOURCES.txt +3 -0
  29. {datachain-0.3.9 → datachain-0.3.10}/src/datachain.egg-info/requires.txt +2 -1
  30. {datachain-0.3.9 → datachain-0.3.10}/tests/benchmarks/test_datachain.py +4 -6
  31. {datachain-0.3.9 → datachain-0.3.10}/tests/conftest.py +4 -0
  32. {datachain-0.3.9 → datachain-0.3.10}/tests/func/test_catalog.py +19 -24
  33. {datachain-0.3.9 → datachain-0.3.10}/tests/func/test_datachain.py +147 -11
  34. {datachain-0.3.9 → datachain-0.3.10}/tests/func/test_dataset_query.py +20 -4
  35. {datachain-0.3.9 → datachain-0.3.10}/tests/func/test_datasets.py +18 -13
  36. {datachain-0.3.9 → datachain-0.3.10}/tests/func/test_feature_pickling.py +21 -16
  37. {datachain-0.3.9 → datachain-0.3.10}/tests/func/test_ls.py +7 -4
  38. datachain-0.3.10/tests/func/test_metrics.py +14 -0
  39. {datachain-0.3.9 → datachain-0.3.10}/tests/func/test_query.py +15 -23
  40. {datachain-0.3.9 → datachain-0.3.10}/tests/scripts/feature_class.py +2 -2
  41. {datachain-0.3.9 → datachain-0.3.10}/tests/scripts/feature_class_parallel.py +1 -1
  42. {datachain-0.3.9 → datachain-0.3.10}/tests/scripts/feature_class_parallel_data_model.py +1 -1
  43. {datachain-0.3.9 → datachain-0.3.10}/tests/test_query_e2e.py +5 -4
  44. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_arrow.py +38 -1
  45. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_datachain.py +60 -4
  46. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_signal_schema.py +20 -3
  47. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_dataset.py +28 -0
  48. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_listing.py +86 -0
  49. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_storage.py +0 -34
  50. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_utils.py +17 -0
  51. datachain-0.3.9/src/datachain/lib/listing.py +0 -111
  52. {datachain-0.3.9 → datachain-0.3.10}/.cruft.json +0 -0
  53. {datachain-0.3.9 → datachain-0.3.10}/.gitattributes +0 -0
  54. {datachain-0.3.9 → datachain-0.3.10}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  55. {datachain-0.3.9 → datachain-0.3.10}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  56. {datachain-0.3.9 → datachain-0.3.10}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  57. {datachain-0.3.9 → datachain-0.3.10}/.github/codecov.yaml +0 -0
  58. {datachain-0.3.9 → datachain-0.3.10}/.github/dependabot.yml +0 -0
  59. {datachain-0.3.9 → datachain-0.3.10}/.github/workflows/benchmarks.yml +0 -0
  60. {datachain-0.3.9 → datachain-0.3.10}/.github/workflows/release.yml +0 -0
  61. {datachain-0.3.9 → datachain-0.3.10}/.github/workflows/tests-studio.yml +0 -0
  62. {datachain-0.3.9 → datachain-0.3.10}/.github/workflows/tests.yml +0 -0
  63. {datachain-0.3.9 → datachain-0.3.10}/.github/workflows/update-template.yaml +0 -0
  64. {datachain-0.3.9 → datachain-0.3.10}/.gitignore +0 -0
  65. {datachain-0.3.9 → datachain-0.3.10}/CODE_OF_CONDUCT.rst +0 -0
  66. {datachain-0.3.9 → datachain-0.3.10}/CONTRIBUTING.rst +0 -0
  67. {datachain-0.3.9 → datachain-0.3.10}/LICENSE +0 -0
  68. {datachain-0.3.9 → datachain-0.3.10}/README.rst +0 -0
  69. {datachain-0.3.9 → datachain-0.3.10}/docs/assets/captioned_cartoons.png +0 -0
  70. {datachain-0.3.9 → datachain-0.3.10}/docs/assets/datachain.png +0 -0
  71. {datachain-0.3.9 → datachain-0.3.10}/docs/assets/flowchart.png +0 -0
  72. {datachain-0.3.9 → datachain-0.3.10}/docs/index.md +0 -0
  73. {datachain-0.3.9 → datachain-0.3.10}/docs/references/datachain.md +0 -0
  74. {datachain-0.3.9 → datachain-0.3.10}/docs/references/datatype.md +0 -0
  75. {datachain-0.3.9 → datachain-0.3.10}/docs/references/file.md +0 -0
  76. {datachain-0.3.9 → datachain-0.3.10}/docs/references/index.md +0 -0
  77. {datachain-0.3.9 → datachain-0.3.10}/docs/references/sql.md +0 -0
  78. {datachain-0.3.9 → datachain-0.3.10}/docs/references/torch.md +0 -0
  79. {datachain-0.3.9 → datachain-0.3.10}/docs/references/udf.md +0 -0
  80. {datachain-0.3.9 → datachain-0.3.10}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  81. {datachain-0.3.9 → datachain-0.3.10}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  82. {datachain-0.3.9 → datachain-0.3.10}/examples/computer_vision/openimage-detect.py +0 -0
  83. {datachain-0.3.9 → datachain-0.3.10}/examples/get_started/common_sql_functions.py +0 -0
  84. {datachain-0.3.9 → datachain-0.3.10}/examples/get_started/json-csv-reader.py +0 -0
  85. {datachain-0.3.9 → datachain-0.3.10}/examples/get_started/torch-loader.py +0 -0
  86. {datachain-0.3.9 → datachain-0.3.10}/examples/get_started/udfs/stateful.py +0 -0
  87. {datachain-0.3.9 → datachain-0.3.10}/examples/llm_and_nlp/claude-query.py +0 -0
  88. {datachain-0.3.9 → datachain-0.3.10}/examples/llm_and_nlp/unstructured-text.py +0 -0
  89. {datachain-0.3.9 → datachain-0.3.10}/examples/multimodal/clip_inference.py +0 -0
  90. {datachain-0.3.9 → datachain-0.3.10}/examples/multimodal/hf_pipeline.py +0 -0
  91. {datachain-0.3.9 → datachain-0.3.10}/examples/multimodal/openai_image_desc_lib.py +0 -0
  92. {datachain-0.3.9 → datachain-0.3.10}/examples/multimodal/wds.py +0 -0
  93. {datachain-0.3.9 → datachain-0.3.10}/examples/multimodal/wds_filtered.py +0 -0
  94. {datachain-0.3.9 → datachain-0.3.10}/mkdocs.yml +0 -0
  95. {datachain-0.3.9 → datachain-0.3.10}/noxfile.py +0 -0
  96. {datachain-0.3.9 → datachain-0.3.10}/setup.cfg +0 -0
  97. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/__init__.py +0 -0
  98. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/__main__.py +0 -0
  99. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/asyn.py +0 -0
  100. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/cache.py +0 -0
  101. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/catalog/__init__.py +0 -0
  102. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/catalog/datasource.py +0 -0
  103. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/catalog/loader.py +0 -0
  104. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/catalog/subclass.py +0 -0
  105. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/cli_utils.py +0 -0
  106. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/client/__init__.py +0 -0
  107. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/client/azure.py +0 -0
  108. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/client/fileslice.py +0 -0
  109. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/client/gcs.py +0 -0
  110. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/client/local.py +0 -0
  111. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/client/s3.py +0 -0
  112. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/config.py +0 -0
  113. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/data_storage/__init__.py +0 -0
  114. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/data_storage/db_engine.py +0 -0
  115. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/data_storage/id_generator.py +0 -0
  116. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/data_storage/job.py +0 -0
  117. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/data_storage/schema.py +0 -0
  118. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/data_storage/serializer.py +0 -0
  119. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/error.py +0 -0
  120. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/job.py +0 -0
  121. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/__init__.py +0 -0
  122. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/clip.py +0 -0
  123. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/convert/__init__.py +0 -0
  124. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/convert/flatten.py +0 -0
  125. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/convert/python_to_sql.py +0 -0
  126. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/convert/sql_to_python.py +0 -0
  127. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/convert/unflatten.py +0 -0
  128. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  129. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/data_model.py +0 -0
  130. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/image.py +0 -0
  131. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/model_store.py +0 -0
  132. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/pytorch.py +0 -0
  133. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/settings.py +0 -0
  134. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/text.py +0 -0
  135. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/udf.py +0 -0
  136. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/udf_signature.py +0 -0
  137. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/utils.py +0 -0
  138. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/vfile.py +0 -0
  139. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/webdataset.py +0 -0
  140. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/lib/webdataset_laion.py +0 -0
  141. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/listing.py +0 -0
  142. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/nodes_fetcher.py +0 -0
  143. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/nodes_thread_pool.py +0 -0
  144. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/progress.py +0 -0
  145. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/py.typed +0 -0
  146. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/query/__init__.py +0 -0
  147. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/query/batch.py +0 -0
  148. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/query/builtins.py +0 -0
  149. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/query/dispatch.py +0 -0
  150. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/query/params.py +0 -0
  151. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/query/queue.py +0 -0
  152. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/query/schema.py +0 -0
  153. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/query/session.py +0 -0
  154. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/query/udf.py +0 -0
  155. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/remote/__init__.py +0 -0
  156. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/remote/studio.py +0 -0
  157. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/__init__.py +0 -0
  158. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/default/__init__.py +0 -0
  159. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/default/base.py +0 -0
  160. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/functions/__init__.py +0 -0
  161. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/functions/array.py +0 -0
  162. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/functions/conditional.py +0 -0
  163. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/functions/path.py +0 -0
  164. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/functions/random.py +0 -0
  165. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/functions/string.py +0 -0
  166. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/selectable.py +0 -0
  167. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/sqlite/__init__.py +0 -0
  168. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/sqlite/base.py +0 -0
  169. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/sqlite/types.py +0 -0
  170. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/sqlite/vector.py +0 -0
  171. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/types.py +0 -0
  172. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/sql/utils.py +0 -0
  173. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/storage.py +0 -0
  174. {datachain-0.3.9 → datachain-0.3.10}/src/datachain/torch/__init__.py +0 -0
  175. {datachain-0.3.9 → datachain-0.3.10}/src/datachain.egg-info/dependency_links.txt +0 -0
  176. {datachain-0.3.9 → datachain-0.3.10}/src/datachain.egg-info/entry_points.txt +0 -0
  177. {datachain-0.3.9 → datachain-0.3.10}/src/datachain.egg-info/top_level.txt +0 -0
  178. {datachain-0.3.9 → datachain-0.3.10}/tests/__init__.py +0 -0
  179. {datachain-0.3.9 → datachain-0.3.10}/tests/benchmarks/__init__.py +0 -0
  180. {datachain-0.3.9 → datachain-0.3.10}/tests/benchmarks/conftest.py +0 -0
  181. {datachain-0.3.9 → datachain-0.3.10}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  182. {datachain-0.3.9 → datachain-0.3.10}/tests/benchmarks/datasets/.dvc/config +0 -0
  183. {datachain-0.3.9 → datachain-0.3.10}/tests/benchmarks/datasets/.gitignore +0 -0
  184. {datachain-0.3.9 → datachain-0.3.10}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  185. {datachain-0.3.9 → datachain-0.3.10}/tests/benchmarks/test_ls.py +0 -0
  186. {datachain-0.3.9 → datachain-0.3.10}/tests/benchmarks/test_version.py +0 -0
  187. {datachain-0.3.9 → datachain-0.3.10}/tests/data.py +0 -0
  188. {datachain-0.3.9 → datachain-0.3.10}/tests/examples/__init__.py +0 -0
  189. {datachain-0.3.9 → datachain-0.3.10}/tests/examples/test_examples.py +0 -0
  190. {datachain-0.3.9 → datachain-0.3.10}/tests/examples/test_wds_e2e.py +0 -0
  191. {datachain-0.3.9 → datachain-0.3.10}/tests/examples/wds_data.py +0 -0
  192. {datachain-0.3.9 → datachain-0.3.10}/tests/func/__init__.py +0 -0
  193. {datachain-0.3.9 → datachain-0.3.10}/tests/func/test_client.py +0 -0
  194. {datachain-0.3.9 → datachain-0.3.10}/tests/func/test_listing.py +0 -0
  195. {datachain-0.3.9 → datachain-0.3.10}/tests/func/test_pull.py +0 -0
  196. {datachain-0.3.9 → datachain-0.3.10}/tests/func/test_pytorch.py +0 -0
  197. {datachain-0.3.9 → datachain-0.3.10}/tests/scripts/name_len_slow.py +0 -0
  198. {datachain-0.3.9 → datachain-0.3.10}/tests/test_cli_e2e.py +0 -0
  199. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/__init__.py +0 -0
  200. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/__init__.py +0 -0
  201. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/conftest.py +0 -0
  202. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_clip.py +0 -0
  203. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  204. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_datachain_merge.py +0 -0
  205. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_feature.py +0 -0
  206. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_feature_utils.py +0 -0
  207. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_file.py +0 -0
  208. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_hf.py +0 -0
  209. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_image.py +0 -0
  210. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_schema.py +0 -0
  211. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_sql_to_python.py +0 -0
  212. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_text.py +0 -0
  213. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_udf_signature.py +0 -0
  214. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_utils.py +0 -0
  215. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/lib/test_webdataset.py +0 -0
  216. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/sql/__init__.py +0 -0
  217. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/sql/sqlite/__init__.py +0 -0
  218. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/sql/sqlite/test_utils.py +0 -0
  219. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/sql/test_array.py +0 -0
  220. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/sql/test_conditional.py +0 -0
  221. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/sql/test_path.py +0 -0
  222. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/sql/test_random.py +0 -0
  223. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/sql/test_selectable.py +0 -0
  224. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/sql/test_string.py +0 -0
  225. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_asyn.py +0 -0
  226. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_cache.py +0 -0
  227. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_catalog.py +0 -0
  228. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_catalog_loader.py +0 -0
  229. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_cli_parsing.py +0 -0
  230. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_client.py +0 -0
  231. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_client_s3.py +0 -0
  232. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_data_storage.py +0 -0
  233. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_database_engine.py +0 -0
  234. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_dispatch.py +0 -0
  235. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_fileslice.py +0 -0
  236. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_id_generator.py +0 -0
  237. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_metastore.py +0 -0
  238. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_module_exports.py +0 -0
  239. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_query_metrics.py +0 -0
  240. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_query_params.py +0 -0
  241. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_serializer.py +0 -0
  242. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_session.py +0 -0
  243. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_udf.py +0 -0
  244. {datachain-0.3.9 → datachain-0.3.10}/tests/unit/test_warehouse.py +0 -0
  245. {datachain-0.3.9 → datachain-0.3.10}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.6.2'
27
+ rev: 'v0.6.3'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.9
3
+ Version: 0.3.10
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -43,6 +43,7 @@ Requires-Dist: datamodel-code-generator>=0.25
43
43
  Requires-Dist: Pillow<11,>=10.0.0
44
44
  Requires-Dist: msgpack<2,>=1.0.4
45
45
  Requires-Dist: psutil
46
+ Requires-Dist: huggingface_hub
46
47
  Provides-Extra: docs
47
48
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
48
49
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -61,7 +62,7 @@ Provides-Extra: vector
61
62
  Requires-Dist: usearch; extra == "vector"
62
63
  Provides-Extra: hf
63
64
  Requires-Dist: numba>=0.60.0; extra == "hf"
64
- Requires-Dist: datasets[audio,vision]; extra == "hf"
65
+ Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
65
66
  Provides-Extra: tests
66
67
  Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
67
68
  Requires-Dist: pytest<9,>=8; extra == "tests"
@@ -31,7 +31,7 @@ def path_len_benchmark(path):
31
31
 
32
32
  # Run in chain
33
33
  DataChain.from_storage(
34
- path="gs://datachain-demo/dogs-and-cats/",
34
+ "gs://datachain-demo/dogs-and-cats/",
35
35
  ).settings(parallel=-1).map(
36
36
  path_len_benchmark,
37
37
  params=["file.path"],
@@ -11,7 +11,7 @@ def path_len(path):
11
11
  if __name__ == "__main__":
12
12
  # Run in chain
13
13
  DataChain.from_storage(
14
- path="gs://datachain-demo/dogs-and-cats/",
14
+ uri="gs://datachain-demo/dogs-and-cats/",
15
15
  ).map(
16
16
  path_len,
17
17
  params=["file.path"],
@@ -45,7 +45,8 @@ dependencies = [
45
45
  "datamodel-code-generator>=0.25",
46
46
  "Pillow>=10.0.0,<11",
47
47
  "msgpack>=1.0.4,<2",
48
- "psutil"
48
+ "psutil",
49
+ "huggingface_hub"
49
50
  ]
50
51
 
51
52
  [project.optional-dependencies]
@@ -71,7 +72,7 @@ vector = [
71
72
  ]
72
73
  hf = [
73
74
  "numba>=0.60.0",
74
- "datasets[audio,vision]"
75
+ "datasets[audio,vision]>=2.21.0"
75
76
  ]
76
77
  tests = [
77
78
  "datachain[torch,remote,vector,hf]",
@@ -156,8 +156,6 @@ class QueryResult(NamedTuple):
156
156
  dataset: Optional[DatasetRecord]
157
157
  version: Optional[int]
158
158
  output: str
159
- preview: Optional[list[dict]]
160
- metrics: dict[str, Any]
161
159
 
162
160
 
163
161
  class DatasetRowsFetcher(NodesThreadPool):
@@ -1020,20 +1018,6 @@ class Catalog:
1020
1018
 
1021
1019
  return node_groups
1022
1020
 
1023
- def unlist_source(self, uri: StorageURI) -> None:
1024
- self.metastore.clone(uri=uri).mark_storage_not_indexed(uri)
1025
-
1026
- def storage_stats(self, uri: StorageURI) -> Optional[DatasetStats]:
1027
- """
1028
- Returns tuple with storage stats: total number of rows and total dataset size.
1029
- """
1030
- partial_path = self.metastore.get_last_partial_path(uri)
1031
- if partial_path is None:
1032
- return None
1033
- dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
1034
-
1035
- return self.dataset_stats(dataset.name, dataset.latest_version)
1036
-
1037
1021
  def create_dataset(
1038
1022
  self,
1039
1023
  name: str,
@@ -1297,19 +1281,6 @@ class Catalog:
1297
1281
 
1298
1282
  return self.get_dataset(name)
1299
1283
 
1300
- def register_new_dataset(
1301
- self,
1302
- source_dataset: DatasetRecord,
1303
- source_version: int,
1304
- target_name: str,
1305
- ) -> DatasetRecord:
1306
- target_dataset = self.metastore.create_dataset(
1307
- target_name,
1308
- query_script=source_dataset.query_script,
1309
- schema=source_dataset.serialized_schema,
1310
- )
1311
- return self.register_dataset(source_dataset, source_version, target_dataset, 1)
1312
-
1313
1284
  def register_dataset(
1314
1285
  self,
1315
1286
  dataset: DatasetRecord,
@@ -1422,17 +1393,18 @@ class Catalog:
1422
1393
 
1423
1394
  return direct_dependencies
1424
1395
 
1425
- def ls_datasets(self) -> Iterator[DatasetRecord]:
1396
+ def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetRecord]:
1426
1397
  datasets = self.metastore.list_datasets()
1427
1398
  for d in datasets:
1428
- if not d.is_bucket_listing:
1399
+ if not d.is_bucket_listing or include_listing:
1429
1400
  yield d
1430
1401
 
1431
1402
  def list_datasets_versions(
1432
1403
  self,
1404
+ include_listing: bool = False,
1433
1405
  ) -> Iterator[tuple[DatasetRecord, "DatasetVersion", Optional["Job"]]]:
1434
1406
  """Iterate over all dataset versions with related jobs."""
1435
- datasets = list(self.ls_datasets())
1407
+ datasets = list(self.ls_datasets(include_listing=include_listing))
1436
1408
 
1437
1409
  # preselect dataset versions jobs from db to avoid multiple queries
1438
1410
  jobs_ids: set[str] = {
@@ -1632,15 +1604,6 @@ class Catalog:
1632
1604
  for source in data_sources: # type: ignore [union-attr]
1633
1605
  yield source, source.ls(fields)
1634
1606
 
1635
- def ls_storage_uris(self) -> Iterator[str]:
1636
- yield from self.metastore.get_all_storage_uris()
1637
-
1638
- def get_storage(self, uri: StorageURI) -> Storage:
1639
- return self.metastore.get_storage(uri)
1640
-
1641
- def ls_storages(self) -> list[Storage]:
1642
- return self.metastore.list_storages()
1643
-
1644
1607
  def pull_dataset(
1645
1608
  self,
1646
1609
  dataset_uri: str,
@@ -1874,10 +1837,6 @@ class Catalog:
1874
1837
  envs: Optional[Mapping[str, str]] = None,
1875
1838
  python_executable: Optional[str] = None,
1876
1839
  save: bool = False,
1877
- save_as: Optional[str] = None,
1878
- preview_limit: int = 10,
1879
- preview_offset: int = 0,
1880
- preview_columns: Optional[list[str]] = None,
1881
1840
  capture_output: bool = True,
1882
1841
  output_hook: Callable[[str], None] = noop,
1883
1842
  params: Optional[dict[str, str]] = None,
@@ -1905,7 +1864,6 @@ class Catalog:
1905
1864
  C.size > 1000
1906
1865
  )
1907
1866
  """
1908
- from datachain.query.dataset import ExecutionResult
1909
1867
 
1910
1868
  feature_file = tempfile.NamedTemporaryFile( # noqa: SIM115
1911
1869
  dir=os.getcwd(), suffix=".py", delete=False
@@ -1922,11 +1880,7 @@ class Catalog:
1922
1880
  feature_module,
1923
1881
  output_hook,
1924
1882
  params,
1925
- preview_columns,
1926
- preview_limit,
1927
- preview_offset,
1928
1883
  save,
1929
- save_as,
1930
1884
  job_id,
1931
1885
  )
1932
1886
  finally:
@@ -1955,25 +1909,18 @@ class Catalog:
1955
1909
  )
1956
1910
 
1957
1911
  try:
1958
- response = json.loads(response_text)
1912
+ result = json.loads(response_text)
1959
1913
  except ValueError:
1960
- response = {}
1961
- exec_result = ExecutionResult(**response)
1914
+ result = None
1962
1915
 
1963
1916
  dataset: Optional[DatasetRecord] = None
1964
1917
  version: Optional[int] = None
1965
- if save or save_as:
1918
+ if save:
1966
1919
  dataset, version = self.save_result(
1967
- query_script, exec_result, output, version, job_id
1920
+ query_script, result, output, version, job_id
1968
1921
  )
1969
1922
 
1970
- return QueryResult(
1971
- dataset=dataset,
1972
- version=version,
1973
- output=output,
1974
- preview=exec_result.preview,
1975
- metrics=exec_result.metrics,
1976
- )
1923
+ return QueryResult(dataset=dataset, version=version, output=output)
1977
1924
 
1978
1925
  def run_query(
1979
1926
  self,
@@ -1985,11 +1932,7 @@ class Catalog:
1985
1932
  feature_module: str,
1986
1933
  output_hook: Callable[[str], None],
1987
1934
  params: Optional[dict[str, str]],
1988
- preview_columns: Optional[list[str]],
1989
- preview_limit: int,
1990
- preview_offset: int,
1991
1935
  save: bool,
1992
- save_as: Optional[str],
1993
1936
  job_id: Optional[str],
1994
1937
  ) -> tuple[list[str], subprocess.Popen, str]:
1995
1938
  try:
@@ -2004,10 +1947,6 @@ class Catalog:
2004
1947
  raise QueryScriptCompileError(
2005
1948
  f"Query script failed to compile, reason: {exc}"
2006
1949
  ) from exc
2007
- if save_as and save_as.startswith(QUERY_DATASET_PREFIX):
2008
- raise ValueError(
2009
- f"Cannot use {QUERY_DATASET_PREFIX} prefix for dataset name"
2010
- )
2011
1950
  r, w = os.pipe()
2012
1951
  if os.name == "nt":
2013
1952
  import msvcrt
@@ -2030,15 +1969,7 @@ class Catalog:
2030
1969
  {
2031
1970
  "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
2032
1971
  "PYTHONPATH": os.getcwd(), # For local imports
2033
- "DATACHAIN_QUERY_PREVIEW_ARGS": json.dumps(
2034
- {
2035
- "limit": preview_limit,
2036
- "offset": preview_offset,
2037
- "columns": preview_columns,
2038
- }
2039
- ),
2040
1972
  "DATACHAIN_QUERY_SAVE": "1" if save else "",
2041
- "DATACHAIN_QUERY_SAVE_AS": save_as or "",
2042
1973
  "PYTHONUNBUFFERED": "1",
2043
1974
  "DATACHAIN_OUTPUT_FD": str(handle),
2044
1975
  "DATACHAIN_JOB_ID": job_id or "",
@@ -2068,12 +1999,12 @@ class Catalog:
2068
1999
  return lines, proc, response_text
2069
2000
 
2070
2001
  def save_result(self, query_script, exec_result, output, version, job_id):
2071
- if not exec_result.dataset:
2002
+ if not exec_result:
2072
2003
  raise QueryScriptDatasetNotFound(
2073
2004
  "No dataset found after running Query script",
2074
2005
  output=output,
2075
2006
  )
2076
- name, version = exec_result.dataset
2007
+ name, version = exec_result
2077
2008
  # finding returning dataset
2078
2009
  try:
2079
2010
  dataset = self.get_dataset(name)
@@ -14,6 +14,7 @@ import shtab
14
14
 
15
15
  from datachain import utils
16
16
  from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
17
+ from datachain.lib.dc import DataChain
17
18
  from datachain.utils import DataChainDir
18
19
 
19
20
  if TYPE_CHECKING:
@@ -472,9 +473,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
472
473
  query_parser.add_argument(
473
474
  "script", metavar="<script.py>", type=str, help="Filepath for script"
474
475
  )
475
- query_parser.add_argument(
476
- "dataset_name", nargs="?", type=str, help="Save result dataset as"
477
- )
478
476
  query_parser.add_argument(
479
477
  "--parallel",
480
478
  nargs="?",
@@ -487,7 +485,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
487
485
  "N defaults to the CPU count."
488
486
  ),
489
487
  )
490
- add_show_args(query_parser)
491
488
  query_parser.add_argument(
492
489
  "-p",
493
490
  "--param",
@@ -619,18 +616,6 @@ def _ls_urls_flat(
619
616
  raise FileNotFoundError(f"No such file or directory: {source}")
620
617
 
621
618
 
622
- def ls_indexed_storages(catalog: "Catalog", long: bool = False) -> Iterator[str]:
623
- from datachain.node import long_line_str
624
-
625
- storage_uris = catalog.ls_storage_uris()
626
- if long:
627
- for uri in storage_uris:
628
- # TODO: add Storage.created so it can be used here
629
- yield long_line_str(uri, None, "")
630
- else:
631
- yield from storage_uris
632
-
633
-
634
619
  def ls_local(
635
620
  sources,
636
621
  long: bool = False,
@@ -661,8 +646,9 @@ def ls_local(
661
646
  for entry in entries:
662
647
  print(format_ls_entry(entry))
663
648
  else:
664
- for entry in ls_indexed_storages(catalog, long=long):
665
- print(format_ls_entry(entry))
649
+ chain = DataChain.listings()
650
+ for ls in chain.collect("listing"):
651
+ print(format_ls_entry(f"{ls.uri}@v{ls.version}")) # type: ignore[union-attr]
666
652
 
667
653
 
668
654
  def format_ls_entry(entry: str) -> str:
@@ -813,16 +799,10 @@ def show(
813
799
  def query(
814
800
  catalog: "Catalog",
815
801
  script: str,
816
- dataset_name: Optional[str] = None,
817
802
  parallel: Optional[int] = None,
818
- limit: int = 10,
819
- offset: int = 0,
820
- columns: Optional[list[str]] = None,
821
- no_collapse: bool = False,
822
803
  params: Optional[dict[str, str]] = None,
823
804
  ) -> None:
824
805
  from datachain.data_storage import JobQueryType, JobStatus
825
- from datachain.utils import show_records
826
806
 
827
807
  with open(script, encoding="utf-8") as f:
828
808
  script_content = f.read()
@@ -843,13 +823,9 @@ def query(
843
823
  )
844
824
 
845
825
  try:
846
- result = catalog.query(
826
+ catalog.query(
847
827
  script_content,
848
828
  python_executable=python_executable,
849
- save_as=dataset_name,
850
- preview_limit=limit,
851
- preview_offset=offset,
852
- preview_columns=columns,
853
829
  capture_output=False,
854
830
  params=params,
855
831
  job_id=job_id,
@@ -864,10 +840,7 @@ def query(
864
840
  error_stack=error_stack,
865
841
  )
866
842
  raise
867
-
868
- catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE, metrics=result.metrics)
869
-
870
- show_records(result.preview, collapse_columns=not no_collapse)
843
+ catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)
871
844
 
872
845
 
873
846
  def clear_cache(catalog: "Catalog"):
@@ -1042,12 +1015,7 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
1042
1015
  query(
1043
1016
  catalog,
1044
1017
  args.script,
1045
- dataset_name=args.dataset_name,
1046
1018
  parallel=args.parallel,
1047
- limit=args.limit,
1048
- offset=args.offset,
1049
- columns=args.columns,
1050
- no_collapse=args.no_collapse,
1051
1019
  params=args.param,
1052
1020
  )
1053
1021
  elif args.command == "apply-udf":
@@ -87,6 +87,7 @@ class Client(ABC):
87
87
  def get_implementation(url: str) -> type["Client"]:
88
88
  from .azure import AzureClient
89
89
  from .gcs import GCSClient
90
+ from .hf import HfClient
90
91
  from .local import FileClient
91
92
  from .s3 import ClientS3
92
93
 
@@ -104,6 +105,8 @@ class Client(ABC):
104
105
  return AzureClient
105
106
  if protocol == FileClient.protocol:
106
107
  return FileClient
108
+ if protocol == HfClient.protocol:
109
+ return HfClient
107
110
 
108
111
  raise NotImplementedError(f"Unsupported protocol: {protocol}")
109
112
 
@@ -0,0 +1,47 @@
1
+ import os
2
+ import posixpath
3
+ from typing import Any, cast
4
+
5
+ from huggingface_hub import HfFileSystem
6
+
7
+ from datachain.lib.file import File
8
+ from datachain.node import Entry
9
+
10
+ from .fsspec import Client
11
+
12
+
13
+ class HfClient(Client):
14
+ FS_CLASS = HfFileSystem
15
+ PREFIX = "hf://"
16
+ protocol = "hf"
17
+
18
+ @classmethod
19
+ def create_fs(cls, **kwargs) -> HfFileSystem:
20
+ if os.environ.get("HF_TOKEN"):
21
+ kwargs["token"] = os.environ["HF_TOKEN"]
22
+
23
+ return cast(HfFileSystem, super().create_fs(**kwargs))
24
+
25
+ def convert_info(self, v: dict[str, Any], path: str) -> Entry:
26
+ return Entry.from_file(
27
+ path=path,
28
+ size=v["size"],
29
+ version=v["last_commit"].oid,
30
+ etag=v.get("blob_id", ""),
31
+ last_modified=v["last_commit"].date,
32
+ )
33
+
34
+ def info_to_file(self, v: dict[str, Any], path: str) -> File:
35
+ return File(
36
+ path=path,
37
+ size=v["size"],
38
+ version=v["last_commit"].oid,
39
+ etag=v.get("blob_id", ""),
40
+ last_modified=v["last_commit"].date,
41
+ )
42
+
43
+ async def ls_dir(self, path):
44
+ return self.fs.ls(path, detail=True)
45
+
46
+ def rel_path(self, path):
47
+ return posixpath.relpath(path, self.name)
@@ -167,21 +167,10 @@ class AbstractMetastore(ABC, Serializable):
167
167
  This method should be called when index operation is finished.
168
168
  """
169
169
 
170
- @abstractmethod
171
- def mark_storage_not_indexed(self, uri: StorageURI) -> None:
172
- """
173
- Mark storage as not indexed.
174
- This method should be called when storage index is deleted.
175
- """
176
-
177
170
  @abstractmethod
178
171
  def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
179
172
  """Updates last inserted datetime in bucket with current time."""
180
173
 
181
- @abstractmethod
182
- def get_all_storage_uris(self) -> Iterator[StorageURI]:
183
- """Returns all storage uris."""
184
-
185
174
  @abstractmethod
186
175
  def get_storage(self, uri: StorageURI) -> Storage:
187
176
  """
@@ -189,10 +178,6 @@ class AbstractMetastore(ABC, Serializable):
189
178
  E.g. if s3 is used as storage this would be s3 bucket data.
190
179
  """
191
180
 
192
- @abstractmethod
193
- def list_storages(self) -> list[Storage]:
194
- """Returns all storages."""
195
-
196
181
  @abstractmethod
197
182
  def mark_storage_pending(self, storage: Storage) -> Storage:
198
183
  """Marks storage as pending."""
@@ -324,7 +309,7 @@ class AbstractMetastore(ABC, Serializable):
324
309
  self.add_dataset_dependency(
325
310
  source_dataset_name,
326
311
  source_dataset_version,
327
- dependency.name,
312
+ dependency.dataset_name,
328
313
  int(dependency.version),
329
314
  )
330
315
  else:
@@ -906,11 +891,6 @@ class AbstractDBMetastore(AbstractMetastore):
906
891
  self._storages_update().where(s.c.uri == uri).values(**updates) # type: ignore [attr-defined]
907
892
  )
908
893
 
909
- def get_all_storage_uris(self) -> Iterator[StorageURI]:
910
- """Returns all storage uris."""
911
- s = self._storages
912
- yield from (r[0] for r in self.db.execute(self._storages_select(s.c.uri)))
913
-
914
894
  def get_storage(self, uri: StorageURI, conn=None) -> Storage:
915
895
  """
916
896
  Gets storage representation from database.
@@ -926,13 +906,6 @@ class AbstractDBMetastore(AbstractMetastore):
926
906
 
927
907
  return self.storage_class._make(result)
928
908
 
929
- def list_storages(self) -> list[Storage]:
930
- result = self.db.execute(self._storages_select())
931
- if not result:
932
- return []
933
-
934
- return [self.storage_class._make(r) for r in result]
935
-
936
909
  def mark_storage_pending(self, storage: Storage, conn=None) -> Storage:
937
910
  # Update status to pending and dates
938
911
  updates = {
@@ -1503,7 +1476,7 @@ class AbstractDBMetastore(AbstractMetastore):
1503
1476
  return self._jobs.update().where(*where)
1504
1477
 
1505
1478
  def _parse_job(self, rows) -> Job:
1506
- return Job.parse(*rows)
1479
+ return self.job_class.parse(*rows)
1507
1480
 
1508
1481
  def _parse_jobs(self, rows) -> Iterator["Job"]:
1509
1482
  for _, g in groupby(rows, lambda r: r[0]):
@@ -143,7 +143,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
143
143
  db.execute("PRAGMA synchronous = NORMAL")
144
144
  db.execute("PRAGMA case_sensitive_like = ON")
145
145
  if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
146
- db.set_trace_callback(print)
146
+ import sys
147
+
148
+ db.set_trace_callback(sys.stderr.write)
147
149
 
148
150
  load_usearch_extension(db)
149
151
 
@@ -515,17 +517,6 @@ class SQLiteMetastore(AbstractDBMetastore):
515
517
  def _datasets_dependencies_insert(self) -> "Insert":
516
518
  return sqlite.insert(self._datasets_dependencies)
517
519
 
518
- #
519
- # Storages
520
- #
521
-
522
- def mark_storage_not_indexed(self, uri: StorageURI) -> None:
523
- """
524
- Mark storage as not indexed.
525
- This method should be called when storage index is deleted.
526
- """
527
- self.db.execute(self._storages_delete().where(self._storages.c.uri == uri))
528
-
529
520
  #
530
521
  # Dataset dependencies
531
522
  #
@@ -218,35 +218,26 @@ class AbstractWarehouse(ABC, Serializable):
218
218
  results = None
219
219
  offset = 0
220
220
  num_yielded = 0
221
- try:
222
- while True:
223
- if limit is not None:
224
- limit -= num_yielded
225
- if limit == 0:
226
- break
227
- if limit < page_size:
228
- paginated_query = paginated_query.limit(None).limit(limit)
229
-
230
- results = self.dataset_rows_select(paginated_query.offset(offset))
231
-
232
- processed = False
233
- for row in results:
234
- processed = True
235
- yield row
236
- num_yielded += 1
237
-
238
- if not processed:
239
- break # no more results
240
- offset += page_size
241
- finally:
242
- # https://www2.sqlite.org/cvstrac/wiki?p=DatabaseIsLocked (SELECT not
243
- # finalized or reset) to prevent database table is locked error when an
244
- # exception is raised in the middle of processing the results (e.g.
245
- # https://github.com/iterative/dvcx/issues/924). Connections close
246
- # apparently is not enough in some cases, at least on sqlite
247
- # https://www.sqlite.org/c3ref/close.html
248
- if results and hasattr(results, "close"):
249
- results.close()
221
+
222
+ while True:
223
+ if limit is not None:
224
+ limit -= num_yielded
225
+ if limit == 0:
226
+ break
227
+ if limit < page_size:
228
+ paginated_query = paginated_query.limit(None).limit(limit)
229
+
230
+ results = self.dataset_rows_select(paginated_query.offset(offset))
231
+
232
+ processed = False
233
+ for row in results:
234
+ processed = True
235
+ yield row
236
+ num_yielded += 1
237
+
238
+ if not processed:
239
+ break # no more results
240
+ offset += page_size
250
241
 
251
242
  #
252
243
  # Table Name Internal Functions