datachain 0.2.15.tar.gz → 0.2.17.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic; review the file list and diff hunks below for details.

Files changed (256)
  1. {datachain-0.2.15/src/datachain.egg-info → datachain-0.2.17}/PKG-INFO +71 -12
  2. {datachain-0.2.15 → datachain-0.2.17}/README.rst +70 -11
  3. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/iptc_exif_xmp_lib.py +2 -1
  4. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/openimage-detect.py +1 -1
  5. {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/json-csv-reader.py +6 -7
  6. {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/torch-loader.py +1 -1
  7. {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/wds.py +20 -11
  8. {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/wds_filtered.py +1 -0
  9. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/catalog/catalog.py +52 -51
  10. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/cli.py +1 -1
  11. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/db_engine.py +6 -2
  12. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/id_generator.py +14 -0
  13. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/metastore.py +15 -2
  14. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/sqlite.py +45 -6
  15. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/warehouse.py +17 -6
  16. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/arrow.py +22 -7
  17. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/dc.py +37 -26
  18. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/file.py +3 -3
  19. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/signal_schema.py +37 -6
  20. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/listing.py +22 -10
  21. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/dataset.py +17 -17
  22. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/session.py +19 -4
  23. {datachain-0.2.15 → datachain-0.2.17/src/datachain.egg-info}/PKG-INFO +71 -12
  24. {datachain-0.2.15 → datachain-0.2.17}/tests/conftest.py +50 -23
  25. {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_catalog.py +1 -1
  26. {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_datachain.py +25 -15
  27. {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_dataset_query.py +43 -0
  28. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_arrow.py +0 -17
  29. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_datachain.py +372 -156
  30. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_datachain_merge.py +24 -20
  31. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_feature_utils.py +4 -4
  32. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_signal_schema.py +29 -2
  33. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_catalog_loader.py +24 -30
  34. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_data_storage.py +17 -17
  35. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_database_engine.py +9 -11
  36. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_id_generator.py +6 -8
  37. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_metastore.py +7 -9
  38. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_warehouse.py +7 -9
  39. {datachain-0.2.15 → datachain-0.2.17}/.cruft.json +0 -0
  40. {datachain-0.2.15 → datachain-0.2.17}/.gitattributes +0 -0
  41. {datachain-0.2.15 → datachain-0.2.17}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  42. {datachain-0.2.15 → datachain-0.2.17}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  43. {datachain-0.2.15 → datachain-0.2.17}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  44. {datachain-0.2.15 → datachain-0.2.17}/.github/codecov.yaml +0 -0
  45. {datachain-0.2.15 → datachain-0.2.17}/.github/dependabot.yml +0 -0
  46. {datachain-0.2.15 → datachain-0.2.17}/.github/workflows/benchmarks.yml +0 -0
  47. {datachain-0.2.15 → datachain-0.2.17}/.github/workflows/release.yml +0 -0
  48. {datachain-0.2.15 → datachain-0.2.17}/.github/workflows/tests.yml +0 -0
  49. {datachain-0.2.15 → datachain-0.2.17}/.github/workflows/update-template.yaml +0 -0
  50. {datachain-0.2.15 → datachain-0.2.17}/.gitignore +0 -0
  51. {datachain-0.2.15 → datachain-0.2.17}/.pre-commit-config.yaml +0 -0
  52. {datachain-0.2.15 → datachain-0.2.17}/CODE_OF_CONDUCT.rst +0 -0
  53. {datachain-0.2.15 → datachain-0.2.17}/CONTRIBUTING.rst +0 -0
  54. {datachain-0.2.15 → datachain-0.2.17}/LICENSE +0 -0
  55. {datachain-0.2.15 → datachain-0.2.17}/docs/assets/captioned_cartoons.png +0 -0
  56. {datachain-0.2.15 → datachain-0.2.17}/docs/assets/datachain.png +0 -0
  57. {datachain-0.2.15 → datachain-0.2.17}/docs/assets/flowchart.png +0 -0
  58. {datachain-0.2.15 → datachain-0.2.17}/docs/index.md +0 -0
  59. {datachain-0.2.15 → datachain-0.2.17}/docs/references/datachain.md +0 -0
  60. {datachain-0.2.15 → datachain-0.2.17}/docs/references/datatype.md +0 -0
  61. {datachain-0.2.15 → datachain-0.2.17}/docs/references/file.md +0 -0
  62. {datachain-0.2.15 → datachain-0.2.17}/docs/references/index.md +0 -0
  63. {datachain-0.2.15 → datachain-0.2.17}/docs/references/sql.md +0 -0
  64. {datachain-0.2.15 → datachain-0.2.17}/docs/references/torch.md +0 -0
  65. {datachain-0.2.15 → datachain-0.2.17}/docs/references/udf.md +0 -0
  66. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
  67. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
  68. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
  69. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
  70. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
  71. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
  72. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/README.md +0 -0
  73. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
  74. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
  75. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
  76. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
  77. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
  78. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
  79. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
  80. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
  81. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  82. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  83. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  84. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  85. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  86. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  87. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  88. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  89. {datachain-0.2.15 → datachain-0.2.17}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  90. {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/common_sql_functions.py +0 -0
  91. {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/json-metadata-tutorial.ipynb +0 -0
  92. {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/udfs/parallel.py +0 -0
  93. {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/udfs/simple.py +0 -0
  94. {datachain-0.2.15 → datachain-0.2.17}/examples/get_started/udfs/stateful.py +0 -0
  95. {datachain-0.2.15 → datachain-0.2.17}/examples/llm/llm_chatbot_evaluation.ipynb +0 -0
  96. {datachain-0.2.15 → datachain-0.2.17}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
  97. {datachain-0.2.15 → datachain-0.2.17}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
  98. {datachain-0.2.15 → datachain-0.2.17}/examples/llm_and_nlp/llm-claude.py +0 -0
  99. {datachain-0.2.15 → datachain-0.2.17}/examples/llm_and_nlp/unstructured-text.py +0 -0
  100. {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
  101. {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/clip_inference.py +0 -0
  102. {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/hf_pipeline.py +0 -0
  103. {datachain-0.2.15 → datachain-0.2.17}/examples/multimodal/openai_image_desc_lib.py +0 -0
  104. {datachain-0.2.15 → datachain-0.2.17}/mkdocs.yml +0 -0
  105. {datachain-0.2.15 → datachain-0.2.17}/noxfile.py +0 -0
  106. {datachain-0.2.15 → datachain-0.2.17}/pyproject.toml +0 -0
  107. {datachain-0.2.15 → datachain-0.2.17}/setup.cfg +0 -0
  108. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/__init__.py +0 -0
  109. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/__main__.py +0 -0
  110. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/asyn.py +0 -0
  111. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/cache.py +0 -0
  112. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/catalog/__init__.py +0 -0
  113. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/catalog/datasource.py +0 -0
  114. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/catalog/loader.py +0 -0
  115. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/catalog/subclass.py +0 -0
  116. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/cli_utils.py +0 -0
  117. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/__init__.py +0 -0
  118. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/azure.py +0 -0
  119. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/fileslice.py +0 -0
  120. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/fsspec.py +0 -0
  121. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/gcs.py +0 -0
  122. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/local.py +0 -0
  123. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/client/s3.py +0 -0
  124. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/config.py +0 -0
  125. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/__init__.py +0 -0
  126. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/job.py +0 -0
  127. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/schema.py +0 -0
  128. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/data_storage/serializer.py +0 -0
  129. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/dataset.py +0 -0
  130. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/error.py +0 -0
  131. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/job.py +0 -0
  132. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/__init__.py +0 -0
  133. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/clip.py +0 -0
  134. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/__init__.py +0 -0
  135. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/flatten.py +0 -0
  136. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/python_to_sql.py +0 -0
  137. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/sql_to_python.py +0 -0
  138. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/unflatten.py +0 -0
  139. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  140. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/data_model.py +0 -0
  141. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/dataset_info.py +0 -0
  142. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/image.py +0 -0
  143. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/meta_formats.py +0 -0
  144. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/model_store.py +0 -0
  145. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/pytorch.py +0 -0
  146. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/settings.py +0 -0
  147. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/text.py +0 -0
  148. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/udf.py +0 -0
  149. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/udf_signature.py +0 -0
  150. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/utils.py +0 -0
  151. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/vfile.py +0 -0
  152. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/webdataset.py +0 -0
  153. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/lib/webdataset_laion.py +0 -0
  154. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/node.py +0 -0
  155. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/nodes_fetcher.py +0 -0
  156. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/nodes_thread_pool.py +0 -0
  157. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/progress.py +0 -0
  158. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/py.typed +0 -0
  159. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/__init__.py +0 -0
  160. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/batch.py +0 -0
  161. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/builtins.py +0 -0
  162. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/dispatch.py +0 -0
  163. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/metrics.py +0 -0
  164. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/params.py +0 -0
  165. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/schema.py +0 -0
  166. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/query/udf.py +0 -0
  167. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/remote/__init__.py +0 -0
  168. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/remote/studio.py +0 -0
  169. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/__init__.py +0 -0
  170. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/default/__init__.py +0 -0
  171. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/default/base.py +0 -0
  172. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/__init__.py +0 -0
  173. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/array.py +0 -0
  174. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/conditional.py +0 -0
  175. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/path.py +0 -0
  176. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/random.py +0 -0
  177. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/functions/string.py +0 -0
  178. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/selectable.py +0 -0
  179. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/sqlite/__init__.py +0 -0
  180. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/sqlite/base.py +0 -0
  181. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/sqlite/types.py +0 -0
  182. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/sqlite/vector.py +0 -0
  183. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/types.py +0 -0
  184. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/sql/utils.py +0 -0
  185. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/storage.py +0 -0
  186. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/torch/__init__.py +0 -0
  187. {datachain-0.2.15 → datachain-0.2.17}/src/datachain/utils.py +0 -0
  188. {datachain-0.2.15 → datachain-0.2.17}/src/datachain.egg-info/SOURCES.txt +0 -0
  189. {datachain-0.2.15 → datachain-0.2.17}/src/datachain.egg-info/dependency_links.txt +0 -0
  190. {datachain-0.2.15 → datachain-0.2.17}/src/datachain.egg-info/entry_points.txt +0 -0
  191. {datachain-0.2.15 → datachain-0.2.17}/src/datachain.egg-info/requires.txt +0 -0
  192. {datachain-0.2.15 → datachain-0.2.17}/src/datachain.egg-info/top_level.txt +0 -0
  193. {datachain-0.2.15 → datachain-0.2.17}/tests/__init__.py +0 -0
  194. {datachain-0.2.15 → datachain-0.2.17}/tests/benchmarks/__init__.py +0 -0
  195. {datachain-0.2.15 → datachain-0.2.17}/tests/benchmarks/conftest.py +0 -0
  196. {datachain-0.2.15 → datachain-0.2.17}/tests/benchmarks/test_ls.py +0 -0
  197. {datachain-0.2.15 → datachain-0.2.17}/tests/benchmarks/test_version.py +0 -0
  198. {datachain-0.2.15 → datachain-0.2.17}/tests/data.py +0 -0
  199. {datachain-0.2.15 → datachain-0.2.17}/tests/examples/__init__.py +0 -0
  200. {datachain-0.2.15 → datachain-0.2.17}/tests/examples/test_wds_e2e.py +0 -0
  201. {datachain-0.2.15 → datachain-0.2.17}/tests/examples/wds_data.py +0 -0
  202. {datachain-0.2.15 → datachain-0.2.17}/tests/func/__init__.py +0 -0
  203. {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_client.py +0 -0
  204. {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_datasets.py +0 -0
  205. {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_feature_pickling.py +0 -0
  206. {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_ls.py +0 -0
  207. {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_pull.py +0 -0
  208. {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_pytorch.py +0 -0
  209. {datachain-0.2.15 → datachain-0.2.17}/tests/func/test_query.py +0 -0
  210. {datachain-0.2.15 → datachain-0.2.17}/tests/scripts/feature_class.py +0 -0
  211. {datachain-0.2.15 → datachain-0.2.17}/tests/scripts/feature_class_parallel.py +0 -0
  212. {datachain-0.2.15 → datachain-0.2.17}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  213. {datachain-0.2.15 → datachain-0.2.17}/tests/scripts/name_len_normal.py +0 -0
  214. {datachain-0.2.15 → datachain-0.2.17}/tests/scripts/name_len_slow.py +0 -0
  215. {datachain-0.2.15 → datachain-0.2.17}/tests/test_cli_e2e.py +0 -0
  216. {datachain-0.2.15 → datachain-0.2.17}/tests/test_query_e2e.py +0 -0
  217. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/__init__.py +0 -0
  218. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/__init__.py +0 -0
  219. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/conftest.py +0 -0
  220. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_clip.py +0 -0
  221. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  222. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_feature.py +0 -0
  223. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_file.py +0 -0
  224. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_image.py +0 -0
  225. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_text.py +0 -0
  226. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_udf_signature.py +0 -0
  227. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_utils.py +0 -0
  228. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/lib/test_webdataset.py +0 -0
  229. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/__init__.py +0 -0
  230. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/sqlite/__init__.py +0 -0
  231. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/sqlite/test_utils.py +0 -0
  232. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_array.py +0 -0
  233. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_conditional.py +0 -0
  234. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_path.py +0 -0
  235. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_random.py +0 -0
  236. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_selectable.py +0 -0
  237. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/sql/test_string.py +0 -0
  238. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_asyn.py +0 -0
  239. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_cache.py +0 -0
  240. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_catalog.py +0 -0
  241. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_cli_parsing.py +0 -0
  242. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_client.py +0 -0
  243. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_client_s3.py +0 -0
  244. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_dataset.py +0 -0
  245. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_dispatch.py +0 -0
  246. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_fileslice.py +0 -0
  247. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_listing.py +0 -0
  248. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_module_exports.py +0 -0
  249. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_query_metrics.py +0 -0
  250. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_query_params.py +0 -0
  251. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_serializer.py +0 -0
  252. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_session.py +0 -0
  253. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_storage.py +0 -0
  254. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_udf.py +0 -0
  255. {datachain-0.2.15 → datachain-0.2.17}/tests/unit/test_utils.py +0 -0
  256. {datachain-0.2.15 → datachain-0.2.17}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.15
3
+ Version: 0.2.17
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -100,28 +100,87 @@ Requires-Dist: types-requests; extra == "dev"
100
100
  AI 🔗 DataChain
101
101
  ----------------
102
102
 
103
- DataChain is an open-source Python library for processing and curating unstructured
104
- data at scale.
103
+ DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
104
+ AI engineers build a metadata layer on top of unstructured files and analyze data using
105
+ this layer.
105
106
 
106
- 🤖 AI-Driven Data Curation: Use local ML models or LLM APIs calls to enrich your data.
107
+ 📂 **Raw Files Processing**
108
+ Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
109
+ Local), version and update datasets.
107
110
 
108
- 🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
111
+ 🌟 **Metadata layer.**
112
+ Build a metadata layer on top of files using structured sources like CSV, Parquet,
113
+ and JSON files.
109
114
 
110
- 🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
115
+ **Metadata enrichment.**
116
+ Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
111
117
 
118
+ 🛠️ **Data Transformation.**
119
+ Transform metadata using traditional methods like filtering, grouping, joining, and
120
+ others.
112
121
 
113
- Datachain supports parallel processing, parallel data
114
- downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
115
-
116
- The typical use cases include Computer Vision data curation, LLM analytics,
117
- and validation of multimodal AI applications.
122
+ 🐍 **User-friendly interface.**
123
+ Operate efficiently with familiar Python objects and object fields, eliminating the
124
+ need for SQL.
118
125
 
119
126
 
120
127
  .. code:: console
121
128
 
122
129
  $ pip install datachain
123
130
 
124
- |Flowchart|
131
+
132
+ Data Structures
133
+ ===============
134
+
135
+ DataChain introduces expressive data structures tailored for AI-specific workload:
136
+
137
+ - **Dataset:** Preserves the file-references and meta-information. Takes care of Python
138
+ object serialization, dataset versioning and difference. Operations on dataset:
139
+
140
+ - **Transformations:** traditional data-frame or SQL operations such as filtering,
141
+ grouping, joining.
142
+ - **Enrichments:** mapping, aggregating and generating using customer’s Python
143
+ code. This is needed to work with ML inference and LLM calls.
144
+
145
+ - **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
146
+ mode - only when needed.
147
+
148
+ DataChain name comes from these major data structures: dataset and chaining.
149
+
150
+
151
+ What’s new in DataChain?
152
+ ========================
153
+
154
+ The project combines multiple ideas from different areas in order to simplify AI
155
+ use-cases and at the same time to fit it into traditional data infrastructure.
156
+
157
+ - **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
158
+ native language for AI. It’s powered by `Pydantic`_ data models.
159
+ - **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
160
+ group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
161
+ needed for distributed computations.
162
+ - **Resuming data processing** (in development). Introduces idempotent operations,
163
+ allowing data processing to resume from the last successful process file/record/batch
164
+ if it fails due to issues like failed LLM calls, ML inference or file download.
165
+
166
+ Additional relatively new ideas:
167
+
168
+ - **Functional style data processing.** Using a functional/chaining approach to data
169
+ processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
170
+ - **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
171
+ and implements data versioning, extending ideas from DVC (developed by the same team).
172
+
173
+
174
+ What DataChain is NOT?
175
+ ======================
176
+
177
+ - **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
178
+ `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
179
+ version.
180
+ - **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
181
+ it delegates heavy data transformations to underlying data warehouses and focuses on
182
+ AI specific data enrichments and orchestrating all the pieces together.
183
+
125
184
 
126
185
  Quick Start
127
186
  -----------
@@ -16,28 +16,87 @@
16
16
  AI 🔗 DataChain
17
17
  ----------------
18
18
 
19
- DataChain is an open-source Python library for processing and curating unstructured
20
- data at scale.
19
+ DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
20
+ AI engineers build a metadata layer on top of unstructured files and analyze data using
21
+ this layer.
21
22
 
22
- 🤖 AI-Driven Data Curation: Use local ML models or LLM APIs calls to enrich your data.
23
+ 📂 **Raw Files Processing**
24
+ Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
25
+ Local), version and update datasets.
23
26
 
24
- 🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
27
+ 🌟 **Metadata layer.**
28
+ Build a metadata layer on top of files using structured sources like CSV, Parquet,
29
+ and JSON files.
25
30
 
26
- 🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
31
+ **Metadata enrichment.**
32
+ Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
27
33
 
34
+ 🛠️ **Data Transformation.**
35
+ Transform metadata using traditional methods like filtering, grouping, joining, and
36
+ others.
28
37
 
29
- Datachain supports parallel processing, parallel data
30
- downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
31
-
32
- The typical use cases include Computer Vision data curation, LLM analytics,
33
- and validation of multimodal AI applications.
38
+ 🐍 **User-friendly interface.**
39
+ Operate efficiently with familiar Python objects and object fields, eliminating the
40
+ need for SQL.
34
41
 
35
42
 
36
43
  .. code:: console
37
44
 
38
45
  $ pip install datachain
39
46
 
40
- |Flowchart|
47
+
48
+ Data Structures
49
+ ===============
50
+
51
+ DataChain introduces expressive data structures tailored for AI-specific workload:
52
+
53
+ - **Dataset:** Preserves the file-references and meta-information. Takes care of Python
54
+ object serialization, dataset versioning and difference. Operations on dataset:
55
+
56
+ - **Transformations:** traditional data-frame or SQL operations such as filtering,
57
+ grouping, joining.
58
+ - **Enrichments:** mapping, aggregating and generating using customer’s Python
59
+ code. This is needed to work with ML inference and LLM calls.
60
+
61
+ - **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
62
+ mode - only when needed.
63
+
64
+ DataChain name comes from these major data structures: dataset and chaining.
65
+
66
+
67
+ What’s new in DataChain?
68
+ ========================
69
+
70
+ The project combines multiple ideas from different areas in order to simplify AI
71
+ use-cases and at the same time to fit it into traditional data infrastructure.
72
+
73
+ - **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
74
+ native language for AI. It’s powered by `Pydantic`_ data models.
75
+ - **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
76
+ group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
77
+ needed for distributed computations.
78
+ - **Resuming data processing** (in development). Introduces idempotent operations,
79
+ allowing data processing to resume from the last successful process file/record/batch
80
+ if it fails due to issues like failed LLM calls, ML inference or file download.
81
+
82
+ Additional relatively new ideas:
83
+
84
+ - **Functional style data processing.** Using a functional/chaining approach to data
85
+ processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
86
+ - **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
87
+ and implements data versioning, extending ideas from DVC (developed by the same team).
88
+
89
+
90
+ What DataChain is NOT?
91
+ ======================
92
+
93
+ - **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
94
+ `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
95
+ version.
96
+ - **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
97
+ it delegates heavy data transformations to underlying data warehouses and focuses on
98
+ AI specific data enrichments and orchestrating all the pieces together.
99
+
41
100
 
42
101
  Quick Start
43
102
  -----------
@@ -1,3 +1,4 @@
1
+ # pip install defusedxml
1
2
  import json
2
3
 
3
4
  from PIL import (
@@ -63,7 +64,7 @@ if __name__ == "__main__":
63
64
  DataChain.from_storage(source, type="image")
64
65
  .settings(parallel=-1)
65
66
  .filter(C("file.name").glob("*.jpg"))
66
- .limit(10000)
67
+ .limit(5000)
67
68
  .map(
68
69
  image_description,
69
70
  params=["file"],
@@ -48,7 +48,7 @@ def openimage_detect(args):
48
48
  yield fstream, bbox
49
49
 
50
50
 
51
- source = "gs://datachain-demo/openimages-v6-test-jsonpairs"
51
+ source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
52
52
 
53
53
  (
54
54
  DataChain.from_storage(source)
@@ -36,7 +36,7 @@ def main():
36
36
  print("========================================================================")
37
37
  uri = "gs://datachain-demo/jsonl/object.jsonl"
38
38
  jsonl_ds = DataChain.from_json(uri, meta_type="jsonl", print_schema=True)
39
- print(jsonl_ds.to_pandas())
39
+ jsonl_ds.show()
40
40
 
41
41
  print()
42
42
  print("========================================================================")
@@ -49,8 +49,7 @@ def main():
49
49
  json_pairs_ds = DataChain.from_json(
50
50
  uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage"
51
51
  )
52
- print(json_pairs_ds.to_pandas())
53
- # print(list(json_pairs_ds.collect())[0])
52
+ json_pairs_ds.show()
54
53
 
55
54
  uri = "gs://datachain-demo/coco2017/annotations_captions/"
56
55
 
@@ -72,7 +71,7 @@ def main():
72
71
  static_json_ds = DataChain.from_json(
73
72
  uri, jmespath="licenses", spec=LicenseFeature, nrows=3
74
73
  )
75
- print(static_json_ds.to_pandas())
74
+ static_json_ds.show()
76
75
 
77
76
  print()
78
77
  print("========================================================================")
@@ -88,16 +87,16 @@ def main():
88
87
  print("========================================================================")
89
88
  static_csv_ds = DataChain.from_csv(uri, output=ChatDialog, object_name="chat")
90
89
  static_csv_ds.print_schema()
91
- print(static_csv_ds.to_pandas())
90
+ static_csv_ds.show()
92
91
 
93
- uri = "gs://datachain-demo/laion-aesthetics-csv"
92
+ uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
94
93
  print()
95
94
  print("========================================================================")
96
95
  print("dynamic CSV with header schema test parsing 3/3M objects")
97
96
  print("========================================================================")
98
97
  dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3)
99
98
  dynamic_csv_ds.print_schema()
100
- print(dynamic_csv_ds.to_pandas())
99
+ dynamic_csv_ds.show()
101
100
 
102
101
 
103
102
  if __name__ == "__main__":
@@ -64,7 +64,7 @@ if __name__ == "__main__":
64
64
  optimizer = optim.Adam(model.parameters(), lr=0.001)
65
65
 
66
66
  # Train the model
67
- num_epochs = 10
67
+ num_epochs = 3
68
68
  for epoch in range(num_epochs):
69
69
  for i, data in enumerate(train_loader):
70
70
  inputs, labels = data
@@ -1,5 +1,3 @@
1
- import pandas as pd
2
-
3
1
  from datachain import C, DataChain
4
2
  from datachain.lib.webdataset import process_webdataset
5
3
  from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
@@ -9,25 +7,36 @@ wds = (
9
7
  .filter(C("file.name").glob("00000000.tar"))
10
8
  .settings(cache=True)
11
9
  .gen(laion=process_webdataset(spec=WDSLaion), params="file")
10
+ .save() # materialize chain to avoid downloading data multiple times
11
+ )
12
+
13
+ meta_pq = (
14
+ DataChain.from_parquet("gs://datachain-demo/datacomp-small/metadata/0020f*.parquet")
15
+ .filter(
16
+ C("uid").in_(values[0] for values in wds.select("laion.json.uid").collect())
17
+ )
18
+ .map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
19
+ .save()
12
20
  )
13
21
 
14
22
  meta_emd = (
15
- DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata")
16
- .filter(C("file.name").glob("0020f*.npz"))
23
+ DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata/0020f*.npz")
17
24
  .gen(emd=process_laion_meta)
25
+ .filter(
26
+ C("emd.index").in_(
27
+ values[0] for values in meta_pq.select("source.index").collect()
28
+ )
29
+ )
18
30
  .map(stem=lambda file: file.get_file_stem(), params=["emd.file"], output=str)
19
31
  )
20
32
 
21
- meta_pq = DataChain.from_parquet(
22
- "gs://datachain-demo/datacomp-small/metadata/0020f*.parquet"
23
- ).map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
24
33
 
25
34
  meta = meta_emd.merge(
26
- meta_pq, on=["stem", "emd.index"], right_on=["stem", "source.index"]
35
+ meta_pq,
36
+ on=["stem", "emd.index"],
37
+ right_on=["stem", "source.index"],
27
38
  )
28
39
 
29
40
  res = wds.merge(meta, on="laion.json.uid", right_on="uid")
30
41
 
31
- df = res.limit(10).to_pandas()
32
- with pd.option_context("display.max_columns", None):
33
- print(df)
42
+ res.show(3)
@@ -31,6 +31,7 @@ filtered = (
31
31
  / least(C("laion.json.original_width"), C("laion.json.original_height"))
32
32
  < 3.0
33
33
  )
34
+ .save()
34
35
  )
35
36
  filtered.show(3)
36
37
 
@@ -236,36 +236,36 @@ class DatasetRowsFetcher(NodesThreadPool):
236
236
  import lz4.frame
237
237
  import pandas as pd
238
238
 
239
- metastore = self.metastore.clone() # metastore is not thread safe
240
- warehouse = self.warehouse.clone() # warehouse is not thread safe
241
- dataset = metastore.get_dataset(self.dataset_name)
242
-
243
- urls = list(urls)
244
- while urls:
245
- for url in urls:
246
- if self.should_check_for_status():
247
- self.check_for_status()
248
-
249
- r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
250
- if r.status_code == 404:
251
- time.sleep(PULL_DATASET_SLEEP_INTERVAL)
252
- # moving to the next url
253
- continue
239
+ # metastore and warehouse are not thread safe
240
+ with self.metastore.clone() as metastore, self.warehouse.clone() as warehouse:
241
+ dataset = metastore.get_dataset(self.dataset_name)
254
242
 
255
- r.raise_for_status()
243
+ urls = list(urls)
244
+ while urls:
245
+ for url in urls:
246
+ if self.should_check_for_status():
247
+ self.check_for_status()
256
248
 
257
- df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
249
+ r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
250
+ if r.status_code == 404:
251
+ time.sleep(PULL_DATASET_SLEEP_INTERVAL)
252
+ # moving to the next url
253
+ continue
258
254
 
259
- self.fix_columns(df)
255
+ r.raise_for_status()
260
256
 
261
- # id will be autogenerated in DB
262
- df = df.drop("sys__id", axis=1)
257
+ df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
263
258
 
264
- inserted = warehouse.insert_dataset_rows(
265
- df, dataset, self.dataset_version
266
- )
267
- self.increase_counter(inserted) # type: ignore [arg-type]
268
- urls.remove(url)
259
+ self.fix_columns(df)
260
+
261
+ # id will be autogenerated in DB
262
+ df = df.drop("sys__id", axis=1)
263
+
264
+ inserted = warehouse.insert_dataset_rows(
265
+ df, dataset, self.dataset_version
266
+ )
267
+ self.increase_counter(inserted) # type: ignore [arg-type]
268
+ urls.remove(url)
269
269
 
270
270
 
271
271
  @dataclass
@@ -720,7 +720,6 @@ class Catalog:
720
720
  client.uri, posixpath.join(prefix, "")
721
721
  )
722
722
  source_metastore = self.metastore.clone(client.uri)
723
- source_warehouse = self.warehouse.clone()
724
723
 
725
724
  columns = [
726
725
  Column("vtype", String),
@@ -1217,16 +1216,14 @@ class Catalog:
1217
1216
  def get_temp_table_names(self) -> list[str]:
1218
1217
  return self.warehouse.get_temp_table_names()
1219
1218
 
1220
- def cleanup_temp_tables(self, names: Iterable[str]) -> None:
1219
+ def cleanup_tables(self, names: Iterable[str]) -> None:
1221
1220
  """
1222
- Drop tables created temporarily when processing datasets.
1221
+ Drop tables passed.
1223
1222
 
1224
- This should be implemented even if temporary tables are used to
1225
- ensure that they are cleaned up as soon as they are no longer
1226
- needed. When running the same `DatasetQuery` multiple times we
1227
- may use the same temporary table names.
1223
+ This should be implemented to ensure that the provided tables
1224
+ are cleaned up as soon as they are no longer needed.
1228
1225
  """
1229
- self.warehouse.cleanup_temp_tables(names)
1226
+ self.warehouse.cleanup_tables(names)
1230
1227
  self.id_generator.delete_uris(names)
1231
1228
 
1232
1229
  def create_dataset_from_sources(
@@ -1837,25 +1834,29 @@ class Catalog:
1837
1834
  if signed_urls:
1838
1835
  shuffle(signed_urls)
1839
1836
 
1840
- rows_fetcher = DatasetRowsFetcher(
1841
- self.metastore.clone(),
1842
- self.warehouse.clone(),
1843
- remote_config,
1844
- dataset.name,
1845
- version,
1846
- schema,
1847
- )
1848
- try:
1849
- rows_fetcher.run(
1850
- batched(
1851
- signed_urls,
1852
- math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
1853
- ),
1854
- dataset_save_progress_bar,
1837
+ with (
1838
+ self.metastore.clone() as metastore,
1839
+ self.warehouse.clone() as warehouse,
1840
+ ):
1841
+ rows_fetcher = DatasetRowsFetcher(
1842
+ metastore,
1843
+ warehouse,
1844
+ remote_config,
1845
+ dataset.name,
1846
+ version,
1847
+ schema,
1855
1848
  )
1856
- except:
1857
- self.remove_dataset(dataset.name, version)
1858
- raise
1849
+ try:
1850
+ rows_fetcher.run(
1851
+ batched(
1852
+ signed_urls,
1853
+ math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
1854
+ ),
1855
+ dataset_save_progress_bar,
1856
+ )
1857
+ except:
1858
+ self.remove_dataset(dataset.name, version)
1859
+ raise
1859
1860
 
1860
1861
  dataset = self.metastore.update_dataset_status(
1861
1862
  dataset,
@@ -910,7 +910,7 @@ def garbage_collect(catalog: "Catalog"):
910
910
  print("Nothing to clean up.")
911
911
  else:
912
912
  print(f"Garbage collecting {len(temp_tables)} tables.")
913
- catalog.cleanup_temp_tables(temp_tables)
913
+ catalog.cleanup_tables(temp_tables)
914
914
 
915
915
 
916
916
  def completion(shell: str) -> str:
@@ -4,7 +4,6 @@ from collections.abc import Iterator
4
4
  from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
5
5
 
6
6
  import sqlalchemy as sa
7
- from attrs import frozen
8
7
  from sqlalchemy.sql import FROM_LINTING
9
8
  from sqlalchemy.sql.roles import DDLRole
10
9
 
@@ -23,13 +22,18 @@ logger = logging.getLogger("datachain")
23
22
  SELECT_BATCH_SIZE = 100_000 # number of rows to fetch at a time
24
23
 
25
24
 
26
- @frozen
27
25
  class DatabaseEngine(ABC, Serializable):
28
26
  dialect: ClassVar["Dialect"]
29
27
 
30
28
  engine: "Engine"
31
29
  metadata: "MetaData"
32
30
 
31
+ def __enter__(self) -> "DatabaseEngine":
32
+ return self
33
+
34
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
35
+ self.close()
36
+
33
37
  @abstractmethod
34
38
  def clone(self) -> "DatabaseEngine":
35
39
  """Clones DatabaseEngine implementation."""
@@ -33,6 +33,16 @@ class AbstractIDGenerator(ABC, Serializable):
33
33
  def cleanup_for_tests(self):
34
34
  """Cleanup for tests."""
35
35
 
36
+ def close(self) -> None:
37
+ """Closes any active database connections."""
38
+
39
+ def close_on_exit(self) -> None:
40
+ """Closes any active database or HTTP connections, called on Session exit or
41
+ for test cleanup only, as some ID Generator implementations may handle this
42
+ differently.
43
+ """
44
+ self.close()
45
+
36
46
  @abstractmethod
37
47
  def init_id(self, uri: str) -> None:
38
48
  """Initializes the ID generator for the given URI with zero last_id."""
@@ -83,6 +93,10 @@ class AbstractDBIDGenerator(AbstractIDGenerator):
83
93
  def clone(self) -> "AbstractDBIDGenerator":
84
94
  """Clones AbstractIDGenerator implementation."""
85
95
 
96
+ def close(self) -> None:
97
+ """Closes any active database connections."""
98
+ self.db.close()
99
+
86
100
  @property
87
101
  def db(self) -> "DatabaseEngine":
88
102
  return self._db
@@ -78,6 +78,13 @@ class AbstractMetastore(ABC, Serializable):
78
78
  self.uri = uri
79
79
  self.partial_id: Optional[int] = partial_id
80
80
 
81
+ def __enter__(self) -> "AbstractMetastore":
82
+ """Returns self upon entering context manager."""
83
+ return self
84
+
85
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
86
+ """Default behavior is to do nothing, as connections may be shared."""
87
+
81
88
  @abstractmethod
82
89
  def clone(
83
90
  self,
@@ -97,7 +104,13 @@ class AbstractMetastore(ABC, Serializable):
97
104
  def close(self) -> None:
98
105
  """Closes any active database or HTTP connections."""
99
106
 
100
- def cleanup_temp_tables(self, temp_table_names: list[str]) -> None:
107
+ def close_on_exit(self) -> None:
108
+ """Closes any active database or HTTP connections, called on Session exit or
109
+ for test cleanup only, as some Metastore implementations may handle this
110
+ differently."""
111
+ self.close()
112
+
113
+ def cleanup_tables(self, temp_table_names: list[str]) -> None:
101
114
  """Cleanup temp tables."""
102
115
 
103
116
  def cleanup_for_tests(self) -> None:
@@ -457,7 +470,7 @@ class AbstractDBMetastore(AbstractMetastore):
457
470
  """Closes any active database connections."""
458
471
  self.db.close()
459
472
 
460
- def cleanup_temp_tables(self, temp_table_names: list[str]) -> None:
473
+ def cleanup_tables(self, temp_table_names: list[str]) -> None:
461
474
  """Cleanup temp tables."""
462
475
  self.id_generator.delete_uris(temp_table_names)
463
476