datachain 0.2.16.tar.gz → 0.2.18.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (258)
  1. {datachain-0.2.16 → datachain-0.2.18}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.2.16/src/datachain.egg-info → datachain-0.2.18}/PKG-INFO +71 -12
  3. {datachain-0.2.16 → datachain-0.2.18}/README.rst +70 -11
  4. {datachain-0.2.16 → datachain-0.2.18}/examples/get_started/json-csv-reader.py +9 -0
  5. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/catalog/catalog.py +47 -44
  6. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/data_storage/db_engine.py +6 -2
  7. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/data_storage/id_generator.py +14 -0
  8. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/data_storage/metastore.py +13 -0
  9. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/data_storage/sqlite.py +45 -6
  10. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/data_storage/warehouse.py +13 -0
  11. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/arrow.py +22 -7
  12. datachain-0.2.18/src/datachain/lib/convert/sql_to_python.py +18 -0
  13. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/dc.py +53 -6
  14. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/file.py +3 -3
  15. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/signal_schema.py +33 -5
  16. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/listing.py +22 -10
  17. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/query/dataset.py +17 -20
  18. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/query/session.py +19 -4
  19. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/functions/__init__.py +3 -2
  20. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/functions/array.py +8 -0
  21. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/sqlite/base.py +5 -0
  22. {datachain-0.2.16 → datachain-0.2.18/src/datachain.egg-info}/PKG-INFO +71 -12
  23. {datachain-0.2.16 → datachain-0.2.18}/src/datachain.egg-info/SOURCES.txt +1 -0
  24. {datachain-0.2.16 → datachain-0.2.18}/tests/conftest.py +50 -23
  25. {datachain-0.2.16 → datachain-0.2.18}/tests/func/test_datachain.py +25 -15
  26. {datachain-0.2.16 → datachain-0.2.18}/tests/func/test_dataset_query.py +43 -0
  27. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_arrow.py +0 -17
  28. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_datachain.py +402 -163
  29. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_datachain_merge.py +24 -20
  30. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_feature_utils.py +4 -4
  31. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_signal_schema.py +29 -2
  32. datachain-0.2.18/tests/unit/lib/test_sql_to_python.py +28 -0
  33. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_catalog_loader.py +24 -30
  34. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_data_storage.py +17 -17
  35. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_database_engine.py +9 -11
  36. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_id_generator.py +6 -8
  37. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_metastore.py +7 -9
  38. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_warehouse.py +7 -9
  39. datachain-0.2.16/src/datachain/lib/convert/sql_to_python.py +0 -23
  40. {datachain-0.2.16 → datachain-0.2.18}/.cruft.json +0 -0
  41. {datachain-0.2.16 → datachain-0.2.18}/.gitattributes +0 -0
  42. {datachain-0.2.16 → datachain-0.2.18}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  43. {datachain-0.2.16 → datachain-0.2.18}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  44. {datachain-0.2.16 → datachain-0.2.18}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  45. {datachain-0.2.16 → datachain-0.2.18}/.github/codecov.yaml +0 -0
  46. {datachain-0.2.16 → datachain-0.2.18}/.github/dependabot.yml +0 -0
  47. {datachain-0.2.16 → datachain-0.2.18}/.github/workflows/benchmarks.yml +0 -0
  48. {datachain-0.2.16 → datachain-0.2.18}/.github/workflows/release.yml +0 -0
  49. {datachain-0.2.16 → datachain-0.2.18}/.github/workflows/tests.yml +0 -0
  50. {datachain-0.2.16 → datachain-0.2.18}/.github/workflows/update-template.yaml +0 -0
  51. {datachain-0.2.16 → datachain-0.2.18}/.gitignore +0 -0
  52. {datachain-0.2.16 → datachain-0.2.18}/CODE_OF_CONDUCT.rst +0 -0
  53. {datachain-0.2.16 → datachain-0.2.18}/CONTRIBUTING.rst +0 -0
  54. {datachain-0.2.16 → datachain-0.2.18}/LICENSE +0 -0
  55. {datachain-0.2.16 → datachain-0.2.18}/docs/assets/captioned_cartoons.png +0 -0
  56. {datachain-0.2.16 → datachain-0.2.18}/docs/assets/datachain.png +0 -0
  57. {datachain-0.2.16 → datachain-0.2.18}/docs/assets/flowchart.png +0 -0
  58. {datachain-0.2.16 → datachain-0.2.18}/docs/index.md +0 -0
  59. {datachain-0.2.16 → datachain-0.2.18}/docs/references/datachain.md +0 -0
  60. {datachain-0.2.16 → datachain-0.2.18}/docs/references/datatype.md +0 -0
  61. {datachain-0.2.16 → datachain-0.2.18}/docs/references/file.md +0 -0
  62. {datachain-0.2.16 → datachain-0.2.18}/docs/references/index.md +0 -0
  63. {datachain-0.2.16 → datachain-0.2.18}/docs/references/sql.md +0 -0
  64. {datachain-0.2.16 → datachain-0.2.18}/docs/references/torch.md +0 -0
  65. {datachain-0.2.16 → datachain-0.2.18}/docs/references/udf.md +0 -0
  66. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
  67. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
  68. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
  69. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
  70. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
  71. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
  72. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/README.md +0 -0
  73. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
  74. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
  75. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
  76. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
  77. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
  78. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
  79. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
  80. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
  81. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  82. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  83. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  84. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  85. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  86. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  87. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  88. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  89. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  90. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  91. {datachain-0.2.16 → datachain-0.2.18}/examples/computer_vision/openimage-detect.py +0 -0
  92. {datachain-0.2.16 → datachain-0.2.18}/examples/get_started/common_sql_functions.py +0 -0
  93. {datachain-0.2.16 → datachain-0.2.18}/examples/get_started/json-metadata-tutorial.ipynb +0 -0
  94. {datachain-0.2.16 → datachain-0.2.18}/examples/get_started/torch-loader.py +0 -0
  95. {datachain-0.2.16 → datachain-0.2.18}/examples/get_started/udfs/parallel.py +0 -0
  96. {datachain-0.2.16 → datachain-0.2.18}/examples/get_started/udfs/simple.py +0 -0
  97. {datachain-0.2.16 → datachain-0.2.18}/examples/get_started/udfs/stateful.py +0 -0
  98. {datachain-0.2.16 → datachain-0.2.18}/examples/llm/llm_chatbot_evaluation.ipynb +0 -0
  99. {datachain-0.2.16 → datachain-0.2.18}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
  100. {datachain-0.2.16 → datachain-0.2.18}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
  101. {datachain-0.2.16 → datachain-0.2.18}/examples/llm_and_nlp/llm-claude.py +0 -0
  102. {datachain-0.2.16 → datachain-0.2.18}/examples/llm_and_nlp/unstructured-text.py +0 -0
  103. {datachain-0.2.16 → datachain-0.2.18}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
  104. {datachain-0.2.16 → datachain-0.2.18}/examples/multimodal/clip_inference.py +0 -0
  105. {datachain-0.2.16 → datachain-0.2.18}/examples/multimodal/hf_pipeline.py +0 -0
  106. {datachain-0.2.16 → datachain-0.2.18}/examples/multimodal/openai_image_desc_lib.py +0 -0
  107. {datachain-0.2.16 → datachain-0.2.18}/examples/multimodal/wds.py +0 -0
  108. {datachain-0.2.16 → datachain-0.2.18}/examples/multimodal/wds_filtered.py +0 -0
  109. {datachain-0.2.16 → datachain-0.2.18}/mkdocs.yml +0 -0
  110. {datachain-0.2.16 → datachain-0.2.18}/noxfile.py +0 -0
  111. {datachain-0.2.16 → datachain-0.2.18}/pyproject.toml +0 -0
  112. {datachain-0.2.16 → datachain-0.2.18}/setup.cfg +0 -0
  113. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/__init__.py +0 -0
  114. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/__main__.py +0 -0
  115. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/asyn.py +0 -0
  116. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/cache.py +0 -0
  117. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/catalog/__init__.py +0 -0
  118. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/catalog/datasource.py +0 -0
  119. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/catalog/loader.py +0 -0
  120. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/catalog/subclass.py +0 -0
  121. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/cli.py +0 -0
  122. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/cli_utils.py +0 -0
  123. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/client/__init__.py +0 -0
  124. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/client/azure.py +0 -0
  125. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/client/fileslice.py +0 -0
  126. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/client/fsspec.py +0 -0
  127. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/client/gcs.py +0 -0
  128. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/client/local.py +0 -0
  129. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/client/s3.py +0 -0
  130. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/config.py +0 -0
  131. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/data_storage/__init__.py +0 -0
  132. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/data_storage/job.py +0 -0
  133. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/data_storage/schema.py +0 -0
  134. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/data_storage/serializer.py +0 -0
  135. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/dataset.py +0 -0
  136. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/error.py +0 -0
  137. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/job.py +0 -0
  138. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/__init__.py +0 -0
  139. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/clip.py +0 -0
  140. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/convert/__init__.py +0 -0
  141. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/convert/flatten.py +0 -0
  142. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/convert/python_to_sql.py +0 -0
  143. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/convert/unflatten.py +0 -0
  144. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  145. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/data_model.py +0 -0
  146. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/dataset_info.py +0 -0
  147. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/image.py +0 -0
  148. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/meta_formats.py +0 -0
  149. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/model_store.py +0 -0
  150. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/pytorch.py +0 -0
  151. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/settings.py +0 -0
  152. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/text.py +0 -0
  153. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/udf.py +0 -0
  154. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/udf_signature.py +0 -0
  155. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/utils.py +0 -0
  156. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/vfile.py +0 -0
  157. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/webdataset.py +0 -0
  158. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/lib/webdataset_laion.py +0 -0
  159. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/node.py +0 -0
  160. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/nodes_fetcher.py +0 -0
  161. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/nodes_thread_pool.py +0 -0
  162. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/progress.py +0 -0
  163. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/py.typed +0 -0
  164. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/query/__init__.py +0 -0
  165. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/query/batch.py +0 -0
  166. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/query/builtins.py +0 -0
  167. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/query/dispatch.py +0 -0
  168. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/query/metrics.py +0 -0
  169. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/query/params.py +0 -0
  170. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/query/schema.py +0 -0
  171. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/query/udf.py +0 -0
  172. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/remote/__init__.py +0 -0
  173. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/remote/studio.py +0 -0
  174. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/__init__.py +0 -0
  175. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/default/__init__.py +0 -0
  176. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/default/base.py +0 -0
  177. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/functions/conditional.py +0 -0
  178. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/functions/path.py +0 -0
  179. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/functions/random.py +0 -0
  180. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/functions/string.py +0 -0
  181. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/selectable.py +0 -0
  182. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/sqlite/__init__.py +0 -0
  183. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/sqlite/types.py +0 -0
  184. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/sqlite/vector.py +0 -0
  185. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/types.py +0 -0
  186. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/sql/utils.py +0 -0
  187. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/storage.py +0 -0
  188. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/torch/__init__.py +0 -0
  189. {datachain-0.2.16 → datachain-0.2.18}/src/datachain/utils.py +0 -0
  190. {datachain-0.2.16 → datachain-0.2.18}/src/datachain.egg-info/dependency_links.txt +0 -0
  191. {datachain-0.2.16 → datachain-0.2.18}/src/datachain.egg-info/entry_points.txt +0 -0
  192. {datachain-0.2.16 → datachain-0.2.18}/src/datachain.egg-info/requires.txt +0 -0
  193. {datachain-0.2.16 → datachain-0.2.18}/src/datachain.egg-info/top_level.txt +0 -0
  194. {datachain-0.2.16 → datachain-0.2.18}/tests/__init__.py +0 -0
  195. {datachain-0.2.16 → datachain-0.2.18}/tests/benchmarks/__init__.py +0 -0
  196. {datachain-0.2.16 → datachain-0.2.18}/tests/benchmarks/conftest.py +0 -0
  197. {datachain-0.2.16 → datachain-0.2.18}/tests/benchmarks/test_ls.py +0 -0
  198. {datachain-0.2.16 → datachain-0.2.18}/tests/benchmarks/test_version.py +0 -0
  199. {datachain-0.2.16 → datachain-0.2.18}/tests/data.py +0 -0
  200. {datachain-0.2.16 → datachain-0.2.18}/tests/examples/__init__.py +0 -0
  201. {datachain-0.2.16 → datachain-0.2.18}/tests/examples/test_wds_e2e.py +0 -0
  202. {datachain-0.2.16 → datachain-0.2.18}/tests/examples/wds_data.py +0 -0
  203. {datachain-0.2.16 → datachain-0.2.18}/tests/func/__init__.py +0 -0
  204. {datachain-0.2.16 → datachain-0.2.18}/tests/func/test_catalog.py +0 -0
  205. {datachain-0.2.16 → datachain-0.2.18}/tests/func/test_client.py +0 -0
  206. {datachain-0.2.16 → datachain-0.2.18}/tests/func/test_datasets.py +0 -0
  207. {datachain-0.2.16 → datachain-0.2.18}/tests/func/test_feature_pickling.py +0 -0
  208. {datachain-0.2.16 → datachain-0.2.18}/tests/func/test_ls.py +0 -0
  209. {datachain-0.2.16 → datachain-0.2.18}/tests/func/test_pull.py +0 -0
  210. {datachain-0.2.16 → datachain-0.2.18}/tests/func/test_pytorch.py +0 -0
  211. {datachain-0.2.16 → datachain-0.2.18}/tests/func/test_query.py +0 -0
  212. {datachain-0.2.16 → datachain-0.2.18}/tests/scripts/feature_class.py +0 -0
  213. {datachain-0.2.16 → datachain-0.2.18}/tests/scripts/feature_class_parallel.py +0 -0
  214. {datachain-0.2.16 → datachain-0.2.18}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  215. {datachain-0.2.16 → datachain-0.2.18}/tests/scripts/name_len_normal.py +0 -0
  216. {datachain-0.2.16 → datachain-0.2.18}/tests/scripts/name_len_slow.py +0 -0
  217. {datachain-0.2.16 → datachain-0.2.18}/tests/test_cli_e2e.py +0 -0
  218. {datachain-0.2.16 → datachain-0.2.18}/tests/test_query_e2e.py +0 -0
  219. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/__init__.py +0 -0
  220. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/__init__.py +0 -0
  221. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/conftest.py +0 -0
  222. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_clip.py +0 -0
  223. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  224. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_feature.py +0 -0
  225. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_file.py +0 -0
  226. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_image.py +0 -0
  227. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_text.py +0 -0
  228. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_udf_signature.py +0 -0
  229. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_utils.py +0 -0
  230. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/lib/test_webdataset.py +0 -0
  231. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/sql/__init__.py +0 -0
  232. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/sql/sqlite/__init__.py +0 -0
  233. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/sql/sqlite/test_utils.py +0 -0
  234. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/sql/test_array.py +0 -0
  235. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/sql/test_conditional.py +0 -0
  236. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/sql/test_path.py +0 -0
  237. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/sql/test_random.py +0 -0
  238. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/sql/test_selectable.py +0 -0
  239. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/sql/test_string.py +0 -0
  240. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_asyn.py +0 -0
  241. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_cache.py +0 -0
  242. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_catalog.py +0 -0
  243. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_cli_parsing.py +0 -0
  244. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_client.py +0 -0
  245. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_client_s3.py +0 -0
  246. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_dataset.py +0 -0
  247. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_dispatch.py +0 -0
  248. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_fileslice.py +0 -0
  249. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_listing.py +0 -0
  250. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_module_exports.py +0 -0
  251. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_query_metrics.py +0 -0
  252. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_query_params.py +0 -0
  253. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_serializer.py +0 -0
  254. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_session.py +0 -0
  255. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_storage.py +0 -0
  256. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_udf.py +0 -0
  257. {datachain-0.2.16 → datachain-0.2.18}/tests/unit/test_utils.py +0 -0
  258. {datachain-0.2.16 → datachain-0.2.18}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.5.5'
27
+ rev: 'v0.5.6'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.16
3
+ Version: 0.2.18
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -100,28 +100,87 @@ Requires-Dist: types-requests; extra == "dev"
100
100
  AI 🔗 DataChain
101
101
  ----------------
102
102
 
103
- DataChain is an open-source Python library for processing and curating unstructured
104
- data at scale.
103
+ DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
104
+ AI engineers build a metadata layer on top of unstructured files and analyze data using
105
+ this layer.
105
106
 
106
- 🤖 AI-Driven Data Curation: Use local ML models or LLM APIs calls to enrich your data.
107
+ 📂 **Raw Files Processing**
108
+ Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
109
+ Local), version and update datasets.
107
110
 
108
- 🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
111
+ 🌟 **Metadata layer.**
112
+ Build a metadata layer on top of files using structured sources like CSV, Parquet,
113
+ and JSON files.
109
114
 
110
- 🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
115
+ **Metadata enrichment.**
116
+ Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
111
117
 
118
+ 🛠️ **Data Transformation.**
119
+ Transform metadata using traditional methods like filtering, grouping, joining, and
120
+ others.
112
121
 
113
- Datachain supports parallel processing, parallel data
114
- downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
115
-
116
- The typical use cases include Computer Vision data curation, LLM analytics,
117
- and validation of multimodal AI applications.
122
+ 🐍 **User-friendly interface.**
123
+ Operate efficiently with familiar Python objects and object fields, eliminating the
124
+ need for SQL.
118
125
 
119
126
 
120
127
  .. code:: console
121
128
 
122
129
  $ pip install datachain
123
130
 
124
- |Flowchart|
131
+
132
+ Data Structures
133
+ ===============
134
+
135
+ DataChain introduces expressive data structures tailored for AI-specific workload:
136
+
137
+ - **Dataset:** Preserves the file-references and meta-information. Takes care of Python
138
+ object serialization, dataset versioning and difference. Operations on dataset:
139
+
140
+ - **Transformations:** traditional data-frame or SQL operations such as filtering,
141
+ grouping, joining.
142
+ - **Enrichments:** mapping, aggregating and generating using customer’s Python
143
+ code. This is needed to work with ML inference and LLM calls.
144
+
145
+ - **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
146
+ mode - only when needed.
147
+
148
+ DataChain name comes from these major data structures: dataset and chaining.
149
+
150
+
151
+ What’s new in DataChain?
152
+ ========================
153
+
154
+ The project combines multiple ideas from different areas in order to simplify AI
155
+ use-cases and at the same time to fit it into traditional data infrastructure.
156
+
157
+ - **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
158
+ native language for AI. It’s powered by `Pydantic`_ data models.
159
+ - **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
160
+ group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
161
+ needed for distributed computations.
162
+ - **Resuming data processing** (in development). Introduces idempotent operations,
163
+ allowing data processing to resume from the last successful process file/record/batch
164
+ if it fails due to issues like failed LLM calls, ML inference or file download.
165
+
166
+ Additional relatively new ideas:
167
+
168
+ - **Functional style data processing.** Using a functional/chaining approach to data
169
+ processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
170
+ - **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
171
+ and implements data versioning, extending ideas from DVC (developed by the same team).
172
+
173
+
174
+ What DataChain is NOT?
175
+ ======================
176
+
177
+ - **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
178
+ `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
179
+ version.
180
+ - **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
181
+ it delegates heavy data transformations to underlying data warehouses and focuses on
182
+ AI specific data enrichments and orchestrating all the pieces together.
183
+
125
184
 
126
185
  Quick Start
127
186
  -----------
@@ -16,28 +16,87 @@
16
16
  AI 🔗 DataChain
17
17
  ----------------
18
18
 
19
- DataChain is an open-source Python library for processing and curating unstructured
20
- data at scale.
19
+ DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
20
+ AI engineers build a metadata layer on top of unstructured files and analyze data using
21
+ this layer.
21
22
 
22
- 🤖 AI-Driven Data Curation: Use local ML models or LLM APIs calls to enrich your data.
23
+ 📂 **Raw Files Processing**
24
+ Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
25
+ Local), version and update datasets.
23
26
 
24
- 🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
27
+ 🌟 **Metadata layer.**
28
+ Build a metadata layer on top of files using structured sources like CSV, Parquet,
29
+ and JSON files.
25
30
 
26
- 🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
31
+ **Metadata enrichment.**
32
+ Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
27
33
 
34
+ 🛠️ **Data Transformation.**
35
+ Transform metadata using traditional methods like filtering, grouping, joining, and
36
+ others.
28
37
 
29
- Datachain supports parallel processing, parallel data
30
- downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
31
-
32
- The typical use cases include Computer Vision data curation, LLM analytics,
33
- and validation of multimodal AI applications.
38
+ 🐍 **User-friendly interface.**
39
+ Operate efficiently with familiar Python objects and object fields, eliminating the
40
+ need for SQL.
34
41
 
35
42
 
36
43
  .. code:: console
37
44
 
38
45
  $ pip install datachain
39
46
 
40
- |Flowchart|
47
+
48
+ Data Structures
49
+ ===============
50
+
51
+ DataChain introduces expressive data structures tailored for AI-specific workload:
52
+
53
+ - **Dataset:** Preserves the file-references and meta-information. Takes care of Python
54
+ object serialization, dataset versioning and difference. Operations on dataset:
55
+
56
+ - **Transformations:** traditional data-frame or SQL operations such as filtering,
57
+ grouping, joining.
58
+ - **Enrichments:** mapping, aggregating and generating using customer’s Python
59
+ code. This is needed to work with ML inference and LLM calls.
60
+
61
+ - **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
62
+ mode - only when needed.
63
+
64
+ DataChain name comes from these major data structures: dataset and chaining.
65
+
66
+
67
+ What’s new in DataChain?
68
+ ========================
69
+
70
+ The project combines multiple ideas from different areas in order to simplify AI
71
+ use-cases and at the same time to fit it into traditional data infrastructure.
72
+
73
+ - **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
74
+ native language for AI. It’s powered by `Pydantic`_ data models.
75
+ - **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
76
+ group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
77
+ needed for distributed computations.
78
+ - **Resuming data processing** (in development). Introduces idempotent operations,
79
+ allowing data processing to resume from the last successful process file/record/batch
80
+ if it fails due to issues like failed LLM calls, ML inference or file download.
81
+
82
+ Additional relatively new ideas:
83
+
84
+ - **Functional style data processing.** Using a functional/chaining approach to data
85
+ processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
86
+ - **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
87
+ and implements data versioning, extending ideas from DVC (developed by the same team).
88
+
89
+
90
+ What DataChain is NOT?
91
+ ======================
92
+
93
+ - **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
94
+ `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
95
+ version.
96
+ - **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
97
+ it delegates heavy data transformations to underlying data warehouses and focuses on
98
+ AI specific data enrichments and orchestrating all the pieces together.
99
+
41
100
 
42
101
  Quick Start
43
102
  -----------
@@ -89,6 +89,15 @@ def main():
89
89
  static_csv_ds.print_schema()
90
90
  static_csv_ds.show()
91
91
 
92
+ uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
93
+ print()
94
+ print("========================================================================")
95
+ print("dynamic CSV with header schema test parsing 3/3M objects")
96
+ print("========================================================================")
97
+ dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3)
98
+ dynamic_csv_ds.print_schema()
99
+ dynamic_csv_ds.show()
100
+
92
101
 
93
102
  if __name__ == "__main__":
94
103
  main()
@@ -236,36 +236,36 @@ class DatasetRowsFetcher(NodesThreadPool):
236
236
  import lz4.frame
237
237
  import pandas as pd
238
238
 
239
- metastore = self.metastore.clone() # metastore is not thread safe
240
- warehouse = self.warehouse.clone() # warehouse is not thread safe
241
- dataset = metastore.get_dataset(self.dataset_name)
242
-
243
- urls = list(urls)
244
- while urls:
245
- for url in urls:
246
- if self.should_check_for_status():
247
- self.check_for_status()
248
-
249
- r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
250
- if r.status_code == 404:
251
- time.sleep(PULL_DATASET_SLEEP_INTERVAL)
252
- # moving to the next url
253
- continue
239
+ # metastore and warehouse are not thread safe
240
+ with self.metastore.clone() as metastore, self.warehouse.clone() as warehouse:
241
+ dataset = metastore.get_dataset(self.dataset_name)
254
242
 
255
- r.raise_for_status()
243
+ urls = list(urls)
244
+ while urls:
245
+ for url in urls:
246
+ if self.should_check_for_status():
247
+ self.check_for_status()
256
248
 
257
- df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
249
+ r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
250
+ if r.status_code == 404:
251
+ time.sleep(PULL_DATASET_SLEEP_INTERVAL)
252
+ # moving to the next url
253
+ continue
258
254
 
259
- self.fix_columns(df)
255
+ r.raise_for_status()
260
256
 
261
- # id will be autogenerated in DB
262
- df = df.drop("sys__id", axis=1)
257
+ df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
263
258
 
264
- inserted = warehouse.insert_dataset_rows(
265
- df, dataset, self.dataset_version
266
- )
267
- self.increase_counter(inserted) # type: ignore [arg-type]
268
- urls.remove(url)
259
+ self.fix_columns(df)
260
+
261
+ # id will be autogenerated in DB
262
+ df = df.drop("sys__id", axis=1)
263
+
264
+ inserted = warehouse.insert_dataset_rows(
265
+ df, dataset, self.dataset_version
266
+ )
267
+ self.increase_counter(inserted) # type: ignore [arg-type]
268
+ urls.remove(url)
269
269
 
270
270
 
271
271
  @dataclass
@@ -720,7 +720,6 @@ class Catalog:
720
720
  client.uri, posixpath.join(prefix, "")
721
721
  )
722
722
  source_metastore = self.metastore.clone(client.uri)
723
- source_warehouse = self.warehouse.clone()
724
723
 
725
724
  columns = [
726
725
  Column("vtype", String),
@@ -1835,25 +1834,29 @@ class Catalog:
1835
1834
  if signed_urls:
1836
1835
  shuffle(signed_urls)
1837
1836
 
1838
- rows_fetcher = DatasetRowsFetcher(
1839
- self.metastore.clone(),
1840
- self.warehouse.clone(),
1841
- remote_config,
1842
- dataset.name,
1843
- version,
1844
- schema,
1845
- )
1846
- try:
1847
- rows_fetcher.run(
1848
- batched(
1849
- signed_urls,
1850
- math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
1851
- ),
1852
- dataset_save_progress_bar,
1837
+ with (
1838
+ self.metastore.clone() as metastore,
1839
+ self.warehouse.clone() as warehouse,
1840
+ ):
1841
+ rows_fetcher = DatasetRowsFetcher(
1842
+ metastore,
1843
+ warehouse,
1844
+ remote_config,
1845
+ dataset.name,
1846
+ version,
1847
+ schema,
1853
1848
  )
1854
- except:
1855
- self.remove_dataset(dataset.name, version)
1856
- raise
1849
+ try:
1850
+ rows_fetcher.run(
1851
+ batched(
1852
+ signed_urls,
1853
+ math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
1854
+ ),
1855
+ dataset_save_progress_bar,
1856
+ )
1857
+ except:
1858
+ self.remove_dataset(dataset.name, version)
1859
+ raise
1857
1860
 
1858
1861
  dataset = self.metastore.update_dataset_status(
1859
1862
  dataset,
@@ -4,7 +4,6 @@ from collections.abc import Iterator
4
4
  from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
5
5
 
6
6
  import sqlalchemy as sa
7
- from attrs import frozen
8
7
  from sqlalchemy.sql import FROM_LINTING
9
8
  from sqlalchemy.sql.roles import DDLRole
10
9
 
@@ -23,13 +22,18 @@ logger = logging.getLogger("datachain")
23
22
  SELECT_BATCH_SIZE = 100_000 # number of rows to fetch at a time
24
23
 
25
24
 
26
- @frozen
27
25
  class DatabaseEngine(ABC, Serializable):
28
26
  dialect: ClassVar["Dialect"]
29
27
 
30
28
  engine: "Engine"
31
29
  metadata: "MetaData"
32
30
 
31
+ def __enter__(self) -> "DatabaseEngine":
32
+ return self
33
+
34
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
35
+ self.close()
36
+
33
37
  @abstractmethod
34
38
  def clone(self) -> "DatabaseEngine":
35
39
  """Clones DatabaseEngine implementation."""
@@ -33,6 +33,16 @@ class AbstractIDGenerator(ABC, Serializable):
33
33
  def cleanup_for_tests(self):
34
34
  """Cleanup for tests."""
35
35
 
36
+ def close(self) -> None:
37
+ """Closes any active database connections."""
38
+
39
+ def close_on_exit(self) -> None:
40
+ """Closes any active database or HTTP connections, called on Session exit or
41
+ for test cleanup only, as some ID Generator implementations may handle this
42
+ differently.
43
+ """
44
+ self.close()
45
+
36
46
  @abstractmethod
37
47
  def init_id(self, uri: str) -> None:
38
48
  """Initializes the ID generator for the given URI with zero last_id."""
@@ -83,6 +93,10 @@ class AbstractDBIDGenerator(AbstractIDGenerator):
83
93
  def clone(self) -> "AbstractDBIDGenerator":
84
94
  """Clones AbstractIDGenerator implementation."""
85
95
 
96
+ def close(self) -> None:
97
+ """Closes any active database connections."""
98
+ self.db.close()
99
+
86
100
  @property
87
101
  def db(self) -> "DatabaseEngine":
88
102
  return self._db
@@ -78,6 +78,13 @@ class AbstractMetastore(ABC, Serializable):
78
78
  self.uri = uri
79
79
  self.partial_id: Optional[int] = partial_id
80
80
 
81
+ def __enter__(self) -> "AbstractMetastore":
82
+ """Returns self upon entering context manager."""
83
+ return self
84
+
85
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
86
+ """Default behavior is to do nothing, as connections may be shared."""
87
+
81
88
  @abstractmethod
82
89
  def clone(
83
90
  self,
@@ -97,6 +104,12 @@ class AbstractMetastore(ABC, Serializable):
97
104
  def close(self) -> None:
98
105
  """Closes any active database or HTTP connections."""
99
106
 
107
+ def close_on_exit(self) -> None:
108
+ """Closes any active database or HTTP connections, called on Session exit or
109
+ for test cleanup only, as some Metastore implementations may handle this
110
+ differently."""
111
+ self.close()
112
+
100
113
  def cleanup_tables(self, temp_table_names: list[str]) -> None:
101
114
  """Cleanup temp tables."""
102
115
 
@@ -15,7 +15,6 @@ from typing import (
15
15
  )
16
16
 
17
17
  import sqlalchemy
18
- from attrs import frozen
19
18
  from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
20
19
  from sqlalchemy.dialects import sqlite
21
20
  from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
@@ -40,6 +39,7 @@ from datachain.utils import DataChainDir
40
39
 
41
40
  if TYPE_CHECKING:
42
41
  from sqlalchemy.dialects.sqlite import Insert
42
+ from sqlalchemy.engine.base import Engine
43
43
  from sqlalchemy.schema import SchemaItem
44
44
  from sqlalchemy.sql.elements import ColumnClause, ColumnElement, TextClause
45
45
  from sqlalchemy.sql.selectable import Select
@@ -52,6 +52,8 @@ RETRY_START_SEC = 0.01
52
52
  RETRY_MAX_TIMES = 10
53
53
  RETRY_FACTOR = 2
54
54
 
55
+ DETECT_TYPES = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
56
+
55
57
  Column = Union[str, "ColumnClause[Any]", "TextClause"]
56
58
 
57
59
  datachain.sql.sqlite.setup()
@@ -80,26 +82,41 @@ def retry_sqlite_locks(func):
80
82
  return wrapper
81
83
 
82
84
 
83
- @frozen
84
85
  class SQLiteDatabaseEngine(DatabaseEngine):
85
86
  dialect = sqlite_dialect
86
87
 
87
88
  db: sqlite3.Connection
88
89
  db_file: Optional[str]
90
+ is_closed: bool
91
+
92
+ def __init__(
93
+ self,
94
+ engine: "Engine",
95
+ metadata: "MetaData",
96
+ db: sqlite3.Connection,
97
+ db_file: Optional[str] = None,
98
+ ):
99
+ self.engine = engine
100
+ self.metadata = metadata
101
+ self.db = db
102
+ self.db_file = db_file
103
+ self.is_closed = False
89
104
 
90
105
  @classmethod
91
106
  def from_db_file(cls, db_file: Optional[str] = None) -> "SQLiteDatabaseEngine":
92
- detect_types = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
107
+ return cls(*cls._connect(db_file=db_file))
93
108
 
109
+ @staticmethod
110
+ def _connect(db_file: Optional[str] = None):
94
111
  try:
95
112
  if db_file == ":memory:":
96
113
  # Enable multithreaded usage of the same in-memory db
97
114
  db = sqlite3.connect(
98
- "file::memory:?cache=shared", uri=True, detect_types=detect_types
115
+ "file::memory:?cache=shared", uri=True, detect_types=DETECT_TYPES
99
116
  )
100
117
  else:
101
118
  db = sqlite3.connect(
102
- db_file or DataChainDir.find().db, detect_types=detect_types
119
+ db_file or DataChainDir.find().db, detect_types=DETECT_TYPES
103
120
  )
104
121
  create_user_defined_sql_functions(db)
105
122
  engine = sqlalchemy.create_engine(
@@ -118,7 +135,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
118
135
 
119
136
  load_usearch_extension(db)
120
137
 
121
- return cls(engine, MetaData(), db, db_file)
138
+ return engine, MetaData(), db, db_file
122
139
  except RuntimeError:
123
140
  raise DataChainError("Can't connect to SQLite DB") from None
124
141
 
@@ -138,6 +155,16 @@ class SQLiteDatabaseEngine(DatabaseEngine):
138
155
  {},
139
156
  )
140
157
 
158
+ def _reconnect(self) -> None:
159
+ if not self.is_closed:
160
+ raise RuntimeError("Cannot reconnect on still-open DB!")
161
+ engine, metadata, db, db_file = self._connect(db_file=self.db_file)
162
+ self.engine = engine
163
+ self.metadata = metadata
164
+ self.db = db
165
+ self.db_file = db_file
166
+ self.is_closed = False
167
+
141
168
  @retry_sqlite_locks
142
169
  def execute(
143
170
  self,
@@ -145,6 +172,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
145
172
  cursor: Optional[sqlite3.Cursor] = None,
146
173
  conn=None,
147
174
  ) -> sqlite3.Cursor:
175
+ if self.is_closed:
176
+ # Reconnect in case of being closed previously.
177
+ self._reconnect()
148
178
  if cursor is not None:
149
179
  result = cursor.execute(*self.compile_to_args(query))
150
180
  elif conn is not None:
@@ -179,6 +209,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
179
209
 
180
210
  def close(self) -> None:
181
211
  self.db.close()
212
+ self.is_closed = True
182
213
 
183
214
  @contextmanager
184
215
  def transaction(self):
@@ -359,6 +390,10 @@ class SQLiteMetastore(AbstractDBMetastore):
359
390
 
360
391
  self._init_tables()
361
392
 
393
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
394
+ """Close connection upon exit from context manager."""
395
+ self.close()
396
+
362
397
  def clone(
363
398
  self,
364
399
  uri: StorageURI = StorageURI(""),
@@ -521,6 +556,10 @@ class SQLiteWarehouse(AbstractWarehouse):
521
556
 
522
557
  self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
523
558
 
559
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
560
+ """Close connection upon exit from context manager."""
561
+ self.close()
562
+
524
563
  def clone(self, use_new_connection: bool = False) -> "SQLiteWarehouse":
525
564
  return SQLiteWarehouse(self.id_generator.clone(), db=self.db.clone())
526
565
 
@@ -70,6 +70,13 @@ class AbstractWarehouse(ABC, Serializable):
70
70
  def __init__(self, id_generator: "AbstractIDGenerator"):
71
71
  self.id_generator = id_generator
72
72
 
73
+ def __enter__(self) -> "AbstractWarehouse":
74
+ return self
75
+
76
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
77
+ # Default behavior is to do nothing, as connections may be shared.
78
+ pass
79
+
73
80
  def cleanup_for_tests(self):
74
81
  """Cleanup for tests."""
75
82
 
@@ -158,6 +165,12 @@ class AbstractWarehouse(ABC, Serializable):
158
165
  """Closes any active database connections."""
159
166
  self.db.close()
160
167
 
168
+ def close_on_exit(self) -> None:
169
+ """Closes any active database or HTTP connections, called on Session exit or
170
+ for test cleanup only, as some Warehouse implementations may handle this
171
+ differently."""
172
+ self.close()
173
+
161
174
  #
162
175
  # Query Tables
163
176
  #
@@ -1,5 +1,6 @@
1
1
  import re
2
2
  from collections.abc import Sequence
3
+ from tempfile import NamedTemporaryFile
3
4
  from typing import TYPE_CHECKING, Optional
4
5
 
5
6
  import pyarrow as pa
@@ -43,13 +44,17 @@ class ArrowGenerator(Generator):
43
44
  self.kwargs = kwargs
44
45
 
45
46
  def process(self, file: File):
46
- path = file.get_path()
47
- ds = dataset(
48
- path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
49
- )
47
+ if self.nrows:
48
+ path = _nrows_file(file, self.nrows)
49
+ ds = dataset(path, schema=self.input_schema, **self.kwargs)
50
+ else:
51
+ path = file.get_path()
52
+ ds = dataset(
53
+ path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
54
+ )
50
55
  index = 0
51
56
  with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
52
- for record_batch in ds.to_batches(use_threads=False):
57
+ for record_batch in ds.to_batches():
53
58
  for record in record_batch.to_pylist():
54
59
  vals = list(record.values())
55
60
  if self.output_schema:
@@ -60,8 +65,6 @@ class ArrowGenerator(Generator):
60
65
  else:
61
66
  yield vals
62
67
  index += 1
63
- if self.nrows and index >= self.nrows:
64
- return
65
68
  pbar.update(len(record_batch))
66
69
 
67
70
 
@@ -125,3 +128,15 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type: # noqa: PLR0911
125
128
  if isinstance(col_type, pa.lib.DictionaryType):
126
129
  return _arrow_type_mapper(col_type.value_type) # type: ignore[return-value]
127
130
  raise TypeError(f"{col_type!r} datatypes not supported")
131
+
132
+
133
+ def _nrows_file(file: File, nrows: int) -> str:
134
+ tf = NamedTemporaryFile(delete=False)
135
+ with file.open(mode="r") as reader:
136
+ with open(tf.name, "a") as writer:
137
+ for row, line in enumerate(reader):
138
+ if row >= nrows:
139
+ break
140
+ writer.write(line)
141
+ writer.write("\n")
142
+ return tf.name
@@ -0,0 +1,18 @@
1
+ from decimal import Decimal
2
+ from typing import Any
3
+
4
+ from sqlalchemy import ColumnElement
5
+
6
+
7
+ def sql_to_python(args_map: dict[str, ColumnElement]) -> dict[str, Any]:
8
+ res = {}
9
+ for name, sql_exp in args_map.items():
10
+ try:
11
+ type_ = sql_exp.type.python_type
12
+ if type_ == Decimal:
13
+ type_ = float
14
+ except NotImplementedError:
15
+ type_ = str
16
+ res[name] = type_
17
+
18
+ return res