datachain 0.28.1__tar.gz → 0.29.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (408)
  1. {datachain-0.28.1 → datachain-0.29.0}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.28.1 → datachain-0.29.0}/PKG-INFO +1 -1
  3. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/warehouse.py +2 -1
  4. datachain-0.29.0/src/datachain/lib/dc/database.py +330 -0
  5. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/datachain.py +140 -13
  6. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/records.py +4 -2
  7. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/settings.py +23 -0
  8. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/signal_schema.py +2 -2
  9. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/udf.py +27 -4
  10. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/dataset.py +18 -20
  11. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/utils.py +37 -22
  12. {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/PKG-INFO +1 -1
  13. {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/SOURCES.txt +2 -0
  14. {datachain-0.28.1 → datachain-0.29.0}/tests/examples/test_examples.py +0 -1
  15. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_datachain.py +19 -0
  16. datachain-0.29.0/tests/func/test_to_database.py +778 -0
  17. datachain-0.29.0/tests/unit/lib/test_settings.py +61 -0
  18. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_utils.py +19 -0
  19. datachain-0.28.1/src/datachain/lib/dc/database.py +0 -153
  20. {datachain-0.28.1 → datachain-0.29.0}/.cruft.json +0 -0
  21. {datachain-0.28.1 → datachain-0.29.0}/.gitattributes +0 -0
  22. {datachain-0.28.1 → datachain-0.29.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  23. {datachain-0.28.1 → datachain-0.29.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  24. {datachain-0.28.1 → datachain-0.29.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  25. {datachain-0.28.1 → datachain-0.29.0}/.github/codecov.yaml +0 -0
  26. {datachain-0.28.1 → datachain-0.29.0}/.github/dependabot.yml +0 -0
  27. {datachain-0.28.1 → datachain-0.29.0}/.github/workflows/benchmarks.yml +0 -0
  28. {datachain-0.28.1 → datachain-0.29.0}/.github/workflows/release.yml +0 -0
  29. {datachain-0.28.1 → datachain-0.29.0}/.github/workflows/tests-studio.yml +0 -0
  30. {datachain-0.28.1 → datachain-0.29.0}/.github/workflows/tests.yml +0 -0
  31. {datachain-0.28.1 → datachain-0.29.0}/.github/workflows/update-template.yaml +0 -0
  32. {datachain-0.28.1 → datachain-0.29.0}/.gitignore +0 -0
  33. {datachain-0.28.1 → datachain-0.29.0}/CODE_OF_CONDUCT.rst +0 -0
  34. {datachain-0.28.1 → datachain-0.29.0}/LICENSE +0 -0
  35. {datachain-0.28.1 → datachain-0.29.0}/README.rst +0 -0
  36. {datachain-0.28.1 → datachain-0.29.0}/docs/assets/captioned_cartoons.png +0 -0
  37. {datachain-0.28.1 → datachain-0.29.0}/docs/assets/datachain-white.svg +0 -0
  38. {datachain-0.28.1 → datachain-0.29.0}/docs/assets/datachain.svg +0 -0
  39. {datachain-0.28.1 → datachain-0.29.0}/docs/commands/auth/login.md +0 -0
  40. {datachain-0.28.1 → datachain-0.29.0}/docs/commands/auth/logout.md +0 -0
  41. {datachain-0.28.1 → datachain-0.29.0}/docs/commands/auth/team.md +0 -0
  42. {datachain-0.28.1 → datachain-0.29.0}/docs/commands/auth/token.md +0 -0
  43. {datachain-0.28.1 → datachain-0.29.0}/docs/commands/index.md +0 -0
  44. {datachain-0.28.1 → datachain-0.29.0}/docs/commands/job/cancel.md +0 -0
  45. {datachain-0.28.1 → datachain-0.29.0}/docs/commands/job/clusters.md +0 -0
  46. {datachain-0.28.1 → datachain-0.29.0}/docs/commands/job/logs.md +0 -0
  47. {datachain-0.28.1 → datachain-0.29.0}/docs/commands/job/ls.md +0 -0
  48. {datachain-0.28.1 → datachain-0.29.0}/docs/commands/job/run.md +0 -0
  49. {datachain-0.28.1 → datachain-0.29.0}/docs/contributing.md +0 -0
  50. {datachain-0.28.1 → datachain-0.29.0}/docs/css/github-permalink-style.css +0 -0
  51. {datachain-0.28.1 → datachain-0.29.0}/docs/examples.md +0 -0
  52. {datachain-0.28.1 → datachain-0.29.0}/docs/guide/db_migrations.md +0 -0
  53. {datachain-0.28.1 → datachain-0.29.0}/docs/guide/delta.md +0 -0
  54. {datachain-0.28.1 → datachain-0.29.0}/docs/guide/env.md +0 -0
  55. {datachain-0.28.1 → datachain-0.29.0}/docs/guide/index.md +0 -0
  56. {datachain-0.28.1 → datachain-0.29.0}/docs/guide/namespaces.md +0 -0
  57. {datachain-0.28.1 → datachain-0.29.0}/docs/guide/processing.md +0 -0
  58. {datachain-0.28.1 → datachain-0.29.0}/docs/guide/remotes.md +0 -0
  59. {datachain-0.28.1 → datachain-0.29.0}/docs/guide/retry.md +0 -0
  60. {datachain-0.28.1 → datachain-0.29.0}/docs/index.md +0 -0
  61. {datachain-0.28.1 → datachain-0.29.0}/docs/overrides/main.html +0 -0
  62. {datachain-0.28.1 → datachain-0.29.0}/docs/quick-start.md +0 -0
  63. {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/arrowrow.md +0 -0
  64. {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/bbox.md +0 -0
  65. {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/file.md +0 -0
  66. {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/imagefile.md +0 -0
  67. {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/index.md +0 -0
  68. {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/pose.md +0 -0
  69. {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/segment.md +0 -0
  70. {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/tarvfile.md +0 -0
  71. {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/textfile.md +0 -0
  72. {datachain-0.28.1 → datachain-0.29.0}/docs/references/data-types/videofile.md +0 -0
  73. {datachain-0.28.1 → datachain-0.29.0}/docs/references/datachain.md +0 -0
  74. {datachain-0.28.1 → datachain-0.29.0}/docs/references/func.md +0 -0
  75. {datachain-0.28.1 → datachain-0.29.0}/docs/references/index.md +0 -0
  76. {datachain-0.28.1 → datachain-0.29.0}/docs/references/toolkit.md +0 -0
  77. {datachain-0.28.1 → datachain-0.29.0}/docs/references/torch.md +0 -0
  78. {datachain-0.28.1 → datachain-0.29.0}/docs/references/udf.md +0 -0
  79. {datachain-0.28.1 → datachain-0.29.0}/docs/tutorials.md +0 -0
  80. {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  81. {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  82. {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/openimage-detect.py +0 -0
  83. {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  84. {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  85. {datachain-0.28.1 → datachain-0.29.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  86. {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/common_sql_functions.py +0 -0
  87. {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/json-csv-reader.py +0 -0
  88. {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/torch-loader.py +0 -0
  89. {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/udfs/parallel.py +0 -0
  90. {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/udfs/simple.py +0 -0
  91. {datachain-0.28.1 → datachain-0.29.0}/examples/get_started/udfs/stateful.py +0 -0
  92. {datachain-0.28.1 → datachain-0.29.0}/examples/incremental_processing/delta.py +0 -0
  93. {datachain-0.28.1 → datachain-0.29.0}/examples/incremental_processing/retry.py +0 -0
  94. {datachain-0.28.1 → datachain-0.29.0}/examples/incremental_processing/utils.py +0 -0
  95. {datachain-0.28.1 → datachain-0.29.0}/examples/llm_and_nlp/claude-query.py +0 -0
  96. {datachain-0.28.1 → datachain-0.29.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  97. {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/audio-to-text.py +0 -0
  98. {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/clip_inference.py +0 -0
  99. {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/hf_pipeline.py +0 -0
  100. {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  101. {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/wds.py +0 -0
  102. {datachain-0.28.1 → datachain-0.29.0}/examples/multimodal/wds_filtered.py +0 -0
  103. {datachain-0.28.1 → datachain-0.29.0}/mkdocs.yml +0 -0
  104. {datachain-0.28.1 → datachain-0.29.0}/noxfile.py +0 -0
  105. {datachain-0.28.1 → datachain-0.29.0}/pyproject.toml +0 -0
  106. {datachain-0.28.1 → datachain-0.29.0}/setup.cfg +0 -0
  107. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/__init__.py +0 -0
  108. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/__main__.py +0 -0
  109. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/asyn.py +0 -0
  110. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cache.py +0 -0
  111. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/catalog/__init__.py +0 -0
  112. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/catalog/catalog.py +0 -0
  113. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/catalog/datasource.py +0 -0
  114. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/catalog/loader.py +0 -0
  115. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/__init__.py +0 -0
  116. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/__init__.py +0 -0
  117. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/datasets.py +0 -0
  118. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/du.py +0 -0
  119. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/index.py +0 -0
  120. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/ls.py +0 -0
  121. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/misc.py +0 -0
  122. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/query.py +0 -0
  123. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/commands/show.py +0 -0
  124. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/parser/__init__.py +0 -0
  125. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/parser/job.py +0 -0
  126. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/parser/studio.py +0 -0
  127. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/parser/utils.py +0 -0
  128. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/cli/utils.py +0 -0
  129. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/__init__.py +0 -0
  130. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/azure.py +0 -0
  131. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/fileslice.py +0 -0
  132. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/fsspec.py +0 -0
  133. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/gcs.py +0 -0
  134. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/hf.py +0 -0
  135. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/local.py +0 -0
  136. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/client/s3.py +0 -0
  137. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/config.py +0 -0
  138. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/__init__.py +0 -0
  139. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/db_engine.py +0 -0
  140. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/job.py +0 -0
  141. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/metastore.py +0 -0
  142. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/schema.py +0 -0
  143. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/serializer.py +0 -0
  144. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/data_storage/sqlite.py +0 -0
  145. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/dataset.py +0 -0
  146. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/delta.py +0 -0
  147. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/diff/__init__.py +0 -0
  148. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/error.py +0 -0
  149. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/fs/__init__.py +0 -0
  150. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/fs/reference.py +0 -0
  151. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/fs/utils.py +0 -0
  152. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/__init__.py +0 -0
  153. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/aggregate.py +0 -0
  154. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/array.py +0 -0
  155. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/base.py +0 -0
  156. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/conditional.py +0 -0
  157. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/func.py +0 -0
  158. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/numeric.py +0 -0
  159. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/path.py +0 -0
  160. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/random.py +0 -0
  161. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/string.py +0 -0
  162. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/func/window.py +0 -0
  163. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/job.py +0 -0
  164. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/__init__.py +0 -0
  165. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/arrow.py +0 -0
  166. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/audio.py +0 -0
  167. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/clip.py +0 -0
  168. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/__init__.py +0 -0
  169. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/flatten.py +0 -0
  170. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  171. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  172. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/unflatten.py +0 -0
  173. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  174. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/data_model.py +0 -0
  175. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dataset_info.py +0 -0
  176. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/__init__.py +0 -0
  177. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/csv.py +0 -0
  178. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/datasets.py +0 -0
  179. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/hf.py +0 -0
  180. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/json.py +0 -0
  181. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/listings.py +0 -0
  182. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/pandas.py +0 -0
  183. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/parquet.py +0 -0
  184. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/storage.py +0 -0
  185. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/utils.py +0 -0
  186. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/dc/values.py +0 -0
  187. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/file.py +0 -0
  188. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/hf.py +0 -0
  189. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/image.py +0 -0
  190. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/listing.py +0 -0
  191. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/listing_info.py +0 -0
  192. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/meta_formats.py +0 -0
  193. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/model_store.py +0 -0
  194. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/namespaces.py +0 -0
  195. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/projects.py +0 -0
  196. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/pytorch.py +0 -0
  197. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/tar.py +0 -0
  198. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/text.py +0 -0
  199. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/udf_signature.py +0 -0
  200. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/utils.py +0 -0
  201. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/video.py +0 -0
  202. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/webdataset.py +0 -0
  203. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/lib/webdataset_laion.py +0 -0
  204. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/listing.py +0 -0
  205. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/__init__.py +0 -0
  206. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/bbox.py +0 -0
  207. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/pose.py +0 -0
  208. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/segment.py +0 -0
  209. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  210. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  211. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/ultralytics/pose.py +0 -0
  212. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/ultralytics/segment.py +0 -0
  213. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/model/utils.py +0 -0
  214. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/namespace.py +0 -0
  215. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/node.py +0 -0
  216. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/nodes_fetcher.py +0 -0
  217. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/nodes_thread_pool.py +0 -0
  218. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/progress.py +0 -0
  219. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/project.py +0 -0
  220. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/py.typed +0 -0
  221. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/__init__.py +0 -0
  222. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/batch.py +0 -0
  223. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/dispatch.py +0 -0
  224. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/metrics.py +0 -0
  225. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/params.py +0 -0
  226. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/queue.py +0 -0
  227. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/schema.py +0 -0
  228. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/session.py +0 -0
  229. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/udf.py +0 -0
  230. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/query/utils.py +0 -0
  231. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/remote/__init__.py +0 -0
  232. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/remote/studio.py +0 -0
  233. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/script_meta.py +0 -0
  234. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/semver.py +0 -0
  235. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/__init__.py +0 -0
  236. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/default/__init__.py +0 -0
  237. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/default/base.py +0 -0
  238. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/__init__.py +0 -0
  239. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/aggregate.py +0 -0
  240. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/array.py +0 -0
  241. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/conditional.py +0 -0
  242. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/numeric.py +0 -0
  243. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/path.py +0 -0
  244. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/random.py +0 -0
  245. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/functions/string.py +0 -0
  246. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/selectable.py +0 -0
  247. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  248. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/sqlite/base.py +0 -0
  249. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/sqlite/types.py +0 -0
  250. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/sqlite/vector.py +0 -0
  251. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/types.py +0 -0
  252. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/sql/utils.py +0 -0
  253. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/studio.py +0 -0
  254. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/telemetry.py +0 -0
  255. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/toolkit/__init__.py +0 -0
  256. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/toolkit/split.py +0 -0
  257. {datachain-0.28.1 → datachain-0.29.0}/src/datachain/torch/__init__.py +0 -0
  258. {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  259. {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/entry_points.txt +0 -0
  260. {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/requires.txt +0 -0
  261. {datachain-0.28.1 → datachain-0.29.0}/src/datachain.egg-info/top_level.txt +0 -0
  262. {datachain-0.28.1 → datachain-0.29.0}/tests/__init__.py +0 -0
  263. {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/__init__.py +0 -0
  264. {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/conftest.py +0 -0
  265. {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  266. {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  267. {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/datasets/.gitignore +0 -0
  268. {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  269. {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/test_datachain.py +0 -0
  270. {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/test_ls.py +0 -0
  271. {datachain-0.28.1 → datachain-0.29.0}/tests/benchmarks/test_version.py +0 -0
  272. {datachain-0.28.1 → datachain-0.29.0}/tests/conftest.py +0 -0
  273. {datachain-0.28.1 → datachain-0.29.0}/tests/data.py +0 -0
  274. {datachain-0.28.1 → datachain-0.29.0}/tests/examples/__init__.py +0 -0
  275. {datachain-0.28.1 → datachain-0.29.0}/tests/examples/test_wds_e2e.py +0 -0
  276. {datachain-0.28.1 → datachain-0.29.0}/tests/examples/wds_data.py +0 -0
  277. {datachain-0.28.1 → datachain-0.29.0}/tests/func/__init__.py +0 -0
  278. {datachain-0.28.1 → datachain-0.29.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  279. {datachain-0.28.1 → datachain-0.29.0}/tests/func/data/lena.jpg +0 -0
  280. {datachain-0.28.1 → datachain-0.29.0}/tests/func/fake-service-account-credentials.json +0 -0
  281. {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/__init__.py +0 -0
  282. {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_aggregate.py +0 -0
  283. {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_array.py +0 -0
  284. {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_conditional.py +0 -0
  285. {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_numeric.py +0 -0
  286. {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_path.py +0 -0
  287. {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_random.py +0 -0
  288. {datachain-0.28.1 → datachain-0.29.0}/tests/func/functions/test_string.py +0 -0
  289. {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/__init__.py +0 -0
  290. {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/data/running-mask0.png +0 -0
  291. {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/data/running-mask1.png +0 -0
  292. {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/data/running.jpg +0 -0
  293. {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/data/ships.jpg +0 -0
  294. {datachain-0.28.1 → datachain-0.29.0}/tests/func/model/test_yolo.py +0 -0
  295. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_audio.py +0 -0
  296. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_batching.py +0 -0
  297. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_catalog.py +0 -0
  298. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_client.py +0 -0
  299. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_cloud_transfer.py +0 -0
  300. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_data_storage.py +0 -0
  301. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_datachain_merge.py +0 -0
  302. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_dataset_query.py +0 -0
  303. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_datasets.py +0 -0
  304. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_delta.py +0 -0
  305. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_feature_pickling.py +0 -0
  306. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_file.py +0 -0
  307. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_hf.py +0 -0
  308. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_hidden_field.py +0 -0
  309. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_image.py +0 -0
  310. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_listing.py +0 -0
  311. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_ls.py +0 -0
  312. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_meta_formats.py +0 -0
  313. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_metastore.py +0 -0
  314. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_metrics.py +0 -0
  315. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_pull.py +0 -0
  316. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_pytorch.py +0 -0
  317. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_query.py +0 -0
  318. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_read_database.py +0 -0
  319. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_read_dataset_remote.py +0 -0
  320. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  321. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_retry.py +0 -0
  322. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_session.py +0 -0
  323. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_studio_datetime_parsing.py +0 -0
  324. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_toolkit.py +0 -0
  325. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_video.py +0 -0
  326. {datachain-0.28.1 → datachain-0.29.0}/tests/func/test_warehouse.py +0 -0
  327. {datachain-0.28.1 → datachain-0.29.0}/tests/scripts/feature_class.py +0 -0
  328. {datachain-0.28.1 → datachain-0.29.0}/tests/scripts/feature_class_exception.py +0 -0
  329. {datachain-0.28.1 → datachain-0.29.0}/tests/scripts/feature_class_parallel.py +0 -0
  330. {datachain-0.28.1 → datachain-0.29.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  331. {datachain-0.28.1 → datachain-0.29.0}/tests/scripts/name_len_slow.py +0 -0
  332. {datachain-0.28.1 → datachain-0.29.0}/tests/test_atomicity.py +0 -0
  333. {datachain-0.28.1 → datachain-0.29.0}/tests/test_cli_e2e.py +0 -0
  334. {datachain-0.28.1 → datachain-0.29.0}/tests/test_cli_studio.py +0 -0
  335. {datachain-0.28.1 → datachain-0.29.0}/tests/test_import_time.py +0 -0
  336. {datachain-0.28.1 → datachain-0.29.0}/tests/test_query_e2e.py +0 -0
  337. {datachain-0.28.1 → datachain-0.29.0}/tests/test_telemetry.py +0 -0
  338. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/__init__.py +0 -0
  339. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/__init__.py +0 -0
  340. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/conftest.py +0 -0
  341. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_arrow.py +0 -0
  342. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_audio.py +0 -0
  343. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_clip.py +0 -0
  344. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_datachain.py +0 -0
  345. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  346. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  347. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_diff.py +0 -0
  348. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_feature.py +0 -0
  349. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_feature_utils.py +0 -0
  350. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_file.py +0 -0
  351. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_hf.py +0 -0
  352. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_image.py +0 -0
  353. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_listing_info.py +0 -0
  354. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_namespace.py +0 -0
  355. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_partition_by.py +0 -0
  356. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_project.py +0 -0
  357. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  358. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_schema.py +0 -0
  359. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_signal_schema.py +0 -0
  360. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  361. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_text.py +0 -0
  362. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_udf.py +0 -0
  363. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_udf_signature.py +0 -0
  364. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_utils.py +0 -0
  365. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/lib/test_webdataset.py +0 -0
  366. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/model/__init__.py +0 -0
  367. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/model/test_bbox.py +0 -0
  368. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/model/test_pose.py +0 -0
  369. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/model/test_segment.py +0 -0
  370. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/model/test_utils.py +0 -0
  371. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/__init__.py +0 -0
  372. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  373. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  374. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  375. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_array.py +0 -0
  376. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_conditional.py +0 -0
  377. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_path.py +0 -0
  378. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_random.py +0 -0
  379. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_selectable.py +0 -0
  380. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/sql/test_string.py +0 -0
  381. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_asyn.py +0 -0
  382. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_cache.py +0 -0
  383. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_catalog.py +0 -0
  384. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_catalog_loader.py +0 -0
  385. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_cli_parsing.py +0 -0
  386. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_client.py +0 -0
  387. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_client_gcs.py +0 -0
  388. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_client_s3.py +0 -0
  389. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_config.py +0 -0
  390. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_data_storage.py +0 -0
  391. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_database_engine.py +0 -0
  392. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_dataset.py +0 -0
  393. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_dispatch.py +0 -0
  394. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_fileslice.py +0 -0
  395. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_func.py +0 -0
  396. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_listing.py +0 -0
  397. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_metastore.py +0 -0
  398. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_module_exports.py +0 -0
  399. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_pytorch.py +0 -0
  400. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_query.py +0 -0
  401. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_query_metrics.py +0 -0
  402. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_query_params.py +0 -0
  403. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_script_meta.py +0 -0
  404. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_semver.py +0 -0
  405. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_serializer.py +0 -0
  406. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_session.py +0 -0
  407. {datachain-0.28.1 → datachain-0.29.0}/tests/unit/test_warehouse.py +0 -0
  408. {datachain-0.28.1 → datachain-0.29.0}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.12.5'
27
+ rev: 'v0.12.7'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.28.1
3
+ Version: 0.29.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -21,6 +21,7 @@ from datachain.lib.file import File
21
21
  from datachain.lib.signal_schema import SignalSchema
22
22
  from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
23
23
  from datachain.query.batch import RowsOutput
24
+ from datachain.query.schema import ColumnMeta
24
25
  from datachain.query.utils import get_query_id_column
25
26
  from datachain.sql.functions import path as pathfunc
26
27
  from datachain.sql.types import Int, SQLType
@@ -400,7 +401,7 @@ class AbstractWarehouse(ABC, Serializable):
400
401
  expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
401
402
  sa.func.count(table.c.sys__id),
402
403
  )
403
- size_column_names = [s.replace(".", "__") + "__size" for s in file_signals]
404
+ size_column_names = [ColumnMeta.to_db_name(s) + "__size" for s in file_signals]
404
405
  size_columns = [c for c in table.columns if c.name in size_column_names]
405
406
 
406
407
  if size_columns:
@@ -0,0 +1,330 @@
1
+ import contextlib
2
+ import itertools
3
+ import os
4
+ import sqlite3
5
+ from typing import TYPE_CHECKING, Any, Optional, Union
6
+
7
+ import sqlalchemy
8
+
9
+ from datachain.query.schema import ColumnMeta
10
+
11
+ DEFAULT_DATABASE_BATCH_SIZE = 10_000
12
+
13
+ if TYPE_CHECKING:
14
+ from collections.abc import Iterator, Mapping, Sequence
15
+
16
+ import sqlalchemy.orm # noqa: TC004
17
+
18
+ from datachain.lib.data_model import DataType
19
+ from datachain.query import Session
20
+
21
+ from .datachain import DataChain
22
+
23
+ ConnectionType = Union[
24
+ str,
25
+ sqlalchemy.engine.URL,
26
+ sqlalchemy.engine.interfaces.Connectable,
27
+ sqlalchemy.engine.Engine,
28
+ sqlalchemy.engine.Connection,
29
+ sqlalchemy.orm.Session,
30
+ sqlite3.Connection,
31
+ ]
32
+
33
+
34
@contextlib.contextmanager
def _connect(
    connection: "ConnectionType",
) -> "Iterator[sqlalchemy.engine.Connection]":
    """Yield a SQLAlchemy connection for any supported ``connection`` value.

    Resources created here (an engine built from a URL/str, or a connection
    opened on a caller-supplied engine) are released when the context exits.
    Connections handed in by the caller are yielded as-is and never closed.

    Raises:
        TypeError: if ``connection`` is not one of the supported types.
    """
    import sqlalchemy.orm

    echo_sql = bool(os.environ.get("DEBUG_SHOW_SQL_QUERIES"))

    with contextlib.ExitStack() as cleanup:
        if isinstance(connection, (str, sqlalchemy.URL)):
            # The engine is ours: dispose of it once the connection is closed.
            owned_engine = sqlalchemy.create_engine(connection, echo=echo_sql)
            cleanup.callback(owned_engine.dispose)
            active = cleanup.enter_context(owned_engine.connect())
        elif isinstance(connection, sqlite3.Connection):
            wrapping_engine = sqlalchemy.create_engine(
                "sqlite://", creator=lambda: connection, echo=echo_sql
            )
            # do not close the connection, as it is managed by the caller
            active = wrapping_engine.connect()
        elif isinstance(connection, sqlalchemy.Engine):
            active = cleanup.enter_context(connection.connect())
        elif isinstance(connection, sqlalchemy.Connection):
            # do not close the connection, as it is managed by the caller
            active = connection
        elif isinstance(connection, sqlalchemy.orm.Session):
            # Sessions don't support DDL operations directly, so work on the
            # underlying bind, which is either an Engine or a Connection.
            bind = connection.get_bind()
            if isinstance(bind, sqlalchemy.Engine):
                active = cleanup.enter_context(bind.connect())
            else:
                active = bind
        else:
            raise TypeError(f"Unsupported connection type: {type(connection).__name__}")

        yield active
68
+
69
+
70
def to_database(
    chain: "DataChain",
    table_name: str,
    connection: "ConnectionType",
    *,
    batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
    on_conflict: Optional[str] = None,
    column_mapping: Optional[dict[str, Optional[str]]] = None,
) -> None:
    """
    Export the rows of a DataChain into a database table.

    This is the implementation behind ``DataChain.to_database()``; see that
    method for the user-facing description of the parameters.

    The table is created when missing, rows are inserted in batches of
    ``batch_rows`` and a single commit is issued at the end. If anything fails
    and the table did not exist beforehand, a best-effort drop of the table is
    attempted before the original exception is re-raised.

    Raises:
        ValueError: if ``on_conflict`` is set to anything other than
            ``"ignore"`` or ``"update"``.
    """
    from datachain.utils import batched

    if on_conflict and on_conflict not in ("ignore", "update"):
        raise ValueError(
            f"on_conflict must be 'ignore' or 'update', got: {on_conflict}"
        )

    # System signals are not exported.
    export_schema = chain.signals_schema.clone_without_sys_signals()
    source_columns = []
    for signal_col in export_schema.db_signals(as_columns=True):
        source_columns.append(
            sqlalchemy.Column(signal_col.name, signal_col.type)  # type: ignore[union-attr]
        )

    db_mapping = _normalize_column_mapping(column_mapping or {})
    selected_columns, table_columns = _prepare_columns(source_columns, db_mapping)

    with _connect(connection) as conn:
        target = sqlalchemy.Table(table_name, sqlalchemy.MetaData(), *table_columns)

        # Remember whether the table pre-existed: only tables created by this
        # call are dropped when the export fails.
        inspector = sqlalchemy.inspect(conn)
        assert inspector  # to satisfy mypy
        existed_before = table_name in inspector.get_table_names()

        try:
            target.create(conn, checkfirst=True)
            for batch in batched(chain._leaf_values(), batch_rows):
                _process_batch(conn, target, batch, on_conflict, selected_columns)
            conn.commit()
        except Exception:
            if not existed_before:
                # Cleanup is best effort; the original error is what matters.
                with contextlib.suppress(sqlalchemy.exc.SQLAlchemyError):
                    target.drop(conn, checkfirst=True)
                    conn.commit()
            raise
129
+
130
+
131
+ def _normalize_column_mapping(
132
+ column_mapping: dict[str, Optional[str]],
133
+ ) -> dict[str, Optional[str]]:
134
+ """
135
+ Convert column mapping keys from DataChain format (dots) to database format
136
+ (double underscores).
137
+
138
+ This allows users to specify column mappings using the intuitive DataChain
139
+ format like: {"nested_data.value": "data_value"} instead of
140
+ {"nested_data__value": "data_value"}
141
+ """
142
+ if not column_mapping:
143
+ return {}
144
+
145
+ normalized_mapping: dict[str, Optional[str]] = {}
146
+ original_keys: dict[str, str] = {}
147
+ for key, value in column_mapping.items():
148
+ db_key = ColumnMeta.to_db_name(key)
149
+ if db_key in normalized_mapping:
150
+ prev = original_keys[db_key]
151
+ raise ValueError(
152
+ "Column mapping collision: multiple keys map to the same "
153
+ f"database column name '{db_key}': '{prev}' and '{key}'. "
154
+ )
155
+ normalized_mapping[db_key] = value
156
+ original_keys[db_key] = key
157
+
158
+ # If it's a defaultdict, preserve the default factory
159
+ if hasattr(column_mapping, "default_factory"):
160
+ from collections import defaultdict
161
+
162
+ default_factory = column_mapping.default_factory
163
+ result: dict[str, Optional[str]] = defaultdict(default_factory)
164
+ result.update(normalized_mapping)
165
+ return result
166
+
167
+ return normalized_mapping
168
+
169
+
170
+ def _prepare_columns(all_columns, column_mapping):
171
+ """Prepare column mapping and column definitions."""
172
+ column_indices_and_names = [] # List of (index, target_name) tuples
173
+ columns = []
174
+ for idx, col in enumerate(all_columns):
175
+ if col.name in column_mapping or hasattr(column_mapping, "default_factory"):
176
+ mapped_name = column_mapping[col.name]
177
+ if mapped_name:
178
+ columns.append(sqlalchemy.Column(mapped_name, col.type))
179
+ column_indices_and_names.append((idx, mapped_name))
180
+ else:
181
+ columns.append(col)
182
+ column_indices_and_names.append((idx, col.name))
183
+ return column_indices_and_names, columns
184
+
185
+
186
+ def _process_batch(conn, table, batch, on_conflict, column_indices_and_names):
187
+ """Process a batch of rows with conflict resolution."""
188
+
189
+ def prepare_row(row_values):
190
+ """Convert a row tuple to a dictionary with proper DB column names."""
191
+ return {
192
+ target_name: row_values[idx]
193
+ for idx, target_name in column_indices_and_names
194
+ }
195
+
196
+ rows_to_insert = [prepare_row(row) for row in batch]
197
+
198
+ supports_conflict = on_conflict and conn.engine.name in ("postgresql", "sqlite")
199
+
200
+ if supports_conflict:
201
+ # Use dialect-specific insert for conflict resolution
202
+ if conn.engine.name == "postgresql":
203
+ from sqlalchemy.dialects.postgresql import insert as pg_insert
204
+
205
+ insert_stmt = pg_insert(table)
206
+ elif conn.engine.name == "sqlite":
207
+ from sqlalchemy.dialects.sqlite import insert as sqlite_insert
208
+
209
+ insert_stmt = sqlite_insert(table)
210
+ else:
211
+ insert_stmt = table.insert()
212
+
213
+ if supports_conflict:
214
+ if on_conflict == "ignore":
215
+ insert_stmt = insert_stmt.on_conflict_do_nothing()
216
+ elif on_conflict == "update":
217
+ update_values = {
218
+ col.name: insert_stmt.excluded[col.name] for col in table.columns
219
+ }
220
+ insert_stmt = insert_stmt.on_conflict_do_update(set_=update_values)
221
+ elif on_conflict:
222
+ import warnings
223
+
224
+ warnings.warn(
225
+ f"Database does not support conflict resolution. "
226
+ f"Ignoring on_conflict='{on_conflict}' parameter.",
227
+ UserWarning,
228
+ stacklevel=2,
229
+ )
230
+
231
+ conn.execute(insert_stmt, rows_to_insert)
232
+
233
+
234
def read_database(
    query: Union[str, "sqlalchemy.sql.expression.Executable"],
    connection: "ConnectionType",
    params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
    *,
    output: Optional["dict[str, DataType]"] = None,
    session: Optional["Session"] = None,
    settings: Optional[dict] = None,
    in_memory: bool = False,
    infer_schema_length: Optional[int] = 100,
) -> "DataChain":
    """
    Run a SQL query on the given database connection and load its results into
    a DataChain.

    Args:
        query:
            The SQL query to execute, either as a raw SQL string or as a
            SQLAlchemy `Executable` object.
        connection: SQLAlchemy connectable, str, or a sqlite3 connection
            Going through SQLAlchemy makes any database supported by that
            library usable. If a DBAPI2 object is passed, only sqlite3 is
            supported. For a SQLAlchemy connectable the caller remains
            responsible for engine disposal and connection closure; str
            connections are closed automatically.
        params: Parameters to pass to the execute method.
        output: A dictionary mapping column names to types; it overrides the
            schema inferred from the query results.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        in_memory: If True, creates an in-memory session. Defaults to False.
        infer_schema_length:
            The maximum number of rows to scan when inferring the schema.
            With `None`, the full data may be scanned.
            Rows used for schema inference are held in memory, so large
            values can lead to high memory usage.
            Only applies to columns not covered by the `output` parameter.

    Examples:
        Reading from a SQL query against a user-supplied connection:
        ```python
        query = "SELECT key, value FROM tbl"
        chain = dc.read_database(query, connection, output={"value": float})
        ```

        Load data from a SQLAlchemy driver/engine:
        ```python
        from sqlalchemy import create_engine
        engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
        chain = dc.read_database("select * from tbl", engine)
        ```

        Load data from a parameterized SQLAlchemy query:
        ```python
        query = "SELECT key, value FROM tbl WHERE value > :value"
        dc.read_database(query, engine, params={"value": 50})
        ```

    Notes:
        - This function works with a variety of databases — including,
          but not limited to, SQLite, DuckDB, PostgreSQL, and Snowflake,
          provided the appropriate driver is installed.
        - This call is blocking: it executes the query and returns once the
          results are saved.
    """
    from datachain.lib.dc.records import read_records

    type_overrides = output or {}
    statement = sqlalchemy.text(query) if isinstance(query, str) else query
    # use server-side cursors
    execute_kwargs = {"execution_options": {"stream_results": True}}

    with _connect(connection) as conn:
        with conn.execute(statement, params, **execute_kwargs) as result:
            # Result order is kept so inferred columns follow the query order.
            pending = [name for name in result.keys() if name not in type_overrides]
            sampled_rows, inferred_schema = _infer_schema(
                result, pending, infer_schema_length
            )
            # Rows consumed for inference are replayed before the rest of the
            # result so that no record is lost.
            records = (
                row._asdict() for row in itertools.chain(sampled_rows, result)
            )
            return read_records(
                records,
                session=session,
                settings=settings,
                in_memory=in_memory,
                schema=inferred_schema | type_overrides,
            )
315
+
316
+
317
def _infer_schema(
    result: "sqlalchemy.engine.Result",
    to_infer: list[str],
    infer_schema_length: Optional[int] = 100,
) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
    """Infer types for the ``to_infer`` columns from the first rows of ``result``.

    Up to ``infer_schema_length`` rows are consumed from ``result`` (all of
    them when it is ``None``) and returned together with the inferred schema,
    so the caller can still process them. When ``to_infer`` is empty nothing
    is consumed and ``([], {})`` is returned.
    """
    from datachain.lib.convert.values_to_tuples import values_to_tuples

    if to_infer:
        sampled_rows = list(itertools.islice(result, infer_schema_length))
        values_by_column = {}
        for column in to_infer:
            values_by_column[column] = [row._mapping[column] for row in sampled_rows]
        _, inferred, _ = values_to_tuples("", **values_by_column)
        return sampled_rows, inferred

    return [], {}
@@ -58,6 +58,7 @@ from datachain.query.schema import DEFAULT_DELIMITER, Column
58
58
  from datachain.sql.functions import path as pathfunc
59
59
  from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
60
60
 
61
+ from .database import DEFAULT_DATABASE_BATCH_SIZE
61
62
  from .utils import (
62
63
  DatasetMergeError,
63
64
  DatasetPrepareError,
@@ -77,11 +78,23 @@ UDFObjT = TypeVar("UDFObjT", bound=UDFBase)
77
78
  DEFAULT_PARQUET_CHUNK_SIZE = 100_000
78
79
 
79
80
  if TYPE_CHECKING:
81
+ import sqlite3
82
+
80
83
  import pandas as pd
81
84
  from typing_extensions import ParamSpec, Self
82
85
 
83
86
  P = ParamSpec("P")
84
87
 
88
+ ConnectionType = Union[
89
+ str,
90
+ sqlalchemy.engine.URL,
91
+ sqlalchemy.engine.interfaces.Connectable,
92
+ sqlalchemy.engine.Engine,
93
+ sqlalchemy.engine.Connection,
94
+ "sqlalchemy.orm.Session",
95
+ sqlite3.Connection,
96
+ ]
97
+
85
98
 
86
99
  T = TypeVar("T", bound="DataChain")
87
100
 
@@ -324,6 +337,7 @@ class DataChain:
324
337
  sys: Optional[bool] = None,
325
338
  namespace: Optional[str] = None,
326
339
  project: Optional[str] = None,
340
+ batch_rows: Optional[int] = None,
327
341
  ) -> "Self":
328
342
  """Change settings for chain.
329
343
 
@@ -331,22 +345,24 @@ class DataChain:
331
345
  It returns chain, so, it can be chained later with next operation.
332
346
 
333
347
  Parameters:
334
- cache : data caching (default=False)
348
+ cache : data caching. (default=False)
335
349
  parallel : number of thread for processors. True is a special value to
336
- enable all available CPUs (default=1)
350
+ enable all available CPUs. (default=1)
337
351
  workers : number of distributed workers. Only for Studio mode. (default=1)
338
- min_task_size : minimum number of tasks (default=1)
339
- prefetch: number of workers to use for downloading files in advance.
352
+ min_task_size : minimum number of tasks. (default=1)
353
+ prefetch : number of workers to use for downloading files in advance.
340
354
  This is enabled by default and uses 2 workers.
341
355
  To disable prefetching, set it to 0.
342
- namespace: namespace name.
343
- project: project name.
356
+ namespace : namespace name.
357
+ project : project name.
358
+ batch_rows : row limit per insert to balance speed and memory usage.
359
+ (default=2000)
344
360
 
345
361
  Example:
346
362
  ```py
347
363
  chain = (
348
364
  chain
349
- .settings(cache=True, parallel=8)
365
+ .settings(cache=True, parallel=8, batch_rows=300)
350
366
  .map(laion=process_webdataset(spec=WDSLaion), params="file")
351
367
  )
352
368
  ```
@@ -356,7 +372,14 @@ class DataChain:
356
372
  settings = copy.copy(self._settings)
357
373
  settings.add(
358
374
  Settings(
359
- cache, parallel, workers, min_task_size, prefetch, namespace, project
375
+ cache,
376
+ parallel,
377
+ workers,
378
+ min_task_size,
379
+ prefetch,
380
+ namespace,
381
+ project,
382
+ batch_rows,
360
383
  )
361
384
  )
362
385
  return self._evolve(settings=settings, _sys=sys)
@@ -711,7 +734,7 @@ class DataChain:
711
734
 
712
735
  return self._evolve(
713
736
  query=self._query.add_signals(
714
- udf_obj.to_udf_wrapper(),
737
+ udf_obj.to_udf_wrapper(self._settings.batch_rows),
715
738
  **self._settings.to_dict(),
716
739
  ),
717
740
  signal_schema=self.signals_schema | udf_obj.output,
@@ -749,7 +772,7 @@ class DataChain:
749
772
  udf_obj.prefetch = prefetch
750
773
  return self._evolve(
751
774
  query=self._query.generate(
752
- udf_obj.to_udf_wrapper(),
775
+ udf_obj.to_udf_wrapper(self._settings.batch_rows),
753
776
  **self._settings.to_dict(),
754
777
  ),
755
778
  signal_schema=udf_obj.output,
@@ -885,7 +908,7 @@ class DataChain:
885
908
  udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
886
909
  return self._evolve(
887
910
  query=self._query.generate(
888
- udf_obj.to_udf_wrapper(),
911
+ udf_obj.to_udf_wrapper(self._settings.batch_rows),
889
912
  partition_by=processed_partition_by,
890
913
  **self._settings.to_dict(),
891
914
  ),
@@ -917,11 +940,24 @@ class DataChain:
917
940
  )
918
941
  chain.save("new_dataset")
919
942
  ```
943
+
944
+ .. deprecated:: 0.29.0
945
+ This method is deprecated and will be removed in a future version.
946
+ Use `agg()` instead, which provides the similar functionality.
920
947
  """
948
+ import warnings
949
+
950
+ warnings.warn(
951
+ "batch_map() is deprecated and will be removed in a future version. "
952
+ "Use agg() instead, which provides the similar functionality.",
953
+ DeprecationWarning,
954
+ stacklevel=2,
955
+ )
921
956
  udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
957
+
922
958
  return self._evolve(
923
959
  query=self._query.add_signals(
924
- udf_obj.to_udf_wrapper(batch),
960
+ udf_obj.to_udf_wrapper(self._settings.batch_rows, batch=batch),
925
961
  **self._settings.to_dict(),
926
962
  ),
927
963
  signal_schema=self.signals_schema | udf_obj.output,
@@ -2253,6 +2289,97 @@ class DataChain:
2253
2289
  """
2254
2290
  self.to_json(path, fs_kwargs, include_outer_list=False)
2255
2291
 
2292
def to_database(
    self,
    table_name: str,
    connection: "ConnectionType",
    *,
    batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
    on_conflict: Optional[str] = None,
    column_mapping: Optional[dict[str, Optional[str]]] = None,
) -> None:
    """Save chain to a database table using a given database connection.

    All records of the chain are exported to the table. The table is created
    when it doesn't exist; otherwise the data is appended to it. The table
    schema is derived automatically from the chain's signal schema.

    Parameters:
        table_name: Name of the database table to create/write to.
        connection: SQLAlchemy connectable, str, or a sqlite3 connection
            Going through SQLAlchemy makes any database supported by that
            library usable. If a DBAPI2 object is passed, only sqlite3 is
            supported. For a SQLAlchemy connectable the caller remains
            responsible for engine disposal and connection closure; str
            connections are closed automatically.
        batch_rows: Number of rows inserted per batch. Larger batches are
            faster but use more memory. Default: 10,000.
        on_conflict: Strategy for handling duplicate rows (requires table
            constraints):
            - None: Raise error (`sqlalchemy.exc.IntegrityError`) on conflict
              (default)
            - "ignore": Skip duplicate rows silently
            - "update": Update existing rows with new values
        column_mapping: Optional mapping to rename or skip columns:
            - Dict mapping DataChain column names to database column names
            - Set values to None to skip columns entirely, or use `defaultdict`
              to skip all columns except those specified.

    Examples:
        Basic usage with PostgreSQL:
        ```py
        import sqlalchemy as sa
        import datachain as dc

        chain = dc.read_storage("s3://my-bucket/")
        engine = sa.create_engine("postgresql://user:pass@localhost/mydb")
        chain.to_database("files_table", engine)
        ```

        Using SQLite with connection string:
        ```py
        chain.to_database("my_table", "sqlite:///data.db")
        ```

        Column mapping and renaming:
        ```py
        mapping = {
            "user.id": "id",
            "user.name": "name",
            "user.password": None  # Skip this column
        }
        chain.to_database("users", engine, column_mapping=mapping)
        ```

        Handling conflicts (requires PRIMARY KEY or UNIQUE constraints):
        ```py
        # Skip duplicates
        chain.to_database("my_table", engine, on_conflict="ignore")

        # Update existing records
        chain.to_database("my_table", engine, on_conflict="update")
        ```

        Working with different databases:
        ```py
        # MySQL
        mysql_engine = sa.create_engine("mysql+pymysql://user:pass@host/db")
        chain.to_database("mysql_table", mysql_engine)

        # SQLite in-memory
        chain.to_database("temp_table", "sqlite:///:memory:")
        ```
    """
    # Imported lazily and under another name: the module-level function
    # shares this method's name.
    from .database import to_database as export_to_database

    export_options = {
        "batch_rows": batch_rows,
        "on_conflict": on_conflict,
        "column_mapping": column_mapping,
    }
    export_to_database(self, table_name, connection, **export_options)
2382
+
2256
2383
  @classmethod
2257
2384
  def from_records(
2258
2385
  cls,
@@ -2340,7 +2467,7 @@ class DataChain:
2340
2467
  def setup(self, **kwargs) -> "Self":
2341
2468
  """Setup variables to pass to UDF functions.
2342
2469
 
2343
- Use before running map/gen/agg/batch_map to save an object and pass it as an
2470
+ Use before running map/gen/agg to save an object and pass it as an
2344
2471
  argument to the UDF.
2345
2472
 
2346
2473
  The value must be a callable (a `lambda: <value>` syntax can be used to quickly
@@ -15,6 +15,8 @@ if TYPE_CHECKING:
15
15
 
16
16
  P = ParamSpec("P")
17
17
 
18
+ READ_RECORDS_BATCH_SIZE = 10000
19
+
18
20
 
19
21
  def read_records(
20
22
  to_insert: Optional[Union[dict, Iterable[dict]]],
@@ -41,7 +43,7 @@ def read_records(
41
43
  Notes:
42
44
  This call blocks until all records are inserted.
43
45
  """
44
- from datachain.query.dataset import INSERT_BATCH_SIZE, adjust_outputs, get_col_types
46
+ from datachain.query.dataset import adjust_outputs, get_col_types
45
47
  from datachain.sql.types import SQLType
46
48
  from datachain.utils import batched
47
49
 
@@ -94,7 +96,7 @@ def read_records(
94
96
  {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
95
97
  )
96
98
  records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
97
- for chunk in batched(records, INSERT_BATCH_SIZE):
99
+ for chunk in batched(records, READ_RECORDS_BATCH_SIZE):
98
100
  warehouse.insert_rows(table, chunk)
99
101
  warehouse.insert_rows_done(table)
100
102
  return read_dataset(name=dsr.full_name, session=session, settings=settings)