datachain 0.14.5__tar.gz → 0.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (355)
  1. {datachain-0.14.5 → datachain-0.15.0}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.14.5/src/datachain.egg-info → datachain-0.15.0}/PKG-INFO +1 -1
  3. {datachain-0.14.5 → datachain-0.15.0}/docs/references/datachain.md +4 -0
  4. {datachain-0.14.5 → datachain-0.15.0}/examples/multimodal/wds_filtered.py +1 -1
  5. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/__init__.py +4 -0
  6. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/catalog/catalog.py +10 -0
  7. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/catalog/loader.py +11 -7
  8. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/values_to_tuples.py +23 -14
  9. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/__init__.py +4 -1
  10. datachain-0.15.0/src/datachain/lib/dc/database.py +151 -0
  11. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/datachain.py +15 -5
  12. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/datasets.py +43 -0
  13. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/pandas.py +8 -1
  14. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/records.py +12 -14
  15. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/signal_schema.py +10 -1
  16. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/dataset.py +10 -12
  17. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/dispatch.py +7 -2
  18. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/schema.py +4 -1
  19. {datachain-0.14.5 → datachain-0.15.0/src/datachain.egg-info}/PKG-INFO +1 -1
  20. {datachain-0.14.5 → datachain-0.15.0}/src/datachain.egg-info/SOURCES.txt +2 -0
  21. {datachain-0.14.5 → datachain-0.15.0}/tests/conftest.py +4 -1
  22. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_catalog.py +3 -3
  23. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_datachain.py +28 -4
  24. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_dataset_query.py +0 -60
  25. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_hidden_field.py +1 -1
  26. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_pull.py +9 -7
  27. datachain-0.15.0/tests/func/test_read_database.py +175 -0
  28. {datachain-0.14.5 → datachain-0.15.0}/tests/test_import_time.py +1 -1
  29. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_datachain.py +83 -2
  30. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_feature_utils.py +0 -5
  31. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_catalog_loader.py +21 -10
  32. {datachain-0.14.5 → datachain-0.15.0}/.cruft.json +0 -0
  33. {datachain-0.14.5 → datachain-0.15.0}/.gitattributes +0 -0
  34. {datachain-0.14.5 → datachain-0.15.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  35. {datachain-0.14.5 → datachain-0.15.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  36. {datachain-0.14.5 → datachain-0.15.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  37. {datachain-0.14.5 → datachain-0.15.0}/.github/codecov.yaml +0 -0
  38. {datachain-0.14.5 → datachain-0.15.0}/.github/dependabot.yml +0 -0
  39. {datachain-0.14.5 → datachain-0.15.0}/.github/workflows/benchmarks.yml +0 -0
  40. {datachain-0.14.5 → datachain-0.15.0}/.github/workflows/release.yml +0 -0
  41. {datachain-0.14.5 → datachain-0.15.0}/.github/workflows/tests-studio.yml +0 -0
  42. {datachain-0.14.5 → datachain-0.15.0}/.github/workflows/tests.yml +0 -0
  43. {datachain-0.14.5 → datachain-0.15.0}/.github/workflows/update-template.yaml +0 -0
  44. {datachain-0.14.5 → datachain-0.15.0}/.gitignore +0 -0
  45. {datachain-0.14.5 → datachain-0.15.0}/CODE_OF_CONDUCT.rst +0 -0
  46. {datachain-0.14.5 → datachain-0.15.0}/LICENSE +0 -0
  47. {datachain-0.14.5 → datachain-0.15.0}/README.rst +0 -0
  48. {datachain-0.14.5 → datachain-0.15.0}/docs/assets/captioned_cartoons.png +0 -0
  49. {datachain-0.14.5 → datachain-0.15.0}/docs/assets/datachain-white.svg +0 -0
  50. {datachain-0.14.5 → datachain-0.15.0}/docs/assets/datachain.svg +0 -0
  51. {datachain-0.14.5 → datachain-0.15.0}/docs/contributing.md +0 -0
  52. {datachain-0.14.5 → datachain-0.15.0}/docs/css/github-permalink-style.css +0 -0
  53. {datachain-0.14.5 → datachain-0.15.0}/docs/examples.md +0 -0
  54. {datachain-0.14.5 → datachain-0.15.0}/docs/index.md +0 -0
  55. {datachain-0.14.5 → datachain-0.15.0}/docs/overrides/main.html +0 -0
  56. {datachain-0.14.5 → datachain-0.15.0}/docs/quick-start.md +0 -0
  57. {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/arrowrow.md +0 -0
  58. {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/bbox.md +0 -0
  59. {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/file.md +0 -0
  60. {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/imagefile.md +0 -0
  61. {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/index.md +0 -0
  62. {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/pose.md +0 -0
  63. {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/segment.md +0 -0
  64. {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/tarvfile.md +0 -0
  65. {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/textfile.md +0 -0
  66. {datachain-0.14.5 → datachain-0.15.0}/docs/references/data-types/videofile.md +0 -0
  67. {datachain-0.14.5 → datachain-0.15.0}/docs/references/func.md +0 -0
  68. {datachain-0.14.5 → datachain-0.15.0}/docs/references/index.md +0 -0
  69. {datachain-0.14.5 → datachain-0.15.0}/docs/references/remotes.md +0 -0
  70. {datachain-0.14.5 → datachain-0.15.0}/docs/references/toolkit.md +0 -0
  71. {datachain-0.14.5 → datachain-0.15.0}/docs/references/torch.md +0 -0
  72. {datachain-0.14.5 → datachain-0.15.0}/docs/references/udf.md +0 -0
  73. {datachain-0.14.5 → datachain-0.15.0}/docs/tutorials.md +0 -0
  74. {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  75. {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  76. {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/openimage-detect.py +0 -0
  77. {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  78. {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  79. {datachain-0.14.5 → datachain-0.15.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  80. {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/common_sql_functions.py +0 -0
  81. {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/json-csv-reader.py +0 -0
  82. {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/torch-loader.py +0 -0
  83. {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/udfs/parallel.py +0 -0
  84. {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/udfs/simple.py +0 -0
  85. {datachain-0.14.5 → datachain-0.15.0}/examples/get_started/udfs/stateful.py +0 -0
  86. {datachain-0.14.5 → datachain-0.15.0}/examples/llm_and_nlp/claude-query.py +0 -0
  87. {datachain-0.14.5 → datachain-0.15.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  88. {datachain-0.14.5 → datachain-0.15.0}/examples/multimodal/clip_inference.py +0 -0
  89. {datachain-0.14.5 → datachain-0.15.0}/examples/multimodal/hf_pipeline.py +0 -0
  90. {datachain-0.14.5 → datachain-0.15.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  91. {datachain-0.14.5 → datachain-0.15.0}/examples/multimodal/wds.py +0 -0
  92. {datachain-0.14.5 → datachain-0.15.0}/mkdocs.yml +0 -0
  93. {datachain-0.14.5 → datachain-0.15.0}/noxfile.py +0 -0
  94. {datachain-0.14.5 → datachain-0.15.0}/pyproject.toml +0 -0
  95. {datachain-0.14.5 → datachain-0.15.0}/setup.cfg +0 -0
  96. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/__main__.py +0 -0
  97. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/asyn.py +0 -0
  98. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cache.py +0 -0
  99. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/catalog/__init__.py +0 -0
  100. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/catalog/datasource.py +0 -0
  101. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/__init__.py +0 -0
  102. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/__init__.py +0 -0
  103. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/datasets.py +0 -0
  104. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/du.py +0 -0
  105. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/index.py +0 -0
  106. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/ls.py +0 -0
  107. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/misc.py +0 -0
  108. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/query.py +0 -0
  109. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/commands/show.py +0 -0
  110. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/parser/__init__.py +0 -0
  111. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/parser/job.py +0 -0
  112. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/parser/studio.py +0 -0
  113. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/parser/utils.py +0 -0
  114. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/cli/utils.py +0 -0
  115. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/__init__.py +0 -0
  116. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/azure.py +0 -0
  117. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/fileslice.py +0 -0
  118. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/fsspec.py +0 -0
  119. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/gcs.py +0 -0
  120. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/hf.py +0 -0
  121. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/local.py +0 -0
  122. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/client/s3.py +0 -0
  123. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/config.py +0 -0
  124. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/__init__.py +0 -0
  125. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/db_engine.py +0 -0
  126. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/job.py +0 -0
  127. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/metastore.py +0 -0
  128. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/schema.py +0 -0
  129. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/serializer.py +0 -0
  130. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/sqlite.py +0 -0
  131. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/data_storage/warehouse.py +0 -0
  132. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/dataset.py +0 -0
  133. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/diff/__init__.py +0 -0
  134. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/error.py +0 -0
  135. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/fs/__init__.py +0 -0
  136. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/fs/reference.py +0 -0
  137. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/fs/utils.py +0 -0
  138. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/__init__.py +0 -0
  139. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/aggregate.py +0 -0
  140. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/array.py +0 -0
  141. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/base.py +0 -0
  142. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/conditional.py +0 -0
  143. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/func.py +0 -0
  144. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/numeric.py +0 -0
  145. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/path.py +0 -0
  146. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/random.py +0 -0
  147. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/string.py +0 -0
  148. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/func/window.py +0 -0
  149. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/job.py +0 -0
  150. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/__init__.py +0 -0
  151. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/arrow.py +0 -0
  152. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/clip.py +0 -0
  153. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/__init__.py +0 -0
  154. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/flatten.py +0 -0
  155. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  156. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  157. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/convert/unflatten.py +0 -0
  158. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/data_model.py +0 -0
  159. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dataset_info.py +0 -0
  160. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/csv.py +0 -0
  161. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/hf.py +0 -0
  162. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/json.py +0 -0
  163. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/listings.py +0 -0
  164. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/parquet.py +0 -0
  165. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/storage.py +0 -0
  166. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/utils.py +0 -0
  167. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/dc/values.py +0 -0
  168. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/file.py +0 -0
  169. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/hf.py +0 -0
  170. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/image.py +0 -0
  171. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/listing.py +0 -0
  172. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/listing_info.py +0 -0
  173. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/meta_formats.py +0 -0
  174. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/model_store.py +0 -0
  175. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/pytorch.py +0 -0
  176. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/settings.py +0 -0
  177. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/tar.py +0 -0
  178. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/text.py +0 -0
  179. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/udf.py +0 -0
  180. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/udf_signature.py +0 -0
  181. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/utils.py +0 -0
  182. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/video.py +0 -0
  183. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/webdataset.py +0 -0
  184. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/lib/webdataset_laion.py +0 -0
  185. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/listing.py +0 -0
  186. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/__init__.py +0 -0
  187. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/bbox.py +0 -0
  188. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/pose.py +0 -0
  189. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/segment.py +0 -0
  190. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  191. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  192. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/ultralytics/pose.py +0 -0
  193. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/ultralytics/segment.py +0 -0
  194. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/model/utils.py +0 -0
  195. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/node.py +0 -0
  196. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/nodes_fetcher.py +0 -0
  197. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/nodes_thread_pool.py +0 -0
  198. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/progress.py +0 -0
  199. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/py.typed +0 -0
  200. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/__init__.py +0 -0
  201. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/batch.py +0 -0
  202. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/metrics.py +0 -0
  203. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/params.py +0 -0
  204. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/queue.py +0 -0
  205. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/session.py +0 -0
  206. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/udf.py +0 -0
  207. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/query/utils.py +0 -0
  208. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/remote/__init__.py +0 -0
  209. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/remote/studio.py +0 -0
  210. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/script_meta.py +0 -0
  211. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/__init__.py +0 -0
  212. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/default/__init__.py +0 -0
  213. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/default/base.py +0 -0
  214. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/__init__.py +0 -0
  215. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/aggregate.py +0 -0
  216. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/array.py +0 -0
  217. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/conditional.py +0 -0
  218. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/numeric.py +0 -0
  219. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/path.py +0 -0
  220. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/random.py +0 -0
  221. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/functions/string.py +0 -0
  222. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/selectable.py +0 -0
  223. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  224. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/sqlite/base.py +0 -0
  225. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/sqlite/types.py +0 -0
  226. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/sqlite/vector.py +0 -0
  227. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/types.py +0 -0
  228. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/sql/utils.py +0 -0
  229. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/studio.py +0 -0
  230. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/telemetry.py +0 -0
  231. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/toolkit/__init__.py +0 -0
  232. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/toolkit/split.py +0 -0
  233. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/torch/__init__.py +0 -0
  234. {datachain-0.14.5 → datachain-0.15.0}/src/datachain/utils.py +0 -0
  235. {datachain-0.14.5 → datachain-0.15.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  236. {datachain-0.14.5 → datachain-0.15.0}/src/datachain.egg-info/entry_points.txt +0 -0
  237. {datachain-0.14.5 → datachain-0.15.0}/src/datachain.egg-info/requires.txt +0 -0
  238. {datachain-0.14.5 → datachain-0.15.0}/src/datachain.egg-info/top_level.txt +0 -0
  239. {datachain-0.14.5 → datachain-0.15.0}/tests/__init__.py +0 -0
  240. {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/__init__.py +0 -0
  241. {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/conftest.py +0 -0
  242. {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  243. {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  244. {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/datasets/.gitignore +0 -0
  245. {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  246. {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/test_datachain.py +0 -0
  247. {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/test_ls.py +0 -0
  248. {datachain-0.14.5 → datachain-0.15.0}/tests/benchmarks/test_version.py +0 -0
  249. {datachain-0.14.5 → datachain-0.15.0}/tests/data.py +0 -0
  250. {datachain-0.14.5 → datachain-0.15.0}/tests/examples/__init__.py +0 -0
  251. {datachain-0.14.5 → datachain-0.15.0}/tests/examples/test_examples.py +0 -0
  252. {datachain-0.14.5 → datachain-0.15.0}/tests/examples/test_wds_e2e.py +0 -0
  253. {datachain-0.14.5 → datachain-0.15.0}/tests/examples/wds_data.py +0 -0
  254. {datachain-0.14.5 → datachain-0.15.0}/tests/func/__init__.py +0 -0
  255. {datachain-0.14.5 → datachain-0.15.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  256. {datachain-0.14.5 → datachain-0.15.0}/tests/func/data/lena.jpg +0 -0
  257. {datachain-0.14.5 → datachain-0.15.0}/tests/func/fake-service-account-credentials.json +0 -0
  258. {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/__init__.py +0 -0
  259. {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/data/running-mask0.png +0 -0
  260. {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/data/running-mask1.png +0 -0
  261. {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/data/running.jpg +0 -0
  262. {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/data/ships.jpg +0 -0
  263. {datachain-0.14.5 → datachain-0.15.0}/tests/func/model/test_yolo.py +0 -0
  264. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_client.py +0 -0
  265. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_cloud_transfer.py +0 -0
  266. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_data_storage.py +0 -0
  267. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_datachain_merge.py +0 -0
  268. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_datasets.py +0 -0
  269. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_feature_pickling.py +0 -0
  270. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_file.py +0 -0
  271. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_hf.py +0 -0
  272. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_image.py +0 -0
  273. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_listing.py +0 -0
  274. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_ls.py +0 -0
  275. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_meta_formats.py +0 -0
  276. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_metrics.py +0 -0
  277. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_pytorch.py +0 -0
  278. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_query.py +0 -0
  279. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_session.py +0 -0
  280. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_toolkit.py +0 -0
  281. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_video.py +0 -0
  282. {datachain-0.14.5 → datachain-0.15.0}/tests/func/test_warehouse.py +0 -0
  283. {datachain-0.14.5 → datachain-0.15.0}/tests/scripts/feature_class.py +0 -0
  284. {datachain-0.14.5 → datachain-0.15.0}/tests/scripts/feature_class_exception.py +0 -0
  285. {datachain-0.14.5 → datachain-0.15.0}/tests/scripts/feature_class_parallel.py +0 -0
  286. {datachain-0.14.5 → datachain-0.15.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  287. {datachain-0.14.5 → datachain-0.15.0}/tests/scripts/name_len_slow.py +0 -0
  288. {datachain-0.14.5 → datachain-0.15.0}/tests/test_atomicity.py +0 -0
  289. {datachain-0.14.5 → datachain-0.15.0}/tests/test_cli_e2e.py +0 -0
  290. {datachain-0.14.5 → datachain-0.15.0}/tests/test_cli_studio.py +0 -0
  291. {datachain-0.14.5 → datachain-0.15.0}/tests/test_query_e2e.py +0 -0
  292. {datachain-0.14.5 → datachain-0.15.0}/tests/test_telemetry.py +0 -0
  293. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/__init__.py +0 -0
  294. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/__init__.py +0 -0
  295. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/conftest.py +0 -0
  296. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_arrow.py +0 -0
  297. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_clip.py +0 -0
  298. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  299. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  300. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_diff.py +0 -0
  301. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_feature.py +0 -0
  302. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_file.py +0 -0
  303. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_hf.py +0 -0
  304. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_image.py +0 -0
  305. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_listing_info.py +0 -0
  306. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  307. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_schema.py +0 -0
  308. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_signal_schema.py +0 -0
  309. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  310. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_text.py +0 -0
  311. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_udf_signature.py +0 -0
  312. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_utils.py +0 -0
  313. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/lib/test_webdataset.py +0 -0
  314. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/model/__init__.py +0 -0
  315. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/model/test_bbox.py +0 -0
  316. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/model/test_pose.py +0 -0
  317. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/model/test_segment.py +0 -0
  318. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/model/test_utils.py +0 -0
  319. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/__init__.py +0 -0
  320. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  321. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  322. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  323. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_array.py +0 -0
  324. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_conditional.py +0 -0
  325. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_path.py +0 -0
  326. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_random.py +0 -0
  327. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_selectable.py +0 -0
  328. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/sql/test_string.py +0 -0
  329. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_asyn.py +0 -0
  330. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_cache.py +0 -0
  331. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_catalog.py +0 -0
  332. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_cli_parsing.py +0 -0
  333. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_client.py +0 -0
  334. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_client_gcs.py +0 -0
  335. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_client_s3.py +0 -0
  336. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_config.py +0 -0
  337. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_data_storage.py +0 -0
  338. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_database_engine.py +0 -0
  339. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_dataset.py +0 -0
  340. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_dispatch.py +0 -0
  341. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_fileslice.py +0 -0
  342. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_func.py +0 -0
  343. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_listing.py +0 -0
  344. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_metastore.py +0 -0
  345. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_module_exports.py +0 -0
  346. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_pytorch.py +0 -0
  347. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_query.py +0 -0
  348. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_query_metrics.py +0 -0
  349. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_query_params.py +0 -0
  350. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_script_meta.py +0 -0
  351. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_serializer.py +0 -0
  352. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_session.py +0 -0
  353. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_utils.py +0 -0
  354. {datachain-0.14.5 → datachain-0.15.0}/tests/unit/test_warehouse.py +0 -0
  355. {datachain-0.14.5 → datachain-0.15.0}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.11.4'
27
+ rev: 'v0.11.5'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.5
3
+ Version: 0.15.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -31,6 +31,10 @@ for examples of how to create a chain.
31
31
 
32
32
  ::: datachain.lib.dc.values.read_values
33
33
 
34
+ ::: datachain.lib.dc.database.read_database
35
+
36
+ ::: datachain.lib.dc.database.ConnectionType
37
+
34
38
  ::: datachain.lib.dc.DataChain
35
39
 
36
40
  ::: datachain.lib.utils.DataChainError
@@ -27,7 +27,7 @@ filtered = (
27
27
  / func.least("laion.json.original_width", "laion.json.original_height")
28
28
  < 3.0
29
29
  )
30
- .save()
30
+ .persist()
31
31
  )
32
32
 
33
33
  filtered.show(3)
@@ -5,8 +5,10 @@ from datachain.lib.dc import (
5
5
  DataChain,
6
6
  Sys,
7
7
  datasets,
8
+ delete_dataset,
8
9
  listings,
9
10
  read_csv,
11
+ read_database,
10
12
  read_dataset,
11
13
  read_hf,
12
14
  read_json,
@@ -61,11 +63,13 @@ __all__ = [
61
63
  "VideoFragment",
62
64
  "VideoFrame",
63
65
  "datasets",
66
+ "delete_dataset",
64
67
  "is_chain_type",
65
68
  "listings",
66
69
  "metrics",
67
70
  "param",
68
71
  "read_csv",
72
+ "read_database",
69
73
  "read_dataset",
70
74
  "read_hf",
71
75
  "read_json",
@@ -1299,7 +1299,17 @@ class Catalog:
1299
1299
  name: str,
1300
1300
  version: Optional[int] = None,
1301
1301
  force: Optional[bool] = False,
1302
+ studio: Optional[bool] = False,
1302
1303
  ):
1304
+ from datachain.remote.studio import StudioClient
1305
+
1306
+ if studio:
1307
+ client = StudioClient()
1308
+ response = client.rm_dataset(name, version=version, force=force)
1309
+ if not response.ok:
1310
+ raise DataChainError(response.message)
1311
+ return
1312
+
1303
1313
  dataset = self.get_dataset(name)
1304
1314
  if not version and not force:
1305
1315
  raise ValueError(f"Missing dataset version from input for dataset {name}")
@@ -1,4 +1,5 @@
1
1
  import os
2
+ import sys
2
3
  from importlib import import_module
3
4
  from typing import TYPE_CHECKING, Any, Optional
4
5
 
@@ -15,6 +16,7 @@ METASTORE_ARG_PREFIX = "DATACHAIN_METASTORE_ARG_"
15
16
  WAREHOUSE_SERIALIZED = "DATACHAIN__WAREHOUSE"
16
17
  WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
17
18
  WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
19
+ DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
18
20
  DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
19
21
 
20
22
  IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
@@ -100,19 +102,21 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
100
102
  return warehouse_class(**warehouse_args)
101
103
 
102
104
 
103
- def get_udf_distributor_class() -> type["AbstractUDFDistributor"]:
104
- distributed_import_path = os.environ.get(DISTRIBUTED_IMPORT_PATH)
105
+ def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
106
+ if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
107
+ return None
105
108
 
106
- if not distributed_import_path:
107
- raise RuntimeError(
108
- f"{DISTRIBUTED_IMPORT_PATH} import path is required "
109
- "for distributed UDF processing."
110
- )
111
109
  # Distributed class paths are specified as (for example): module.classname
112
110
  if "." not in distributed_import_path:
113
111
  raise RuntimeError(
114
112
  f"Invalid {DISTRIBUTED_IMPORT_PATH} import path: {distributed_import_path}"
115
113
  )
114
+
115
+ # Optional: set the Python path to look for the module
116
+ distributed_import_pythonpath = os.environ.get(DISTRIBUTED_IMPORT_PYTHONPATH)
117
+ if distributed_import_pythonpath and distributed_import_pythonpath not in sys.path:
118
+ sys.path.insert(0, distributed_import_pythonpath)
119
+
116
120
  module_name, _, class_name = distributed_import_path.rpartition(".")
117
121
  distributed = import_module(module_name)
118
122
  return getattr(distributed, class_name)
@@ -1,5 +1,6 @@
1
+ import itertools
1
2
  from collections.abc import Sequence
2
- from typing import Any, Union
3
+ from typing import Any, Optional, Union
3
4
 
4
5
  from datachain.lib.data_model import (
5
6
  DataType,
@@ -66,21 +67,29 @@ def values_to_tuples( # noqa: C901, PLR0912
66
67
  f"signal '{k}' is not present in the output",
67
68
  )
68
69
  else:
69
- if len_ == 0:
70
- raise ValuesToTupleError(ds_name, f"signal '{k}' is empty list")
71
-
72
- first_element = next(iter(v))
73
- typ = type(first_element)
74
- if not is_chain_type(typ):
75
- raise ValuesToTupleError(
76
- ds_name,
77
- f"signal '{k}' has unsupported type '{typ.__name__}'."
78
- f" Please use DataModel types: {DataTypeNames}",
70
+ # FIXME: Stops as soon as it finds the first non-None value.
71
+ # If a non-None value appears early, it won't check the remaining items for
72
+ # `None` values.
73
+ try:
74
+ pos, first_not_none_element = next(
75
+ itertools.dropwhile(lambda pair: pair[1] is None, enumerate(v))
79
76
  )
80
- if isinstance(first_element, list):
81
- types_map[k] = list[type(first_element[0])] # type: ignore[assignment, misc]
77
+ except StopIteration:
78
+ typ = str # default to str if all values are None or has length 0
79
+ nullable = True
82
80
  else:
83
- types_map[k] = typ
81
+ nullable = pos > 0
82
+ typ = type(first_not_none_element) # type: ignore[assignment]
83
+ if not is_chain_type(typ):
84
+ raise ValuesToTupleError(
85
+ ds_name,
86
+ f"signal '{k}' has unsupported type '{typ.__name__}'."
87
+ f" Please use DataModel types: {DataTypeNames}",
88
+ )
89
+ if isinstance(first_not_none_element, list):
90
+ typ = list[type(first_not_none_element[0])] # type: ignore[assignment, misc]
91
+
92
+ types_map[k] = Optional[typ] if nullable else typ # type: ignore[assignment]
84
93
 
85
94
  if length < 0:
86
95
  length = len_
@@ -1,6 +1,7 @@
1
1
  from .csv import read_csv
2
+ from .database import read_database
2
3
  from .datachain import C, Column, DataChain
3
- from .datasets import datasets, read_dataset
4
+ from .datasets import datasets, delete_dataset, read_dataset
4
5
  from .hf import read_hf
5
6
  from .json import read_json
6
7
  from .listings import listings
@@ -19,8 +20,10 @@ __all__ = [
19
20
  "DatasetPrepareError",
20
21
  "Sys",
21
22
  "datasets",
23
+ "delete_dataset",
22
24
  "listings",
23
25
  "read_csv",
26
+ "read_database",
24
27
  "read_dataset",
25
28
  "read_hf",
26
29
  "read_json",
@@ -0,0 +1,151 @@
1
+ import contextlib
2
+ import itertools
3
+ import os
4
+ import sqlite3
5
+ from typing import TYPE_CHECKING, Any, Optional, Union
6
+
7
+ import sqlalchemy
8
+
9
+ if TYPE_CHECKING:
10
+ from collections.abc import Iterator, Mapping, Sequence
11
+
12
+ import sqlalchemy.orm # noqa: TC004
13
+
14
+ from datachain.lib.data_model import DataType
15
+ from datachain.query import Session
16
+
17
+ from .datachain import DataChain
18
+
19
+ ConnectionType = Union[
20
+ str,
21
+ sqlalchemy.engine.URL,
22
+ sqlalchemy.engine.interfaces.Connectable,
23
+ sqlalchemy.engine.Engine,
24
+ sqlalchemy.engine.Connection,
25
+ sqlalchemy.orm.Session,
26
+ sqlite3.Connection,
27
+ ]
28
+
29
+
30
+ @contextlib.contextmanager
31
+ def _connect(
32
+ connection: "ConnectionType",
33
+ ) -> "Iterator[Union[sqlalchemy.engine.Connection, sqlalchemy.orm.Session]]":
34
+ import sqlalchemy.orm
35
+
36
+ with contextlib.ExitStack() as stack:
37
+ engine_kwargs = {"echo": bool(os.environ.get("DEBUG_SHOW_SQL_QUERIES"))}
38
+ if isinstance(connection, (str, sqlalchemy.URL)):
39
+ engine = sqlalchemy.create_engine(connection, **engine_kwargs)
40
+ stack.callback(engine.dispose)
41
+ yield stack.enter_context(engine.connect())
42
+ elif isinstance(connection, sqlite3.Connection):
43
+ engine = sqlalchemy.create_engine(
44
+ "sqlite://", creator=lambda: connection, **engine_kwargs
45
+ )
46
+ # do not close the connection, as it is managed by the caller
47
+ yield engine.connect()
48
+ elif isinstance(connection, sqlalchemy.Engine):
49
+ yield stack.enter_context(connection.connect())
50
+ elif isinstance(connection, (sqlalchemy.Connection, sqlalchemy.orm.Session)):
51
+ # do not close the connection, as it is managed by the caller
52
+ yield connection
53
+ else:
54
+ raise TypeError(f"Unsupported connection type: {type(connection).__name__}")
55
+
56
+
57
+ def _infer_schema(
58
+ result: "sqlalchemy.engine.Result",
59
+ to_infer: list[str],
60
+ infer_schema_length: Optional[int] = 100,
61
+ ) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
62
+ from datachain.lib.convert.values_to_tuples import values_to_tuples
63
+
64
+ if not to_infer:
65
+ return [], {}
66
+
67
+ rows = list(itertools.islice(result, infer_schema_length))
68
+ values = {col: [row._mapping[col] for row in rows] for col in to_infer}
69
+ _, output_schema, _ = values_to_tuples("", **values)
70
+ return rows, output_schema
71
+
72
+
73
+ def read_database(
74
+ query: Union[str, "sqlalchemy.sql.expression.Executable"],
75
+ connection: "ConnectionType",
76
+ params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
77
+ *,
78
+ output: Optional["dict[str, DataType]"] = None,
79
+ session: Optional["Session"] = None,
80
+ settings: Optional[dict] = None,
81
+ in_memory: bool = False,
82
+ infer_schema_length: Optional[int] = 100,
83
+ ) -> "DataChain":
84
+ """
85
+ Read the results of a SQL query into a DataChain, using a given database connection.
86
+
87
+ Args:
88
+ query:
89
+ The SQL query to execute. Can be a raw SQL string or a SQLAlchemy
90
+ `Executable` object.
91
+ connection: SQLAlchemy connectable, str, or a sqlite3 connection
92
+ Using SQLAlchemy makes it possible to use any DB supported by that
93
+ library. If a DBAPI2 object, only sqlite3 is supported. The user is
94
+ responsible for engine disposal and connection closure for the
95
+ SQLAlchemy connectable; str connections are closed automatically.
96
+ params: Parameters to pass to execute method.
97
+ output: A dictionary mapping column names to types, used to override the
98
+ schema inferred from the query results.
99
+ session: Session to use for the chain.
100
+ settings: Settings to use for the chain.
101
+ in_memory: If True, creates an in-memory session. Defaults to False.
102
+ infer_schema_length:
103
+ The maximum number of rows to scan for inferring schema.
104
+ If set to `None`, the full data may be scanned.
105
+ The rows used for schema inference are stored in memory,
106
+ so large values can lead to high memory usage.
107
+ Only applies if the `output` parameter is not set for the given column.
108
+
109
+ Examples:
110
+ Reading from a SQL query against a user-supplied connection:
111
+ ```python
112
+ query = "SELECT key, value FROM tbl"
113
+ chain = dc.read_database(query, connection, output={"value": float})
114
+ ```
115
+
116
+ Load data from a SQLAlchemy driver/engine:
117
+ ```python
118
+ from sqlalchemy import create_engine
119
+ engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
120
+ chain = dc.read_database("select * from tbl", engine)
121
+ ```
122
+
123
+ Load data from a parameterized SQLAlchemy query:
124
+ ```python
125
+ query = "SELECT key, value FROM tbl WHERE value > :value"
126
+ dc.read_database(query, engine, params={"value": 50})
127
+ ```
128
+
129
+ Notes:
130
+ This function works with a variety of databases — including, but not limited to,
131
+ SQLite, DuckDB, PostgreSQL, and Snowflake, provided the appropriate driver is
132
+ installed.
133
+ """
134
+ from datachain.lib.dc.records import read_records
135
+
136
+ output = output or {}
137
+ if isinstance(query, str):
138
+ query = sqlalchemy.text(query)
139
+ kw = {"execution_options": {"stream_results": True}} # use server-side cursors
140
+ with _connect(connection) as conn, conn.execute(query, params, **kw) as result:
141
+ cols = result.keys()
142
+ to_infer = [k for k in cols if k not in output] # preserve the order
143
+ rows, inferred_schema = _infer_schema(result, to_infer, infer_schema_length)
144
+ records = (row._asdict() for row in itertools.chain(rows, result))
145
+ return read_records(
146
+ records,
147
+ session=session,
148
+ settings=settings,
149
+ in_memory=in_memory,
150
+ schema=inferred_schema | output,
151
+ )
@@ -133,7 +133,7 @@ class DataChain:
133
133
  .choices[0]
134
134
  .message.content,
135
135
  )
136
- .save()
136
+ .persist()
137
137
  )
138
138
 
139
139
  try:
@@ -443,9 +443,20 @@ class DataChain:
443
443
  )
444
444
  return listings(*args, **kwargs)
445
445
 
446
+ def persist(self) -> "Self":
447
+ """Saves temporary chain that will be removed after the process ends.
448
+ Temporary datasets are useful for optimization, for example when we have
449
+ multiple chains starting with identical sub-chain. We can then persist that
450
+ common chain and use it to calculate other chains, to avoid re-calculation
451
+ every time.
452
+ It returns the chain itself.
453
+ """
454
+ schema = self.signals_schema.clone_without_sys_signals().serialize()
455
+ return self._evolve(query=self._query.save(feature_schema=schema))
456
+
446
457
  def save( # type: ignore[override]
447
458
  self,
448
- name: Optional[str] = None,
459
+ name: str,
449
460
  version: Optional[int] = None,
450
461
  description: Optional[str] = None,
451
462
  labels: Optional[list[str]] = None,
@@ -454,8 +465,7 @@ class DataChain:
454
465
  """Save to a Dataset. It returns the chain itself.
455
466
 
456
467
  Parameters:
457
- name : dataset name. Empty name saves to a temporary dataset that will be
458
- removed after process ends. Temp dataset are useful for optimization.
468
+ name : dataset name.
459
469
  version : version of a dataset. Default - the last version that exist.
460
470
  description : description of a dataset.
461
471
  labels : labels of a dataset.
@@ -1112,7 +1122,7 @@ class DataChain:
1112
1122
  if self._query.attached:
1113
1123
  chain = self
1114
1124
  else:
1115
- chain = self.save()
1125
+ chain = self.persist()
1116
1126
  assert chain.name is not None # for mypy
1117
1127
  return PytorchDataset(
1118
1128
  chain.name,
@@ -166,3 +166,46 @@ def datasets(
166
166
  output={column: DatasetInfo},
167
167
  **{column: datasets_values}, # type: ignore[arg-type]
168
168
  )
169
+
170
+
171
+ def delete_dataset(
172
+ name: str,
173
+ version: Optional[int] = None,
174
+ force: Optional[bool] = False,
175
+ studio: Optional[bool] = False,
176
+ session: Optional[Session] = None,
177
+ in_memory: bool = False,
178
+ ) -> None:
179
+ """Removes specific dataset version or all dataset versions, depending on
180
+ a force flag.
181
+
182
+ Args:
183
+ name : Dataset name
184
+ version : Optional dataset version
185
+ force: If true, all datasets versions will be removed. Defaults to False.
186
+ studio: If True, removes dataset from Studio only,
187
+ otherwise remove from local. Defaults to False.
188
+ session: Optional session instance. If not provided, uses default session.
189
+ in_memory: If True, creates an in-memory session. Defaults to False.
190
+
191
+ Returns: None
192
+
193
+ Example:
194
+ ```py
195
+ import datachain as dc
196
+ dc.delete_dataset("cats")
197
+ ```
198
+
199
+ ```py
200
+ import datachain as dc
201
+ dc.delete_dataset("cats", version=1)
202
+ ```
203
+ """
204
+
205
+ session = Session.get(session, in_memory=in_memory)
206
+ catalog = session.catalog
207
+ if not force:
208
+ version = version or catalog.get_dataset(name).latest_version
209
+ else:
210
+ version = None
211
+ catalog.remove_dataset(name, version=version, force=force, studio=studio)
@@ -37,7 +37,14 @@ def read_pandas( # type: ignore[override]
37
37
  """
38
38
  from .utils import DatasetPrepareError
39
39
 
40
- fr_map = {col.lower(): df[col].tolist() for col in df.columns}
40
+ def get_col_name(col):
41
+ if isinstance(col, tuple):
42
+ # Join tuple elements with underscore for MultiIndex columns
43
+ return "_".join(map(str, col)).lower()
44
+ # Handle regular string column names
45
+ return str(col).lower()
46
+
47
+ fr_map = {get_col_name(col): df[col].tolist() for col in df.columns}
41
48
 
42
49
  for c in fr_map:
43
50
  if not c.isidentifier():
@@ -1,8 +1,5 @@
1
- from typing import (
2
- TYPE_CHECKING,
3
- Optional,
4
- Union,
5
- )
1
+ from collections.abc import Iterable
2
+ from typing import TYPE_CHECKING, Optional, Union
6
3
 
7
4
  import sqlalchemy
8
5
 
@@ -12,6 +9,7 @@ from datachain.lib.file import (
12
9
  )
13
10
  from datachain.lib.signal_schema import SignalSchema
14
11
  from datachain.query import Session
12
+ from datachain.query.schema import Column
15
13
 
16
14
  if TYPE_CHECKING:
17
15
  from typing_extensions import ParamSpec
@@ -22,7 +20,7 @@ if TYPE_CHECKING:
22
20
 
23
21
 
24
22
  def read_records(
25
- to_insert: Optional[Union[dict, list[dict]]],
23
+ to_insert: Optional[Union[dict, Iterable[dict]]],
26
24
  session: Optional[Session] = None,
27
25
  settings: Optional[dict] = None,
28
26
  in_memory: bool = False,
@@ -54,10 +52,11 @@ def read_records(
54
52
 
55
53
  if schema:
56
54
  signal_schema = SignalSchema(schema)
57
- columns = [
58
- sqlalchemy.Column(c.name, c.type) # type: ignore[union-attr]
59
- for c in signal_schema.db_signals(as_columns=True) # type: ignore[assignment]
60
- ]
55
+ columns = []
56
+ for c in signal_schema.db_signals(as_columns=True):
57
+ assert isinstance(c, Column)
58
+ kw = {"nullable": c.nullable} if c.nullable is not None else {}
59
+ columns.append(sqlalchemy.Column(c.name, c.type, **kw))
61
60
  else:
62
61
  columns = [
63
62
  sqlalchemy.Column(name, typ)
@@ -83,8 +82,7 @@ def read_records(
83
82
 
84
83
  warehouse = catalog.warehouse
85
84
  dr = warehouse.dataset_rows(dsr)
86
- db = warehouse.db
87
- insert_q = dr.get_table().insert()
88
- for record in to_insert:
89
- db.execute(insert_q.values(**record))
85
+ table = dr.get_table()
86
+ warehouse.insert_rows(table, to_insert)
87
+ warehouse.insert_rows_done(table)
90
88
  return read_dataset(name=dsr.name, session=session, settings=settings)
@@ -581,7 +581,11 @@ class SignalSchema:
581
581
  signals = [
582
582
  DEFAULT_DELIMITER.join(path)
583
583
  if not as_columns
584
- else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
584
+ else Column(
585
+ DEFAULT_DELIMITER.join(path),
586
+ python_to_sql(_type),
587
+ nullable=is_optional(_type),
588
+ )
585
589
  for path, _type, has_subtree, _ in self.get_flat_tree(
586
590
  include_hidden=include_hidden
587
591
  )
@@ -990,3 +994,8 @@ class SignalSchema:
990
994
  }
991
995
 
992
996
  return SignalSchema.deserialize(schema)
997
+
998
+
999
+ def is_optional(type_: Any) -> bool:
1000
+ """Check if a type is Optional."""
1001
+ return get_origin(type_) is Union and type(None) in get_args(type_)
@@ -437,9 +437,17 @@ class UDFStep(Step, ABC):
437
437
  "distributed processing."
438
438
  )
439
439
 
440
- from datachain.catalog.loader import get_udf_distributor_class
440
+ from datachain.catalog.loader import (
441
+ DISTRIBUTED_IMPORT_PATH,
442
+ get_udf_distributor_class,
443
+ )
444
+
445
+ if not (udf_distributor_class := get_udf_distributor_class()):
446
+ raise RuntimeError(
447
+ f"{DISTRIBUTED_IMPORT_PATH} import path is required "
448
+ "for distributed UDF processing."
449
+ )
441
450
 
442
- udf_distributor_class = get_udf_distributor_class()
443
451
  udf_distributor = udf_distributor_class(
444
452
  catalog=catalog,
445
453
  table=udf_table,
@@ -1162,16 +1170,6 @@ class DatasetQuery:
1162
1170
  )
1163
1171
  return sqlalchemy.table(table_name)
1164
1172
 
1165
- @staticmethod
1166
- def delete(
1167
- name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
1168
- ) -> None:
1169
- from datachain.catalog import get_catalog
1170
-
1171
- catalog = catalog or get_catalog()
1172
- version = version or catalog.get_dataset(name).latest_version
1173
- catalog.remove_dataset(name, version)
1174
-
1175
1173
  @property
1176
1174
  def attached(self) -> bool:
1177
1175
  """
@@ -13,7 +13,7 @@ from multiprocess import get_context
13
13
 
14
14
  from datachain.catalog import Catalog
15
15
  from datachain.catalog.catalog import clone_catalog_with_cache
16
- from datachain.catalog.loader import get_udf_distributor_class
16
+ from datachain.catalog.loader import DISTRIBUTED_IMPORT_PATH, get_udf_distributor_class
17
17
  from datachain.lib.udf import _get_cache
18
18
  from datachain.query.batch import RowsOutput, RowsOutputBatch
19
19
  from datachain.query.dataset import (
@@ -91,7 +91,12 @@ def udf_entrypoint() -> int:
91
91
 
92
92
 
93
93
  def udf_worker_entrypoint() -> int:
94
- return get_udf_distributor_class().run_worker()
94
+ if not (udf_distributor_class := get_udf_distributor_class()):
95
+ raise RuntimeError(
96
+ f"{DISTRIBUTED_IMPORT_PATH} import path is required "
97
+ "for distributed UDF processing."
98
+ )
99
+ return udf_distributor_class.run_worker()
95
100
 
96
101
 
97
102
  class UDFDispatcher:
@@ -40,12 +40,15 @@ class ColumnMeta(type):
40
40
  class Column(sa.ColumnClause, metaclass=ColumnMeta):
41
41
  inherit_cache: Optional[bool] = True
42
42
 
43
- def __init__(self, text, type_=None, is_literal=False, _selectable=None):
43
+ def __init__(
44
+ self, text, type_=None, is_literal=False, nullable=None, _selectable=None
45
+ ):
44
46
  """Dataset column."""
45
47
  self.name = ColumnMeta.to_db_name(text)
46
48
  super().__init__(
47
49
  self.name, type_=type_, is_literal=is_literal, _selectable=_selectable
48
50
  )
51
+ self.nullable = nullable
49
52
 
50
53
  def __getattr__(self, name: str):
51
54
  return Column(self.name + DEFAULT_DELIMITER + name)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.5
3
+ Version: 0.15.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -168,6 +168,7 @@ src/datachain/lib/convert/unflatten.py
168
168
  src/datachain/lib/convert/values_to_tuples.py
169
169
  src/datachain/lib/dc/__init__.py
170
170
  src/datachain/lib/dc/csv.py
171
+ src/datachain/lib/dc/database.py
171
172
  src/datachain/lib/dc/datachain.py
172
173
  src/datachain/lib/dc/datasets.py
173
174
  src/datachain/lib/dc/hf.py
@@ -267,6 +268,7 @@ tests/func/test_metrics.py
267
268
  tests/func/test_pull.py
268
269
  tests/func/test_pytorch.py
269
270
  tests/func/test_query.py
271
+ tests/func/test_read_database.py
270
272
  tests/func/test_session.py
271
273
  tests/func/test_toolkit.py
272
274
  tests/func/test_video.py
@@ -631,10 +631,13 @@ def dataset_rows():
631
631
 
632
632
 
633
633
  @pytest.fixture
634
- def studio_datasets(requests_mock):
634
+ def studio_token():
635
635
  with Config(ConfigLevel.GLOBAL).edit() as conf:
636
636
  conf["studio"] = {"token": "isat_access_token", "team": "team_name"}
637
637
 
638
+
639
+ @pytest.fixture
640
+ def studio_datasets(requests_mock, studio_token):
638
641
  common_version_info = {
639
642
  "status": 1,
640
643
  "created_at": "2024-02-23T10:42:31.842944+00:00",