datachain 0.14.5__tar.gz → 0.16.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. See the registry's release page for more details.

Files changed (355)
  1. {datachain-0.14.5 → datachain-0.16.0}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.14.5/src/datachain.egg-info → datachain-0.16.0}/PKG-INFO +1 -1
  3. {datachain-0.14.5 → datachain-0.16.0}/docs/references/datachain.md +4 -0
  4. {datachain-0.14.5 → datachain-0.16.0}/examples/multimodal/wds_filtered.py +1 -1
  5. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/__init__.py +4 -0
  6. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/catalog/catalog.py +19 -9
  7. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/catalog/loader.py +11 -7
  8. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/__init__.py +1 -1
  9. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/datasets.py +3 -3
  10. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/show.py +2 -2
  11. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/parser/__init__.py +2 -2
  12. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/metastore.py +5 -5
  13. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/dataset.py +8 -8
  14. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/values_to_tuples.py +23 -14
  15. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dataset_info.py +18 -0
  16. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/__init__.py +4 -1
  17. datachain-0.16.0/src/datachain/lib/dc/database.py +151 -0
  18. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/datachain.py +19 -8
  19. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/datasets.py +52 -0
  20. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/pandas.py +8 -1
  21. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/records.py +12 -14
  22. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/signal_schema.py +10 -1
  23. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/udf.py +2 -1
  24. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/dataset.py +12 -14
  25. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/dispatch.py +7 -2
  26. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/schema.py +4 -1
  27. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/remote/studio.py +2 -2
  28. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/studio.py +2 -2
  29. {datachain-0.14.5 → datachain-0.16.0/src/datachain.egg-info}/PKG-INFO +1 -1
  30. {datachain-0.14.5 → datachain-0.16.0}/src/datachain.egg-info/SOURCES.txt +2 -0
  31. {datachain-0.14.5 → datachain-0.16.0}/tests/conftest.py +11 -8
  32. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_catalog.py +3 -3
  33. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_datachain.py +32 -8
  34. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_dataset_query.py +0 -60
  35. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_datasets.py +7 -7
  36. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_hidden_field.py +1 -1
  37. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_pull.py +10 -8
  38. datachain-0.16.0/tests/func/test_read_database.py +175 -0
  39. {datachain-0.14.5 → datachain-0.16.0}/tests/test_cli_studio.py +4 -4
  40. {datachain-0.14.5 → datachain-0.16.0}/tests/test_import_time.py +1 -1
  41. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_datachain.py +118 -2
  42. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_feature_utils.py +0 -5
  43. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_catalog_loader.py +21 -10
  44. {datachain-0.14.5 → datachain-0.16.0}/.cruft.json +0 -0
  45. {datachain-0.14.5 → datachain-0.16.0}/.gitattributes +0 -0
  46. {datachain-0.14.5 → datachain-0.16.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  47. {datachain-0.14.5 → datachain-0.16.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  48. {datachain-0.14.5 → datachain-0.16.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  49. {datachain-0.14.5 → datachain-0.16.0}/.github/codecov.yaml +0 -0
  50. {datachain-0.14.5 → datachain-0.16.0}/.github/dependabot.yml +0 -0
  51. {datachain-0.14.5 → datachain-0.16.0}/.github/workflows/benchmarks.yml +0 -0
  52. {datachain-0.14.5 → datachain-0.16.0}/.github/workflows/release.yml +0 -0
  53. {datachain-0.14.5 → datachain-0.16.0}/.github/workflows/tests-studio.yml +0 -0
  54. {datachain-0.14.5 → datachain-0.16.0}/.github/workflows/tests.yml +0 -0
  55. {datachain-0.14.5 → datachain-0.16.0}/.github/workflows/update-template.yaml +0 -0
  56. {datachain-0.14.5 → datachain-0.16.0}/.gitignore +0 -0
  57. {datachain-0.14.5 → datachain-0.16.0}/CODE_OF_CONDUCT.rst +0 -0
  58. {datachain-0.14.5 → datachain-0.16.0}/LICENSE +0 -0
  59. {datachain-0.14.5 → datachain-0.16.0}/README.rst +0 -0
  60. {datachain-0.14.5 → datachain-0.16.0}/docs/assets/captioned_cartoons.png +0 -0
  61. {datachain-0.14.5 → datachain-0.16.0}/docs/assets/datachain-white.svg +0 -0
  62. {datachain-0.14.5 → datachain-0.16.0}/docs/assets/datachain.svg +0 -0
  63. {datachain-0.14.5 → datachain-0.16.0}/docs/contributing.md +0 -0
  64. {datachain-0.14.5 → datachain-0.16.0}/docs/css/github-permalink-style.css +0 -0
  65. {datachain-0.14.5 → datachain-0.16.0}/docs/examples.md +0 -0
  66. {datachain-0.14.5 → datachain-0.16.0}/docs/index.md +0 -0
  67. {datachain-0.14.5 → datachain-0.16.0}/docs/overrides/main.html +0 -0
  68. {datachain-0.14.5 → datachain-0.16.0}/docs/quick-start.md +0 -0
  69. {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/arrowrow.md +0 -0
  70. {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/bbox.md +0 -0
  71. {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/file.md +0 -0
  72. {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/imagefile.md +0 -0
  73. {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/index.md +0 -0
  74. {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/pose.md +0 -0
  75. {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/segment.md +0 -0
  76. {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/tarvfile.md +0 -0
  77. {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/textfile.md +0 -0
  78. {datachain-0.14.5 → datachain-0.16.0}/docs/references/data-types/videofile.md +0 -0
  79. {datachain-0.14.5 → datachain-0.16.0}/docs/references/func.md +0 -0
  80. {datachain-0.14.5 → datachain-0.16.0}/docs/references/index.md +0 -0
  81. {datachain-0.14.5 → datachain-0.16.0}/docs/references/remotes.md +0 -0
  82. {datachain-0.14.5 → datachain-0.16.0}/docs/references/toolkit.md +0 -0
  83. {datachain-0.14.5 → datachain-0.16.0}/docs/references/torch.md +0 -0
  84. {datachain-0.14.5 → datachain-0.16.0}/docs/references/udf.md +0 -0
  85. {datachain-0.14.5 → datachain-0.16.0}/docs/tutorials.md +0 -0
  86. {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  87. {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  88. {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/openimage-detect.py +0 -0
  89. {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  90. {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  91. {datachain-0.14.5 → datachain-0.16.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  92. {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/common_sql_functions.py +0 -0
  93. {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/json-csv-reader.py +0 -0
  94. {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/torch-loader.py +0 -0
  95. {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/udfs/parallel.py +0 -0
  96. {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/udfs/simple.py +0 -0
  97. {datachain-0.14.5 → datachain-0.16.0}/examples/get_started/udfs/stateful.py +0 -0
  98. {datachain-0.14.5 → datachain-0.16.0}/examples/llm_and_nlp/claude-query.py +0 -0
  99. {datachain-0.14.5 → datachain-0.16.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  100. {datachain-0.14.5 → datachain-0.16.0}/examples/multimodal/clip_inference.py +0 -0
  101. {datachain-0.14.5 → datachain-0.16.0}/examples/multimodal/hf_pipeline.py +0 -0
  102. {datachain-0.14.5 → datachain-0.16.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  103. {datachain-0.14.5 → datachain-0.16.0}/examples/multimodal/wds.py +0 -0
  104. {datachain-0.14.5 → datachain-0.16.0}/mkdocs.yml +0 -0
  105. {datachain-0.14.5 → datachain-0.16.0}/noxfile.py +0 -0
  106. {datachain-0.14.5 → datachain-0.16.0}/pyproject.toml +0 -0
  107. {datachain-0.14.5 → datachain-0.16.0}/setup.cfg +0 -0
  108. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/__main__.py +0 -0
  109. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/asyn.py +0 -0
  110. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cache.py +0 -0
  111. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/catalog/__init__.py +0 -0
  112. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/catalog/datasource.py +0 -0
  113. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/__init__.py +0 -0
  114. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/du.py +0 -0
  115. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/index.py +0 -0
  116. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/ls.py +0 -0
  117. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/misc.py +0 -0
  118. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/commands/query.py +0 -0
  119. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/parser/job.py +0 -0
  120. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/parser/studio.py +0 -0
  121. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/parser/utils.py +0 -0
  122. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/cli/utils.py +0 -0
  123. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/__init__.py +0 -0
  124. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/azure.py +0 -0
  125. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/fileslice.py +0 -0
  126. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/fsspec.py +0 -0
  127. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/gcs.py +0 -0
  128. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/hf.py +0 -0
  129. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/local.py +0 -0
  130. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/client/s3.py +0 -0
  131. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/config.py +0 -0
  132. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/__init__.py +0 -0
  133. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/db_engine.py +0 -0
  134. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/job.py +0 -0
  135. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/schema.py +0 -0
  136. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/serializer.py +0 -0
  137. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/sqlite.py +0 -0
  138. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/data_storage/warehouse.py +0 -0
  139. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/diff/__init__.py +0 -0
  140. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/error.py +0 -0
  141. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/fs/__init__.py +0 -0
  142. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/fs/reference.py +0 -0
  143. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/fs/utils.py +0 -0
  144. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/__init__.py +0 -0
  145. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/aggregate.py +0 -0
  146. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/array.py +0 -0
  147. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/base.py +0 -0
  148. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/conditional.py +0 -0
  149. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/func.py +0 -0
  150. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/numeric.py +0 -0
  151. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/path.py +0 -0
  152. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/random.py +0 -0
  153. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/string.py +0 -0
  154. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/func/window.py +0 -0
  155. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/job.py +0 -0
  156. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/__init__.py +0 -0
  157. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/arrow.py +0 -0
  158. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/clip.py +0 -0
  159. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/__init__.py +0 -0
  160. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/flatten.py +0 -0
  161. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  162. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  163. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/convert/unflatten.py +0 -0
  164. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/data_model.py +0 -0
  165. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/csv.py +0 -0
  166. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/hf.py +0 -0
  167. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/json.py +0 -0
  168. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/listings.py +0 -0
  169. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/parquet.py +0 -0
  170. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/storage.py +0 -0
  171. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/utils.py +0 -0
  172. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/dc/values.py +0 -0
  173. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/file.py +0 -0
  174. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/hf.py +0 -0
  175. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/image.py +0 -0
  176. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/listing.py +0 -0
  177. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/listing_info.py +0 -0
  178. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/meta_formats.py +0 -0
  179. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/model_store.py +0 -0
  180. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/pytorch.py +0 -0
  181. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/settings.py +0 -0
  182. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/tar.py +0 -0
  183. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/text.py +0 -0
  184. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/udf_signature.py +0 -0
  185. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/utils.py +0 -0
  186. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/video.py +0 -0
  187. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/webdataset.py +0 -0
  188. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/lib/webdataset_laion.py +0 -0
  189. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/listing.py +0 -0
  190. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/__init__.py +0 -0
  191. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/bbox.py +0 -0
  192. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/pose.py +0 -0
  193. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/segment.py +0 -0
  194. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  195. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  196. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/ultralytics/pose.py +0 -0
  197. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/ultralytics/segment.py +0 -0
  198. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/model/utils.py +0 -0
  199. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/node.py +0 -0
  200. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/nodes_fetcher.py +0 -0
  201. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/nodes_thread_pool.py +0 -0
  202. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/progress.py +0 -0
  203. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/py.typed +0 -0
  204. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/__init__.py +0 -0
  205. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/batch.py +0 -0
  206. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/metrics.py +0 -0
  207. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/params.py +0 -0
  208. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/queue.py +0 -0
  209. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/session.py +0 -0
  210. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/udf.py +0 -0
  211. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/query/utils.py +0 -0
  212. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/remote/__init__.py +0 -0
  213. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/script_meta.py +0 -0
  214. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/__init__.py +0 -0
  215. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/default/__init__.py +0 -0
  216. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/default/base.py +0 -0
  217. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/__init__.py +0 -0
  218. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/aggregate.py +0 -0
  219. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/array.py +0 -0
  220. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/conditional.py +0 -0
  221. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/numeric.py +0 -0
  222. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/path.py +0 -0
  223. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/random.py +0 -0
  224. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/functions/string.py +0 -0
  225. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/selectable.py +0 -0
  226. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  227. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/sqlite/base.py +0 -0
  228. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/sqlite/types.py +0 -0
  229. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/sqlite/vector.py +0 -0
  230. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/types.py +0 -0
  231. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/sql/utils.py +0 -0
  232. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/telemetry.py +0 -0
  233. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/toolkit/__init__.py +0 -0
  234. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/toolkit/split.py +0 -0
  235. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/torch/__init__.py +0 -0
  236. {datachain-0.14.5 → datachain-0.16.0}/src/datachain/utils.py +0 -0
  237. {datachain-0.14.5 → datachain-0.16.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  238. {datachain-0.14.5 → datachain-0.16.0}/src/datachain.egg-info/entry_points.txt +0 -0
  239. {datachain-0.14.5 → datachain-0.16.0}/src/datachain.egg-info/requires.txt +0 -0
  240. {datachain-0.14.5 → datachain-0.16.0}/src/datachain.egg-info/top_level.txt +0 -0
  241. {datachain-0.14.5 → datachain-0.16.0}/tests/__init__.py +0 -0
  242. {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/__init__.py +0 -0
  243. {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/conftest.py +0 -0
  244. {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  245. {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  246. {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/datasets/.gitignore +0 -0
  247. {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  248. {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/test_datachain.py +0 -0
  249. {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/test_ls.py +0 -0
  250. {datachain-0.14.5 → datachain-0.16.0}/tests/benchmarks/test_version.py +0 -0
  251. {datachain-0.14.5 → datachain-0.16.0}/tests/data.py +0 -0
  252. {datachain-0.14.5 → datachain-0.16.0}/tests/examples/__init__.py +0 -0
  253. {datachain-0.14.5 → datachain-0.16.0}/tests/examples/test_examples.py +0 -0
  254. {datachain-0.14.5 → datachain-0.16.0}/tests/examples/test_wds_e2e.py +0 -0
  255. {datachain-0.14.5 → datachain-0.16.0}/tests/examples/wds_data.py +0 -0
  256. {datachain-0.14.5 → datachain-0.16.0}/tests/func/__init__.py +0 -0
  257. {datachain-0.14.5 → datachain-0.16.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  258. {datachain-0.14.5 → datachain-0.16.0}/tests/func/data/lena.jpg +0 -0
  259. {datachain-0.14.5 → datachain-0.16.0}/tests/func/fake-service-account-credentials.json +0 -0
  260. {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/__init__.py +0 -0
  261. {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/data/running-mask0.png +0 -0
  262. {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/data/running-mask1.png +0 -0
  263. {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/data/running.jpg +0 -0
  264. {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/data/ships.jpg +0 -0
  265. {datachain-0.14.5 → datachain-0.16.0}/tests/func/model/test_yolo.py +0 -0
  266. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_client.py +0 -0
  267. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_cloud_transfer.py +0 -0
  268. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_data_storage.py +0 -0
  269. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_datachain_merge.py +0 -0
  270. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_feature_pickling.py +0 -0
  271. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_file.py +0 -0
  272. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_hf.py +0 -0
  273. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_image.py +0 -0
  274. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_listing.py +0 -0
  275. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_ls.py +0 -0
  276. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_meta_formats.py +0 -0
  277. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_metrics.py +0 -0
  278. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_pytorch.py +0 -0
  279. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_query.py +0 -0
  280. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_session.py +0 -0
  281. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_toolkit.py +0 -0
  282. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_video.py +0 -0
  283. {datachain-0.14.5 → datachain-0.16.0}/tests/func/test_warehouse.py +0 -0
  284. {datachain-0.14.5 → datachain-0.16.0}/tests/scripts/feature_class.py +0 -0
  285. {datachain-0.14.5 → datachain-0.16.0}/tests/scripts/feature_class_exception.py +0 -0
  286. {datachain-0.14.5 → datachain-0.16.0}/tests/scripts/feature_class_parallel.py +0 -0
  287. {datachain-0.14.5 → datachain-0.16.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  288. {datachain-0.14.5 → datachain-0.16.0}/tests/scripts/name_len_slow.py +0 -0
  289. {datachain-0.14.5 → datachain-0.16.0}/tests/test_atomicity.py +0 -0
  290. {datachain-0.14.5 → datachain-0.16.0}/tests/test_cli_e2e.py +0 -0
  291. {datachain-0.14.5 → datachain-0.16.0}/tests/test_query_e2e.py +0 -0
  292. {datachain-0.14.5 → datachain-0.16.0}/tests/test_telemetry.py +0 -0
  293. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/__init__.py +0 -0
  294. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/__init__.py +0 -0
  295. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/conftest.py +0 -0
  296. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_arrow.py +0 -0
  297. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_clip.py +0 -0
  298. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  299. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  300. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_diff.py +0 -0
  301. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_feature.py +0 -0
  302. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_file.py +0 -0
  303. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_hf.py +0 -0
  304. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_image.py +0 -0
  305. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_listing_info.py +0 -0
  306. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  307. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_schema.py +0 -0
  308. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_signal_schema.py +0 -0
  309. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  310. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_text.py +0 -0
  311. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_udf_signature.py +0 -0
  312. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_utils.py +0 -0
  313. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/lib/test_webdataset.py +0 -0
  314. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/model/__init__.py +0 -0
  315. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/model/test_bbox.py +0 -0
  316. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/model/test_pose.py +0 -0
  317. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/model/test_segment.py +0 -0
  318. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/model/test_utils.py +0 -0
  319. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/__init__.py +0 -0
  320. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  321. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  322. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  323. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_array.py +0 -0
  324. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_conditional.py +0 -0
  325. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_path.py +0 -0
  326. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_random.py +0 -0
  327. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_selectable.py +0 -0
  328. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/sql/test_string.py +0 -0
  329. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_asyn.py +0 -0
  330. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_cache.py +0 -0
  331. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_catalog.py +0 -0
  332. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_cli_parsing.py +0 -0
  333. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_client.py +0 -0
  334. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_client_gcs.py +0 -0
  335. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_client_s3.py +0 -0
  336. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_config.py +0 -0
  337. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_data_storage.py +0 -0
  338. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_database_engine.py +0 -0
  339. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_dataset.py +0 -0
  340. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_dispatch.py +0 -0
  341. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_fileslice.py +0 -0
  342. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_func.py +0 -0
  343. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_listing.py +0 -0
  344. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_metastore.py +0 -0
  345. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_module_exports.py +0 -0
  346. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_pytorch.py +0 -0
  347. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_query.py +0 -0
  348. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_query_metrics.py +0 -0
  349. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_query_params.py +0 -0
  350. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_script_meta.py +0 -0
  351. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_serializer.py +0 -0
  352. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_session.py +0 -0
  353. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_utils.py +0 -0
  354. {datachain-0.14.5 → datachain-0.16.0}/tests/unit/test_warehouse.py +0 -0
  355. {datachain-0.14.5 → datachain-0.16.0}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.11.4'
27
+ rev: 'v0.11.5'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.5
3
+ Version: 0.16.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -31,6 +31,10 @@ for examples of how to create a chain.
31
31
 
32
32
  ::: datachain.lib.dc.values.read_values
33
33
 
34
+ ::: datachain.lib.dc.database.read_database
35
+
36
+ ::: datachain.lib.dc.database.ConnectionType
37
+
34
38
  ::: datachain.lib.dc.DataChain
35
39
 
36
40
  ::: datachain.lib.utils.DataChainError
@@ -27,7 +27,7 @@ filtered = (
27
27
  / func.least("laion.json.original_width", "laion.json.original_height")
28
28
  < 3.0
29
29
  )
30
- .save()
30
+ .persist()
31
31
  )
32
32
 
33
33
  filtered.show(3)
@@ -5,8 +5,10 @@ from datachain.lib.dc import (
5
5
  DataChain,
6
6
  Sys,
7
7
  datasets,
8
+ delete_dataset,
8
9
  listings,
9
10
  read_csv,
11
+ read_database,
10
12
  read_dataset,
11
13
  read_hf,
12
14
  read_json,
@@ -61,11 +63,13 @@ __all__ = [
61
63
  "VideoFragment",
62
64
  "VideoFrame",
63
65
  "datasets",
66
+ "delete_dataset",
64
67
  "is_chain_type",
65
68
  "listings",
66
69
  "metrics",
67
70
  "param",
68
71
  "read_csv",
72
+ "read_database",
69
73
  "read_dataset",
70
74
  "read_hf",
71
75
  "read_json",
@@ -776,7 +776,7 @@ class Catalog:
776
776
  listing: Optional[bool] = False,
777
777
  uuid: Optional[str] = None,
778
778
  description: Optional[str] = None,
779
- labels: Optional[list[str]] = None,
779
+ attrs: Optional[list[str]] = None,
780
780
  ) -> "DatasetRecord":
781
781
  """
782
782
  Creates new dataset of a specific version.
@@ -794,16 +794,16 @@ class Catalog:
794
794
  dataset = self.get_dataset(name)
795
795
  default_version = dataset.next_version
796
796
 
797
- if (description or labels) and (
798
- dataset.description != description or dataset.labels != labels
797
+ if (description or attrs) and (
798
+ dataset.description != description or dataset.attrs != attrs
799
799
  ):
800
800
  description = description or dataset.description
801
- labels = labels or dataset.labels
801
+ attrs = attrs or dataset.attrs
802
802
 
803
803
  self.update_dataset(
804
804
  dataset,
805
805
  description=description,
806
- labels=labels,
806
+ attrs=attrs,
807
807
  )
808
808
 
809
809
  except DatasetNotFoundError:
@@ -817,7 +817,7 @@ class Catalog:
817
817
  schema=schema,
818
818
  ignore_if_exists=True,
819
819
  description=description,
820
- labels=labels,
820
+ attrs=attrs,
821
821
  )
822
822
 
823
823
  version = version or default_version
@@ -1299,7 +1299,17 @@ class Catalog:
1299
1299
  name: str,
1300
1300
  version: Optional[int] = None,
1301
1301
  force: Optional[bool] = False,
1302
+ studio: Optional[bool] = False,
1302
1303
  ):
1304
+ from datachain.remote.studio import StudioClient
1305
+
1306
+ if studio:
1307
+ client = StudioClient()
1308
+ response = client.rm_dataset(name, version=version, force=force)
1309
+ if not response.ok:
1310
+ raise DataChainError(response.message)
1311
+ return
1312
+
1303
1313
  dataset = self.get_dataset(name)
1304
1314
  if not version and not force:
1305
1315
  raise ValueError(f"Missing dataset version from input for dataset {name}")
@@ -1324,15 +1334,15 @@ class Catalog:
1324
1334
  name: str,
1325
1335
  new_name: Optional[str] = None,
1326
1336
  description: Optional[str] = None,
1327
- labels: Optional[list[str]] = None,
1337
+ attrs: Optional[list[str]] = None,
1328
1338
  ) -> DatasetRecord:
1329
1339
  update_data = {}
1330
1340
  if new_name:
1331
1341
  update_data["name"] = new_name
1332
1342
  if description is not None:
1333
1343
  update_data["description"] = description
1334
- if labels is not None:
1335
- update_data["labels"] = labels # type: ignore[assignment]
1344
+ if attrs is not None:
1345
+ update_data["attrs"] = attrs # type: ignore[assignment]
1336
1346
 
1337
1347
  dataset = self.get_dataset(name)
1338
1348
  return self.update_dataset(dataset, **update_data)
@@ -1,4 +1,5 @@
1
1
  import os
2
+ import sys
2
3
  from importlib import import_module
3
4
  from typing import TYPE_CHECKING, Any, Optional
4
5
 
@@ -15,6 +16,7 @@ METASTORE_ARG_PREFIX = "DATACHAIN_METASTORE_ARG_"
15
16
  WAREHOUSE_SERIALIZED = "DATACHAIN__WAREHOUSE"
16
17
  WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
17
18
  WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
19
+ DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
18
20
  DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
19
21
 
20
22
  IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
@@ -100,19 +102,21 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
100
102
  return warehouse_class(**warehouse_args)
101
103
 
102
104
 
103
- def get_udf_distributor_class() -> type["AbstractUDFDistributor"]:
104
- distributed_import_path = os.environ.get(DISTRIBUTED_IMPORT_PATH)
105
+ def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
106
+ if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
107
+ return None
105
108
 
106
- if not distributed_import_path:
107
- raise RuntimeError(
108
- f"{DISTRIBUTED_IMPORT_PATH} import path is required "
109
- "for distributed UDF processing."
110
- )
111
109
  # Distributed class paths are specified as (for example): module.classname
112
110
  if "." not in distributed_import_path:
113
111
  raise RuntimeError(
114
112
  f"Invalid {DISTRIBUTED_IMPORT_PATH} import path: {distributed_import_path}"
115
113
  )
114
+
115
+ # Optional: set the Python path to look for the module
116
+ distributed_import_pythonpath = os.environ.get(DISTRIBUTED_IMPORT_PYTHONPATH)
117
+ if distributed_import_pythonpath and distributed_import_pythonpath not in sys.path:
118
+ sys.path.insert(0, distributed_import_pythonpath)
119
+
116
120
  module_name, _, class_name = distributed_import_path.rpartition(".")
117
121
  distributed = import_module(module_name)
118
122
  return getattr(distributed, class_name)
@@ -149,7 +149,7 @@ def handle_dataset_command(args, catalog):
149
149
  args.name,
150
150
  new_name=args.new_name,
151
151
  description=args.description,
152
- labels=args.labels,
152
+ attrs=args.attrs,
153
153
  studio=args.studio,
154
154
  local=args.local,
155
155
  all=args.all,
@@ -154,7 +154,7 @@ def edit_dataset(
154
154
  name: str,
155
155
  new_name: Optional[str] = None,
156
156
  description: Optional[str] = None,
157
- labels: Optional[list[str]] = None,
157
+ attrs: Optional[list[str]] = None,
158
158
  studio: bool = False,
159
159
  local: bool = False,
160
160
  all: bool = True,
@@ -167,9 +167,9 @@ def edit_dataset(
167
167
 
168
168
  if all or local:
169
169
  try:
170
- catalog.edit_dataset(name, new_name, description, labels)
170
+ catalog.edit_dataset(name, new_name, description, attrs)
171
171
  except DatasetNotFoundError:
172
172
  print("Dataset not found in local", file=sys.stderr)
173
173
 
174
174
  if (all or studio) and token:
175
- edit_studio_dataset(team, name, new_name, description, labels)
175
+ edit_studio_dataset(team, name, new_name, description, attrs)
@@ -42,8 +42,8 @@ def show(
42
42
  print("Name: ", name)
43
43
  if dataset.description:
44
44
  print("Description: ", dataset.description)
45
- if dataset.labels:
46
- print("Labels: ", ",".join(dataset.labels))
45
+ if dataset.attrs:
46
+ print("Attributes: ", ",".join(dataset.attrs))
47
47
  print("\n")
48
48
 
49
49
  show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
@@ -217,9 +217,9 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
217
217
  help="Dataset description",
218
218
  )
219
219
  parse_edit_dataset.add_argument(
220
- "--labels",
220
+ "--attrs",
221
221
  nargs="+",
222
- help="Dataset labels",
222
+ help="Dataset attributes",
223
223
  )
224
224
  parse_edit_dataset.add_argument(
225
225
  "--studio",
@@ -120,7 +120,7 @@ class AbstractMetastore(ABC, Serializable):
120
120
  schema: Optional[dict[str, Any]] = None,
121
121
  ignore_if_exists: bool = False,
122
122
  description: Optional[str] = None,
123
- labels: Optional[list[str]] = None,
123
+ attrs: Optional[list[str]] = None,
124
124
  ) -> DatasetRecord:
125
125
  """Creates new dataset."""
126
126
 
@@ -326,7 +326,7 @@ class AbstractDBMetastore(AbstractMetastore):
326
326
  Column("id", Integer, primary_key=True),
327
327
  Column("name", Text, nullable=False),
328
328
  Column("description", Text),
329
- Column("labels", JSON, nullable=True),
329
+ Column("attrs", JSON, nullable=True),
330
330
  Column("status", Integer, nullable=False),
331
331
  Column("feature_schema", JSON, nullable=True),
332
332
  Column("created_at", DateTime(timezone=True)),
@@ -521,7 +521,7 @@ class AbstractDBMetastore(AbstractMetastore):
521
521
  schema: Optional[dict[str, Any]] = None,
522
522
  ignore_if_exists: bool = False,
523
523
  description: Optional[str] = None,
524
- labels: Optional[list[str]] = None,
524
+ attrs: Optional[list[str]] = None,
525
525
  **kwargs, # TODO registered = True / False
526
526
  ) -> DatasetRecord:
527
527
  """Creates new dataset."""
@@ -538,7 +538,7 @@ class AbstractDBMetastore(AbstractMetastore):
538
538
  query_script=query_script,
539
539
  schema=json.dumps(schema or {}),
540
540
  description=description,
541
- labels=json.dumps(labels or []),
541
+ attrs=json.dumps(attrs or []),
542
542
  )
543
543
  if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
544
544
  # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
@@ -621,7 +621,7 @@ class AbstractDBMetastore(AbstractMetastore):
621
621
  dataset_values = {}
622
622
  for field, value in kwargs.items():
623
623
  if field in self._dataset_fields[1:]:
624
- if field in ["labels", "schema"]:
624
+ if field in ["attrs", "schema"]:
625
625
  values[field] = json.dumps(value) if value else None
626
626
  else:
627
627
  values[field] = value
@@ -329,7 +329,7 @@ class DatasetRecord:
329
329
  id: int
330
330
  name: str
331
331
  description: Optional[str]
332
- labels: list[str]
332
+ attrs: list[str]
333
333
  schema: dict[str, Union[SQLType, type[SQLType]]]
334
334
  feature_schema: dict
335
335
  versions: list[DatasetVersion]
@@ -357,7 +357,7 @@ class DatasetRecord:
357
357
  id: int,
358
358
  name: str,
359
359
  description: Optional[str],
360
- labels: str,
360
+ attrs: str,
361
361
  status: int,
362
362
  feature_schema: Optional[str],
363
363
  created_at: datetime,
@@ -387,7 +387,7 @@ class DatasetRecord:
387
387
  version_schema: str,
388
388
  version_job_id: Optional[str] = None,
389
389
  ) -> "DatasetRecord":
390
- labels_lst: list[str] = json.loads(labels) if labels else []
390
+ attrs_lst: list[str] = json.loads(attrs) if attrs else []
391
391
  schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
392
392
  version_schema_dct: dict[str, str] = (
393
393
  json.loads(version_schema) if version_schema else {}
@@ -418,7 +418,7 @@ class DatasetRecord:
418
418
  id,
419
419
  name,
420
420
  description,
421
- labels_lst,
421
+ attrs_lst,
422
422
  cls.parse_schema(schema_dct), # type: ignore[arg-type]
423
423
  json.loads(feature_schema) if feature_schema else {},
424
424
  [dataset_version],
@@ -562,7 +562,7 @@ class DatasetListRecord:
562
562
  id: int
563
563
  name: str
564
564
  description: Optional[str]
565
- labels: list[str]
565
+ attrs: list[str]
566
566
  versions: list[DatasetListVersion]
567
567
  created_at: Optional[datetime] = None
568
568
 
@@ -572,7 +572,7 @@ class DatasetListRecord:
572
572
  id: int,
573
573
  name: str,
574
574
  description: Optional[str],
575
- labels: str,
575
+ attrs: str,
576
576
  created_at: datetime,
577
577
  version_id: int,
578
578
  version_uuid: str,
@@ -588,7 +588,7 @@ class DatasetListRecord:
588
588
  version_query_script: Optional[str],
589
589
  version_job_id: Optional[str] = None,
590
590
  ) -> "DatasetListRecord":
591
- labels_lst: list[str] = json.loads(labels) if labels else []
591
+ attrs_lst: list[str] = json.loads(attrs) if attrs else []
592
592
 
593
593
  dataset_version = DatasetListVersion.parse(
594
594
  version_id,
@@ -610,7 +610,7 @@ class DatasetListRecord:
610
610
  id,
611
611
  name,
612
612
  description,
613
- labels_lst,
613
+ attrs_lst,
614
614
  [dataset_version],
615
615
  created_at,
616
616
  )
@@ -1,5 +1,6 @@
1
+ import itertools
1
2
  from collections.abc import Sequence
2
- from typing import Any, Union
3
+ from typing import Any, Optional, Union
3
4
 
4
5
  from datachain.lib.data_model import (
5
6
  DataType,
@@ -66,21 +67,29 @@ def values_to_tuples( # noqa: C901, PLR0912
66
67
  f"signal '{k}' is not present in the output",
67
68
  )
68
69
  else:
69
- if len_ == 0:
70
- raise ValuesToTupleError(ds_name, f"signal '{k}' is empty list")
71
-
72
- first_element = next(iter(v))
73
- typ = type(first_element)
74
- if not is_chain_type(typ):
75
- raise ValuesToTupleError(
76
- ds_name,
77
- f"signal '{k}' has unsupported type '{typ.__name__}'."
78
- f" Please use DataModel types: {DataTypeNames}",
70
+ # FIXME: Stops as soon as it finds the first non-None value.
71
+ # If a non-None value appears early, it won't check the remaining items for
72
+ # `None` values.
73
+ try:
74
+ pos, first_not_none_element = next(
75
+ itertools.dropwhile(lambda pair: pair[1] is None, enumerate(v))
79
76
  )
80
- if isinstance(first_element, list):
81
- types_map[k] = list[type(first_element[0])] # type: ignore[assignment, misc]
77
+ except StopIteration:
78
+ typ = str # default to str if all values are None or has length 0
79
+ nullable = True
82
80
  else:
83
- types_map[k] = typ
81
+ nullable = pos > 0
82
+ typ = type(first_not_none_element) # type: ignore[assignment]
83
+ if not is_chain_type(typ):
84
+ raise ValuesToTupleError(
85
+ ds_name,
86
+ f"signal '{k}' has unsupported type '{typ.__name__}'."
87
+ f" Please use DataModel types: {DataTypeNames}",
88
+ )
89
+ if isinstance(first_not_none_element, list):
90
+ typ = list[type(first_not_none_element[0])] # type: ignore[assignment, misc]
91
+
92
+ types_map[k] = Optional[typ] if nullable else typ # type: ignore[assignment]
84
93
 
85
94
  if length < 0:
86
95
  length = len_
@@ -32,11 +32,28 @@ class DatasetInfo(DataModel):
32
32
  metrics: dict[str, Any] = Field(default={})
33
33
  error_message: str = Field(default="")
34
34
  error_stack: str = Field(default="")
35
+ attrs: list[str] = Field(default=[])
35
36
 
36
37
  @property
37
38
  def is_temp(self) -> bool:
38
39
  return Session.is_temp_dataset(self.name)
39
40
 
41
+ def has_attr(self, attr: str) -> bool:
42
+ s = attr.split("=")
43
+ if len(s) == 1:
44
+ return attr in self.attrs
45
+
46
+ name = s[0]
47
+ value = s[1]
48
+ for a in self.attrs:
49
+ s = a.split("=")
50
+ if value == "*" and s[0] == name:
51
+ return True
52
+ if len(s) == 2 and s[0] == name and s[1] == value:
53
+ return True
54
+
55
+ return False
56
+
40
57
  @staticmethod
41
58
  def _validate_dict(
42
59
  v: Optional[Union[str, dict]],
@@ -83,4 +100,5 @@ class DatasetInfo(DataModel):
83
100
  metrics=job.metrics if job else {},
84
101
  error_message=version.error_message,
85
102
  error_stack=version.error_stack,
103
+ attrs=dataset.attrs,
86
104
  )
@@ -1,6 +1,7 @@
1
1
  from .csv import read_csv
2
+ from .database import read_database
2
3
  from .datachain import C, Column, DataChain
3
- from .datasets import datasets, read_dataset
4
+ from .datasets import datasets, delete_dataset, read_dataset
4
5
  from .hf import read_hf
5
6
  from .json import read_json
6
7
  from .listings import listings
@@ -19,8 +20,10 @@ __all__ = [
19
20
  "DatasetPrepareError",
20
21
  "Sys",
21
22
  "datasets",
23
+ "delete_dataset",
22
24
  "listings",
23
25
  "read_csv",
26
+ "read_database",
24
27
  "read_dataset",
25
28
  "read_hf",
26
29
  "read_json",
@@ -0,0 +1,151 @@
1
+ import contextlib
2
+ import itertools
3
+ import os
4
+ import sqlite3
5
+ from typing import TYPE_CHECKING, Any, Optional, Union
6
+
7
+ import sqlalchemy
8
+
9
+ if TYPE_CHECKING:
10
+ from collections.abc import Iterator, Mapping, Sequence
11
+
12
+ import sqlalchemy.orm # noqa: TC004
13
+
14
+ from datachain.lib.data_model import DataType
15
+ from datachain.query import Session
16
+
17
+ from .datachain import DataChain
18
+
19
+ ConnectionType = Union[
20
+ str,
21
+ sqlalchemy.engine.URL,
22
+ sqlalchemy.engine.interfaces.Connectable,
23
+ sqlalchemy.engine.Engine,
24
+ sqlalchemy.engine.Connection,
25
+ sqlalchemy.orm.Session,
26
+ sqlite3.Connection,
27
+ ]
28
+
29
+
30
+ @contextlib.contextmanager
31
+ def _connect(
32
+ connection: "ConnectionType",
33
+ ) -> "Iterator[Union[sqlalchemy.engine.Connection, sqlalchemy.orm.Session]]":
34
+ import sqlalchemy.orm
35
+
36
+ with contextlib.ExitStack() as stack:
37
+ engine_kwargs = {"echo": bool(os.environ.get("DEBUG_SHOW_SQL_QUERIES"))}
38
+ if isinstance(connection, (str, sqlalchemy.URL)):
39
+ engine = sqlalchemy.create_engine(connection, **engine_kwargs)
40
+ stack.callback(engine.dispose)
41
+ yield stack.enter_context(engine.connect())
42
+ elif isinstance(connection, sqlite3.Connection):
43
+ engine = sqlalchemy.create_engine(
44
+ "sqlite://", creator=lambda: connection, **engine_kwargs
45
+ )
46
+ # do not close the connection, as it is managed by the caller
47
+ yield engine.connect()
48
+ elif isinstance(connection, sqlalchemy.Engine):
49
+ yield stack.enter_context(connection.connect())
50
+ elif isinstance(connection, (sqlalchemy.Connection, sqlalchemy.orm.Session)):
51
+ # do not close the connection, as it is managed by the caller
52
+ yield connection
53
+ else:
54
+ raise TypeError(f"Unsupported connection type: {type(connection).__name__}")
55
+
56
+
57
+ def _infer_schema(
58
+ result: "sqlalchemy.engine.Result",
59
+ to_infer: list[str],
60
+ infer_schema_length: Optional[int] = 100,
61
+ ) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
62
+ from datachain.lib.convert.values_to_tuples import values_to_tuples
63
+
64
+ if not to_infer:
65
+ return [], {}
66
+
67
+ rows = list(itertools.islice(result, infer_schema_length))
68
+ values = {col: [row._mapping[col] for row in rows] for col in to_infer}
69
+ _, output_schema, _ = values_to_tuples("", **values)
70
+ return rows, output_schema
71
+
72
+
73
+ def read_database(
74
+ query: Union[str, "sqlalchemy.sql.expression.Executable"],
75
+ connection: "ConnectionType",
76
+ params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
77
+ *,
78
+ output: Optional["dict[str, DataType]"] = None,
79
+ session: Optional["Session"] = None,
80
+ settings: Optional[dict] = None,
81
+ in_memory: bool = False,
82
+ infer_schema_length: Optional[int] = 100,
83
+ ) -> "DataChain":
84
+ """
85
+ Read the results of a SQL query into a DataChain, using a given database connection.
86
+
87
+ Args:
88
+ query:
89
+ The SQL query to execute. Can be a raw SQL string or a SQLAlchemy
90
+ `Executable` object.
91
+ connection: SQLAlchemy connectable, str, or a sqlite3 connection
92
+ Using SQLAlchemy makes it possible to use any DB supported by that
93
+ library. If a DBAPI2 object, only sqlite3 is supported. The user is
94
+ responsible for engine disposal and connection closure for the
95
+ SQLAlchemy connectable; str connections are closed automatically.
96
+ params: Parameters to pass to execute method.
97
+ output: A dictionary mapping column names to types, used to override the
98
+ schema inferred from the query results.
99
+ session: Session to use for the chain.
100
+ settings: Settings to use for the chain.
101
+ in_memory: If True, creates an in-memory session. Defaults to False.
102
+ infer_schema_length:
103
+ The maximum number of rows to scan for inferring schema.
104
+ If set to `None`, the full data may be scanned.
105
+ The rows used for schema inference are stored in memory,
106
+ so large values can lead to high memory usage.
107
+ Only applies if the `output` parameter is not set for the given column.
108
+
109
+ Examples:
110
+ Reading from a SQL query against a user-supplied connection:
111
+ ```python
112
+ query = "SELECT key, value FROM tbl"
113
+ chain = dc.read_database(query, connection, output={"value": float})
114
+ ```
115
+
116
+ Load data from a SQLAlchemy driver/engine:
117
+ ```python
118
+ from sqlalchemy import create_engine
119
+ engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
120
+ chain = dc.read_database("select * from tbl", engine)
121
+ ```
122
+
123
+ Load data from a parameterized SQLAlchemy query:
124
+ ```python
125
+ query = "SELECT key, value FROM tbl WHERE value > :value"
126
+ dc.read_database(query, engine, params={"value": 50})
127
+ ```
128
+
129
+ Notes:
130
+ This function works with a variety of databases — including, but not limited to,
131
+ SQLite, DuckDB, PostgreSQL, and Snowflake, provided the appropriate driver is
132
+ installed.
133
+ """
134
+ from datachain.lib.dc.records import read_records
135
+
136
+ output = output or {}
137
+ if isinstance(query, str):
138
+ query = sqlalchemy.text(query)
139
+ kw = {"execution_options": {"stream_results": True}} # use server-side cursors
140
+ with _connect(connection) as conn, conn.execute(query, params, **kw) as result:
141
+ cols = result.keys()
142
+ to_infer = [k for k in cols if k not in output] # preserve the order
143
+ rows, inferred_schema = _infer_schema(result, to_infer, infer_schema_length)
144
+ records = (row._asdict() for row in itertools.chain(rows, result))
145
+ return read_records(
146
+ records,
147
+ session=session,
148
+ settings=settings,
149
+ in_memory=in_memory,
150
+ schema=inferred_schema | output,
151
+ )