datachain 0.8.11__tar.gz → 0.8.13__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (309)
  1. {datachain-0.8.11 → datachain-0.8.13}/PKG-INFO +1 -1
  2. datachain-0.8.13/docs/references/func.md +5 -0
  3. {datachain-0.8.11 → datachain-0.8.13}/docs/references/index.md +1 -1
  4. {datachain-0.8.11 → datachain-0.8.13}/examples/get_started/common_sql_functions.py +16 -1
  5. {datachain-0.8.11 → datachain-0.8.13}/mkdocs.yml +1 -1
  6. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/catalog/catalog.py +1 -20
  7. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/__init__.py +0 -8
  8. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/commands/__init__.py +0 -2
  9. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/commands/datasets.py +0 -19
  10. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/parser/__init__.py +0 -25
  11. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/dataset.py +0 -6
  12. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/func/__init__.py +2 -1
  13. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/func/array.py +39 -1
  14. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/func/conditional.py +16 -9
  15. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/func/func.py +4 -5
  16. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/remote/studio.py +1 -13
  17. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/functions/array.py +13 -1
  18. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/sqlite/base.py +17 -1
  19. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/sqlite/types.py +5 -0
  20. {datachain-0.8.11 → datachain-0.8.13}/src/datachain.egg-info/PKG-INFO +1 -1
  21. {datachain-0.8.11 → datachain-0.8.13}/src/datachain.egg-info/SOURCES.txt +1 -1
  22. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_catalog.py +23 -22
  23. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_datachain.py +4 -3
  24. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_datasets.py +3 -3
  25. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_pull.py +0 -32
  26. {datachain-0.8.11 → datachain-0.8.13}/tests/test_cli_studio.py +1 -1
  27. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_datachain.py +23 -42
  28. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_diff.py +20 -20
  29. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/sql/test_array.py +18 -0
  30. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_func.py +55 -0
  31. datachain-0.8.11/docs/references/sql.md +0 -18
  32. {datachain-0.8.11 → datachain-0.8.13}/.cruft.json +0 -0
  33. {datachain-0.8.11 → datachain-0.8.13}/.gitattributes +0 -0
  34. {datachain-0.8.11 → datachain-0.8.13}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  35. {datachain-0.8.11 → datachain-0.8.13}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  36. {datachain-0.8.11 → datachain-0.8.13}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  37. {datachain-0.8.11 → datachain-0.8.13}/.github/codecov.yaml +0 -0
  38. {datachain-0.8.11 → datachain-0.8.13}/.github/dependabot.yml +0 -0
  39. {datachain-0.8.11 → datachain-0.8.13}/.github/workflows/benchmarks.yml +0 -0
  40. {datachain-0.8.11 → datachain-0.8.13}/.github/workflows/release.yml +0 -0
  41. {datachain-0.8.11 → datachain-0.8.13}/.github/workflows/tests-studio.yml +0 -0
  42. {datachain-0.8.11 → datachain-0.8.13}/.github/workflows/tests.yml +0 -0
  43. {datachain-0.8.11 → datachain-0.8.13}/.github/workflows/update-template.yaml +0 -0
  44. {datachain-0.8.11 → datachain-0.8.13}/.gitignore +0 -0
  45. {datachain-0.8.11 → datachain-0.8.13}/.pre-commit-config.yaml +0 -0
  46. {datachain-0.8.11 → datachain-0.8.13}/CODE_OF_CONDUCT.rst +0 -0
  47. {datachain-0.8.11 → datachain-0.8.13}/LICENSE +0 -0
  48. {datachain-0.8.11 → datachain-0.8.13}/README.rst +0 -0
  49. {datachain-0.8.11 → datachain-0.8.13}/docs/assets/captioned_cartoons.png +0 -0
  50. {datachain-0.8.11 → datachain-0.8.13}/docs/assets/datachain-white.svg +0 -0
  51. {datachain-0.8.11 → datachain-0.8.13}/docs/assets/datachain.svg +0 -0
  52. {datachain-0.8.11 → datachain-0.8.13}/docs/contributing.md +0 -0
  53. {datachain-0.8.11 → datachain-0.8.13}/docs/css/github-permalink-style.css +0 -0
  54. {datachain-0.8.11 → datachain-0.8.13}/docs/examples.md +0 -0
  55. {datachain-0.8.11 → datachain-0.8.13}/docs/index.md +0 -0
  56. {datachain-0.8.11 → datachain-0.8.13}/docs/overrides/main.html +0 -0
  57. {datachain-0.8.11 → datachain-0.8.13}/docs/quick-start.md +0 -0
  58. {datachain-0.8.11 → datachain-0.8.13}/docs/references/datachain.md +0 -0
  59. {datachain-0.8.11 → datachain-0.8.13}/docs/references/datatype.md +0 -0
  60. {datachain-0.8.11 → datachain-0.8.13}/docs/references/file.md +0 -0
  61. {datachain-0.8.11 → datachain-0.8.13}/docs/references/torch.md +0 -0
  62. {datachain-0.8.11 → datachain-0.8.13}/docs/references/udf.md +0 -0
  63. {datachain-0.8.11 → datachain-0.8.13}/docs/tutorials.md +0 -0
  64. {datachain-0.8.11 → datachain-0.8.13}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  65. {datachain-0.8.11 → datachain-0.8.13}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  66. {datachain-0.8.11 → datachain-0.8.13}/examples/computer_vision/openimage-detect.py +0 -0
  67. {datachain-0.8.11 → datachain-0.8.13}/examples/computer_vision/ultralytics-bbox.py +0 -0
  68. {datachain-0.8.11 → datachain-0.8.13}/examples/computer_vision/ultralytics-pose.py +0 -0
  69. {datachain-0.8.11 → datachain-0.8.13}/examples/computer_vision/ultralytics-segment.py +0 -0
  70. {datachain-0.8.11 → datachain-0.8.13}/examples/get_started/json-csv-reader.py +0 -0
  71. {datachain-0.8.11 → datachain-0.8.13}/examples/get_started/torch-loader.py +0 -0
  72. {datachain-0.8.11 → datachain-0.8.13}/examples/get_started/udfs/parallel.py +0 -0
  73. {datachain-0.8.11 → datachain-0.8.13}/examples/get_started/udfs/simple.py +0 -0
  74. {datachain-0.8.11 → datachain-0.8.13}/examples/get_started/udfs/stateful.py +0 -0
  75. {datachain-0.8.11 → datachain-0.8.13}/examples/llm_and_nlp/claude-query.py +0 -0
  76. {datachain-0.8.11 → datachain-0.8.13}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  77. {datachain-0.8.11 → datachain-0.8.13}/examples/multimodal/clip_inference.py +0 -0
  78. {datachain-0.8.11 → datachain-0.8.13}/examples/multimodal/hf_pipeline.py +0 -0
  79. {datachain-0.8.11 → datachain-0.8.13}/examples/multimodal/openai_image_desc_lib.py +0 -0
  80. {datachain-0.8.11 → datachain-0.8.13}/examples/multimodal/wds.py +0 -0
  81. {datachain-0.8.11 → datachain-0.8.13}/examples/multimodal/wds_filtered.py +0 -0
  82. {datachain-0.8.11 → datachain-0.8.13}/noxfile.py +0 -0
  83. {datachain-0.8.11 → datachain-0.8.13}/pyproject.toml +0 -0
  84. {datachain-0.8.11 → datachain-0.8.13}/setup.cfg +0 -0
  85. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/__init__.py +0 -0
  86. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/__main__.py +0 -0
  87. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/asyn.py +0 -0
  88. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cache.py +0 -0
  89. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/catalog/__init__.py +0 -0
  90. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/catalog/datasource.py +0 -0
  91. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/catalog/loader.py +0 -0
  92. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/commands/du.py +0 -0
  93. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/commands/index.py +0 -0
  94. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/commands/ls.py +0 -0
  95. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/commands/misc.py +0 -0
  96. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/commands/query.py +0 -0
  97. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/commands/show.py +0 -0
  98. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/parser/job.py +0 -0
  99. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/parser/studio.py +0 -0
  100. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/parser/utils.py +0 -0
  101. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/cli/utils.py +0 -0
  102. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/client/__init__.py +0 -0
  103. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/client/azure.py +0 -0
  104. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/client/fileslice.py +0 -0
  105. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/client/fsspec.py +0 -0
  106. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/client/gcs.py +0 -0
  107. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/client/hf.py +0 -0
  108. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/client/local.py +0 -0
  109. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/client/s3.py +0 -0
  110. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/config.py +0 -0
  111. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/data_storage/__init__.py +0 -0
  112. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/data_storage/db_engine.py +0 -0
  113. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/data_storage/job.py +0 -0
  114. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/data_storage/metastore.py +0 -0
  115. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/data_storage/schema.py +0 -0
  116. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/data_storage/serializer.py +0 -0
  117. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/data_storage/sqlite.py +0 -0
  118. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/data_storage/warehouse.py +0 -0
  119. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/diff/__init__.py +0 -0
  120. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/error.py +0 -0
  121. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/func/aggregate.py +0 -0
  122. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/func/base.py +0 -0
  123. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/func/numeric.py +0 -0
  124. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/func/path.py +0 -0
  125. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/func/random.py +0 -0
  126. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/func/string.py +0 -0
  127. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/func/window.py +0 -0
  128. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/job.py +0 -0
  129. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/__init__.py +0 -0
  130. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/arrow.py +0 -0
  131. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/clip.py +0 -0
  132. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/convert/__init__.py +0 -0
  133. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/convert/flatten.py +0 -0
  134. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/convert/python_to_sql.py +0 -0
  135. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/convert/sql_to_python.py +0 -0
  136. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/convert/unflatten.py +0 -0
  137. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  138. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/data_model.py +0 -0
  139. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/dataset_info.py +0 -0
  140. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/dc.py +0 -0
  141. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/file.py +0 -0
  142. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/hf.py +0 -0
  143. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/image.py +0 -0
  144. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/listing.py +0 -0
  145. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/listing_info.py +0 -0
  146. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/meta_formats.py +0 -0
  147. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/model_store.py +0 -0
  148. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/pytorch.py +0 -0
  149. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/settings.py +0 -0
  150. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/signal_schema.py +0 -0
  151. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/tar.py +0 -0
  152. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/text.py +0 -0
  153. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/udf.py +0 -0
  154. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/udf_signature.py +0 -0
  155. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/utils.py +0 -0
  156. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/vfile.py +0 -0
  157. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/webdataset.py +0 -0
  158. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/lib/webdataset_laion.py +0 -0
  159. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/listing.py +0 -0
  160. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/model/__init__.py +0 -0
  161. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/model/bbox.py +0 -0
  162. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/model/pose.py +0 -0
  163. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/model/segment.py +0 -0
  164. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/model/ultralytics/__init__.py +0 -0
  165. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/model/ultralytics/bbox.py +0 -0
  166. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/model/ultralytics/pose.py +0 -0
  167. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/model/ultralytics/segment.py +0 -0
  168. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/node.py +0 -0
  169. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/nodes_fetcher.py +0 -0
  170. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/nodes_thread_pool.py +0 -0
  171. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/progress.py +0 -0
  172. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/py.typed +0 -0
  173. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/query/__init__.py +0 -0
  174. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/query/batch.py +0 -0
  175. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/query/dataset.py +0 -0
  176. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/query/dispatch.py +0 -0
  177. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/query/metrics.py +0 -0
  178. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/query/params.py +0 -0
  179. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/query/queue.py +0 -0
  180. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/query/schema.py +0 -0
  181. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/query/session.py +0 -0
  182. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/query/udf.py +0 -0
  183. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/query/utils.py +0 -0
  184. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/remote/__init__.py +0 -0
  185. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/__init__.py +0 -0
  186. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/default/__init__.py +0 -0
  187. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/default/base.py +0 -0
  188. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/functions/__init__.py +0 -0
  189. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/functions/aggregate.py +0 -0
  190. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/functions/conditional.py +0 -0
  191. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/functions/numeric.py +0 -0
  192. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/functions/path.py +0 -0
  193. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/functions/random.py +0 -0
  194. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/functions/string.py +0 -0
  195. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/selectable.py +0 -0
  196. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/sqlite/__init__.py +0 -0
  197. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/sqlite/vector.py +0 -0
  198. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/types.py +0 -0
  199. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/sql/utils.py +0 -0
  200. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/studio.py +0 -0
  201. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/telemetry.py +0 -0
  202. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/toolkit/__init__.py +0 -0
  203. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/toolkit/split.py +0 -0
  204. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/torch/__init__.py +0 -0
  205. {datachain-0.8.11 → datachain-0.8.13}/src/datachain/utils.py +0 -0
  206. {datachain-0.8.11 → datachain-0.8.13}/src/datachain.egg-info/dependency_links.txt +0 -0
  207. {datachain-0.8.11 → datachain-0.8.13}/src/datachain.egg-info/entry_points.txt +0 -0
  208. {datachain-0.8.11 → datachain-0.8.13}/src/datachain.egg-info/requires.txt +0 -0
  209. {datachain-0.8.11 → datachain-0.8.13}/src/datachain.egg-info/top_level.txt +0 -0
  210. {datachain-0.8.11 → datachain-0.8.13}/tests/__init__.py +0 -0
  211. {datachain-0.8.11 → datachain-0.8.13}/tests/benchmarks/__init__.py +0 -0
  212. {datachain-0.8.11 → datachain-0.8.13}/tests/benchmarks/conftest.py +0 -0
  213. {datachain-0.8.11 → datachain-0.8.13}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  214. {datachain-0.8.11 → datachain-0.8.13}/tests/benchmarks/datasets/.dvc/config +0 -0
  215. {datachain-0.8.11 → datachain-0.8.13}/tests/benchmarks/datasets/.gitignore +0 -0
  216. {datachain-0.8.11 → datachain-0.8.13}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  217. {datachain-0.8.11 → datachain-0.8.13}/tests/benchmarks/test_datachain.py +0 -0
  218. {datachain-0.8.11 → datachain-0.8.13}/tests/benchmarks/test_ls.py +0 -0
  219. {datachain-0.8.11 → datachain-0.8.13}/tests/benchmarks/test_version.py +0 -0
  220. {datachain-0.8.11 → datachain-0.8.13}/tests/conftest.py +0 -0
  221. {datachain-0.8.11 → datachain-0.8.13}/tests/data.py +0 -0
  222. {datachain-0.8.11 → datachain-0.8.13}/tests/examples/__init__.py +0 -0
  223. {datachain-0.8.11 → datachain-0.8.13}/tests/examples/test_examples.py +0 -0
  224. {datachain-0.8.11 → datachain-0.8.13}/tests/examples/test_wds_e2e.py +0 -0
  225. {datachain-0.8.11 → datachain-0.8.13}/tests/examples/wds_data.py +0 -0
  226. {datachain-0.8.11 → datachain-0.8.13}/tests/func/__init__.py +0 -0
  227. {datachain-0.8.11 → datachain-0.8.13}/tests/func/fake-service-account-credentials.json +0 -0
  228. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_client.py +0 -0
  229. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_data_storage.py +0 -0
  230. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_datachain_merge.py +0 -0
  231. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_dataset_query.py +0 -0
  232. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_feature_pickling.py +0 -0
  233. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_file.py +0 -0
  234. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_hf.py +0 -0
  235. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_listing.py +0 -0
  236. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_ls.py +0 -0
  237. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_meta_formats.py +0 -0
  238. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_metrics.py +0 -0
  239. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_pytorch.py +0 -0
  240. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_query.py +0 -0
  241. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_session.py +0 -0
  242. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_toolkit.py +0 -0
  243. {datachain-0.8.11 → datachain-0.8.13}/tests/func/test_warehouse.py +0 -0
  244. {datachain-0.8.11 → datachain-0.8.13}/tests/scripts/feature_class.py +0 -0
  245. {datachain-0.8.11 → datachain-0.8.13}/tests/scripts/feature_class_exception.py +0 -0
  246. {datachain-0.8.11 → datachain-0.8.13}/tests/scripts/feature_class_parallel.py +0 -0
  247. {datachain-0.8.11 → datachain-0.8.13}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  248. {datachain-0.8.11 → datachain-0.8.13}/tests/scripts/name_len_slow.py +0 -0
  249. {datachain-0.8.11 → datachain-0.8.13}/tests/test_atomicity.py +0 -0
  250. {datachain-0.8.11 → datachain-0.8.13}/tests/test_cli_e2e.py +0 -0
  251. {datachain-0.8.11 → datachain-0.8.13}/tests/test_query_e2e.py +0 -0
  252. {datachain-0.8.11 → datachain-0.8.13}/tests/test_telemetry.py +0 -0
  253. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/__init__.py +0 -0
  254. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/__init__.py +0 -0
  255. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/conftest.py +0 -0
  256. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_arrow.py +0 -0
  257. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_clip.py +0 -0
  258. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  259. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_datachain_merge.py +0 -0
  260. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_feature.py +0 -0
  261. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_feature_utils.py +0 -0
  262. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_file.py +0 -0
  263. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_hf.py +0 -0
  264. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_image.py +0 -0
  265. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_listing_info.py +0 -0
  266. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_models.py +0 -0
  267. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_python_to_sql.py +0 -0
  268. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_schema.py +0 -0
  269. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_signal_schema.py +0 -0
  270. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_sql_to_python.py +0 -0
  271. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_text.py +0 -0
  272. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_udf_signature.py +0 -0
  273. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_utils.py +0 -0
  274. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/lib/test_webdataset.py +0 -0
  275. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/sql/__init__.py +0 -0
  276. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/sql/sqlite/__init__.py +0 -0
  277. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/sql/sqlite/test_types.py +0 -0
  278. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/sql/sqlite/test_utils.py +0 -0
  279. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/sql/test_conditional.py +0 -0
  280. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/sql/test_path.py +0 -0
  281. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/sql/test_random.py +0 -0
  282. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/sql/test_selectable.py +0 -0
  283. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/sql/test_string.py +0 -0
  284. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_asyn.py +0 -0
  285. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_cache.py +0 -0
  286. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_catalog.py +0 -0
  287. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_catalog_loader.py +0 -0
  288. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_cli_parsing.py +0 -0
  289. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_client.py +0 -0
  290. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_client_gcs.py +0 -0
  291. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_client_s3.py +0 -0
  292. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_config.py +0 -0
  293. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_data_storage.py +0 -0
  294. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_database_engine.py +0 -0
  295. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_dataset.py +0 -0
  296. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_dispatch.py +0 -0
  297. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_fileslice.py +0 -0
  298. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_listing.py +0 -0
  299. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_metastore.py +0 -0
  300. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_module_exports.py +0 -0
  301. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_pytorch.py +0 -0
  302. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_query.py +0 -0
  303. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_query_metrics.py +0 -0
  304. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_query_params.py +0 -0
  305. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_serializer.py +0 -0
  306. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_session.py +0 -0
  307. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_utils.py +0 -0
  308. {datachain-0.8.11 → datachain-0.8.13}/tests/unit/test_warehouse.py +0 -0
  309. {datachain-0.8.11 → datachain-0.8.13}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.8.11
3
+ Version: 0.8.13
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -0,0 +1,5 @@
1
+ # Functions
2
+
3
+ Use built-in functions for data manipulation and analysis to operate on the underlying database storing the chain data. These functions are useful for operations like [`DataChain.filter`](datachain.md#datachain.lib.dc.DataChain.filter) and [`DataChain.mutate`](datachain.md#datachain.lib.dc.DataChain.mutate). Import these functions from `datachain.func`.
4
+
5
+ ::: datachain.func
@@ -10,5 +10,5 @@ DataChain's API is organized into several modules:
10
10
  - [DataType](./datatype.md) - Type system and schema definitions
11
11
  - [File](./file.md) - File handling and storage operations
12
12
  - [UDF](./udf.md) - User-defined functions and transformations
13
- - [SQL](./sql.md) - SQL query integration
13
+ - [Functions](./func.md) - Built-in functions for data manipulation and analysis
14
14
  - [Torch](./torch.md) - PyTorch data loading utilities
@@ -9,7 +9,7 @@ def num_chars_udf(file):
9
9
  return ([],)
10
10
 
11
11
 
12
- dc = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/")
12
+ dc = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", anon=True)
13
13
  dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
14
14
  "file.path", "num_chars"
15
15
  ).show(5)
@@ -32,6 +32,12 @@ dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
32
32
  .show(5)
33
33
  )
34
34
 
35
+ parts = string.split(path.name(C("file.path")), ".")
36
+ chain = dc.mutate(
37
+ isdog=array.contains(parts, "dog"),
38
+ iscat=array.contains(parts, "cat"),
39
+ )
40
+ chain.select("file.path", "isdog", "iscat").show(5)
35
41
 
36
42
  chain = dc.mutate(
37
43
  a=array.length(string.split("file.path", "/")),
@@ -79,6 +85,15 @@ Processed: 400 rows [00:00, 16364.66 rows/s]
79
85
  3 dogs-and-cats/cat.10.json cat.10 json
80
86
  4 dogs-and-cats/cat.100.jpg cat.100 jpg
81
87
 
88
+ [Limited by 5 rows]
89
+ file isdog iscat
90
+ path
91
+ 0 dogs-and-cats/cat.1.jpg 0 1
92
+ 1 dogs-and-cats/cat.1.json 0 1
93
+ 2 dogs-and-cats/cat.10.jpg 0 1
94
+ 3 dogs-and-cats/cat.10.json 0 1
95
+ 4 dogs-and-cats/cat.100.jpg 0 1
96
+
82
97
  [Limited by 5 rows]
83
98
  Processed: 400 rows [00:00, 16496.93 rows/s]
84
99
  a b greatest least
@@ -73,7 +73,7 @@ nav:
73
73
  - File: references/file.md
74
74
  - UDF: references/udf.md
75
75
  - Torch: references/torch.md
76
- - SQL: references/sql.md
76
+ - Functions: references/func.md
77
77
  - 🤝 Contributing: contributing.md
78
78
 
79
79
  - DataChain Website ↗: https://datachain.ai" target="_blank"
@@ -38,7 +38,6 @@ from datachain.dataset import (
38
38
  DatasetDependency,
39
39
  DatasetListRecord,
40
40
  DatasetRecord,
41
- DatasetStats,
42
41
  DatasetStatus,
43
42
  StorageURI,
44
43
  create_dataset_uri,
@@ -1235,17 +1234,6 @@ class Catalog:
1235
1234
  dataset = self.get_dataset(name)
1236
1235
  return self.warehouse.dataset_table_export_file_names(dataset, version)
1237
1236
 
1238
- def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
1239
- """
1240
- Returns tuple with dataset stats: total number of rows and total dataset size.
1241
- """
1242
- dataset = self.get_dataset(name)
1243
- dataset_version = dataset.get_version(version or dataset.latest_version)
1244
- return DatasetStats(
1245
- num_objects=dataset_version.num_objects,
1246
- size=dataset_version.size,
1247
- )
1248
-
1249
1237
  def remove_dataset(
1250
1238
  self,
1251
1239
  name: str,
@@ -1391,19 +1379,12 @@ class Catalog:
1391
1379
  except DatasetNotFoundError:
1392
1380
  pass
1393
1381
 
1394
- stats_response = studio_client.dataset_stats(
1395
- remote_ds_name, remote_ds_version.version
1396
- )
1397
- if not stats_response.ok:
1398
- raise_remote_error(stats_response.message)
1399
- ds_stats = stats_response.data
1400
-
1401
1382
  dataset_save_progress_bar = tqdm(
1402
1383
  desc=f"Saving dataset {remote_ds_uri} locally: ",
1403
1384
  unit=" rows",
1404
1385
  unit_scale=True,
1405
1386
  unit_divisor=1000,
1406
- total=ds_stats.num_objects, # type: ignore [union-attr]
1387
+ total=remote_ds_version.num_objects, # type: ignore [union-attr]
1407
1388
  leave=False,
1408
1389
  )
1409
1390
 
@@ -11,7 +11,6 @@ from datachain.telemetry import telemetry
11
11
  from .commands import (
12
12
  clear_cache,
13
13
  completion,
14
- dataset_stats,
15
14
  du,
16
15
  edit_dataset,
17
16
  garbage_collect,
@@ -182,13 +181,6 @@ def handle_dataset_command(args, catalog):
182
181
  all=args.all,
183
182
  team=args.team,
184
183
  ),
185
- "stats": lambda: dataset_stats(
186
- catalog,
187
- args.name,
188
- args.version,
189
- show_bytes=args.bytes,
190
- si=args.si,
191
- ),
192
184
  }
193
185
 
194
186
  handler = dataset_commands.get(args.datasets_cmd)
@@ -1,5 +1,4 @@
1
1
  from .datasets import (
2
- dataset_stats,
3
2
  edit_dataset,
4
3
  list_datasets,
5
4
  list_datasets_local,
@@ -15,7 +14,6 @@ from .show import show
15
14
  __all__ = [
16
15
  "clear_cache",
17
16
  "completion",
18
- "dataset_stats",
19
17
  "du",
20
18
  "edit_dataset",
21
19
  "garbage_collect",
@@ -3,8 +3,6 @@ from typing import TYPE_CHECKING, Optional
3
3
 
4
4
  from tabulate import tabulate
5
5
 
6
- from datachain import utils
7
-
8
6
  if TYPE_CHECKING:
9
7
  from datachain.catalog import Catalog
10
8
 
@@ -109,20 +107,3 @@ def edit_dataset(
109
107
 
110
108
  if (all or studio) and token:
111
109
  edit_studio_dataset(team, name, new_name, description, labels)
112
-
113
-
114
- def dataset_stats(
115
- catalog: "Catalog",
116
- name: str,
117
- version: int,
118
- show_bytes=False,
119
- si=False,
120
- ):
121
- stats = catalog.dataset_stats(name, version)
122
-
123
- if stats:
124
- print(f"Number of objects: {stats.num_objects}")
125
- if show_bytes:
126
- print(f"Total objects size: {stats.size}")
127
- else:
128
- print(f"Total objects size: {utils.sizeof_fmt(stats.size, si=si): >7}")
@@ -307,31 +307,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
307
307
  help="The team to delete a dataset. By default, it will use team from config",
308
308
  )
309
309
 
310
- dataset_stats_parser = datasets_subparser.add_parser(
311
- "stats", parents=[parent_parser], description="Show basic dataset statistics."
312
- )
313
- dataset_stats_parser.add_argument("name", type=str, help="Dataset name")
314
- dataset_stats_parser.add_argument(
315
- "--version",
316
- action="store",
317
- default=None,
318
- type=int,
319
- help="Dataset version",
320
- )
321
- dataset_stats_parser.add_argument(
322
- "-b",
323
- "--bytes",
324
- default=False,
325
- action="store_true",
326
- help="Display size in bytes instead of human-readable size",
327
- )
328
- dataset_stats_parser.add_argument(
329
- "--si",
330
- default=False,
331
- action="store_true",
332
- help="Display size using powers of 1000 not 1024",
333
- )
334
-
335
310
  parse_ls = subp.add_parser(
336
311
  "ls", parents=[parent_parser], description="List storage contents."
337
312
  )
@@ -150,12 +150,6 @@ class DatasetDependency:
150
150
  return hash(f"{self.type}_{self.name}_{self.version}")
151
151
 
152
152
 
153
- @dataclass
154
- class DatasetStats:
155
- num_objects: Optional[int] # None if table is missing
156
- size: Optional[int] # in bytes None if table is missing or empty
157
-
158
-
159
153
  class DatasetStatus:
160
154
  CREATED = 1
161
155
  PENDING = 2
@@ -15,7 +15,7 @@ from .aggregate import (
15
15
  row_number,
16
16
  sum,
17
17
  )
18
- from .array import cosine_distance, euclidean_distance, length, sip_hash_64
18
+ from .array import contains, cosine_distance, euclidean_distance, length, sip_hash_64
19
19
  from .conditional import case, greatest, ifelse, isnone, least
20
20
  from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
21
21
  from .random import rand
@@ -34,6 +34,7 @@ __all__ = [
34
34
  "case",
35
35
  "collect",
36
36
  "concat",
37
+ "contains",
37
38
  "cosine_distance",
38
39
  "count",
39
40
  "dense_rank",
@@ -1,5 +1,5 @@
1
1
  from collections.abc import Sequence
2
- from typing import Union
2
+ from typing import Any, Union
3
3
 
4
4
  from datachain.sql.functions import array
5
5
 
@@ -140,6 +140,44 @@ def length(arg: Union[str, Sequence, Func]) -> Func:
140
140
  return Func("length", inner=array.length, cols=cols, args=args, result_type=int)
141
141
 
142
142
 
143
+ def contains(arr: Union[str, Sequence, Func], elem: Any) -> Func:
144
+ """
145
+ Checks whether the `arr` array has the `elem` element.
146
+
147
+ Args:
148
+ arr (str | Sequence | Func): Array to check for the element.
149
+ If a string is provided, it is assumed to be the name of the array column.
150
+ If a sequence is provided, it is assumed to be an array of values.
151
+ If a Func is provided, it is assumed to be a function returning an array.
152
+ elem (Any): Element to check for in the array.
153
+
154
+ Returns:
155
+ Func: A Func object that represents the contains function. Result of the
156
+ function will be 1 if the element is present in the array, and 0 otherwise.
157
+
158
+ Example:
159
+ ```py
160
+ dc.mutate(
161
+ contains1=func.array.contains("signal.values", 3),
162
+ contains2=func.array.contains([1, 2, 3, 4, 5], 7),
163
+ )
164
+ ```
165
+ """
166
+
167
+ def inner(arg):
168
+ is_json = type(elem) in [list, dict]
169
+ return array.contains(arg, elem, is_json)
170
+
171
+ if isinstance(arr, (str, Func)):
172
+ cols = [arr]
173
+ args = None
174
+ else:
175
+ cols = None
176
+ args = [arr]
177
+
178
+ return Func("contains", inner=inner, cols=cols, args=args, result_type=int)
179
+
180
+
143
181
  def sip_hash_64(arg: Union[str, Sequence]) -> Func:
144
182
  """
145
183
  Computes the SipHash-64 hash of the array.
@@ -9,7 +9,7 @@ from datachain.sql.functions import conditional
9
9
 
10
10
  from .func import ColT, Func
11
11
 
12
- CaseT = Union[int, float, complex, bool, str, Func]
12
+ CaseT = Union[int, float, complex, bool, str, Func, ColumnElement]
13
13
 
14
14
 
15
15
  def greatest(*args: Union[ColT, float]) -> Func:
@@ -94,11 +94,12 @@ def case(
94
94
  """
95
95
  Returns the case function that produces case expression which has a list of
96
96
  conditions and corresponding results. Results can be python primitives like string,
97
- numbers or booleans but can also be other nested function (including case function).
97
+ numbers or booleans but can also be other nested functions (including case function)
98
+ or columns.
98
99
  Result type is inferred from condition results.
99
100
 
100
101
  Args:
101
- args (tuple((ColumnElement, Func), (str | int | float | complex | bool, Func))):
102
+ args tuple((ColumnElement | Func),(str | int | float | complex | bool, Func, ColumnElement)):
102
103
  Tuple of condition and values pair.
103
104
  else_ (str | int | float | complex | bool, Func): optional else value in case
104
105
  expression. If omitted, and no case conditions are satisfied, the result
@@ -113,13 +114,16 @@ def case(
113
114
  res=func.case((C("num") > 0, "P"), (C("num") < 0, "N"), else_="Z"),
114
115
  )
115
116
  ```
116
- """
117
+ """ # noqa: E501
117
118
  supported_types = [int, float, complex, str, bool]
118
119
 
119
120
  def _get_type(val):
120
121
  if isinstance(val, Func):
121
122
  # nested functions
122
123
  return val.result_type
124
+ if isinstance(val, Column):
125
+ # at this point we cannot know what is the type of a column
126
+ return None
123
127
  return type(val)
124
128
 
125
129
  if not args:
@@ -129,13 +133,16 @@ def case(
129
133
 
130
134
  for arg in args:
131
135
  arg_type = _get_type(arg[1])
136
+ if arg_type is None:
137
+ # we couldn't figure out the type of case value
138
+ continue
132
139
  if type_ and arg_type != type_:
133
140
  raise DataChainParamsError(
134
141
  f"Statement values must be of the same type, got {type_} and {arg_type}"
135
142
  )
136
143
  type_ = arg_type
137
144
 
138
- if type_ not in supported_types:
145
+ if type_ is not None and type_ not in supported_types:
139
146
  raise DataChainParamsError(
140
147
  f"Only python literals ({supported_types}) are supported for values"
141
148
  )
@@ -151,15 +158,15 @@ def ifelse(
151
158
  """
152
159
  Returns the ifelse function that produces if expression which has a condition
153
160
  and values for true and false outcome. Results can be one of python primitives
154
- like string, numbers or booleans, but can also be nested functions.
161
+ like string, numbers or booleans, but can also be nested functions or columns.
155
162
  Result type is inferred from the values.
156
163
 
157
164
  Args:
158
165
  condition (ColumnElement, Func): Condition which is evaluated.
159
- if_val (str | int | float | complex | bool, Func): Value for true
166
+ if_val (str | int | float | complex | bool, Func, ColumnElement): Value for true
160
167
  condition outcome.
161
- else_val (str | int | float | complex | bool, Func): Value for false condition
162
- outcome.
168
+ else_val (str | int | float | complex | bool, Func, ColumnElement): Value for
169
+ false condition outcome.
163
170
 
164
171
  Returns:
165
172
  Func: A Func object that represents the ifelse function.
@@ -424,10 +424,9 @@ class Func(Function):
424
424
 
425
425
  def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
426
426
  if isinstance(col, tuple):
427
- raise DataChainParamsError(
428
- "Cannot get type from tuple, please provide type hint to the function"
429
- )
430
-
427
+ # we can only get tuple from case statement where the first tuple item
428
+ # is condition, and second one is value which type is important
429
+ col = col[1]
431
430
  if isinstance(col, Func):
432
431
  return col.get_result_type(signals_schema)
433
432
 
@@ -435,7 +434,7 @@ def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
435
434
  return sql_to_python(col)
436
435
 
437
436
  return signals_schema.get_column_type(
438
- col.name if isinstance(col, ColumnElement) else col
437
+ col.name if isinstance(col, ColumnElement) else col # type: ignore[arg-type]
439
438
  )
440
439
 
441
440
 
@@ -16,14 +16,12 @@ from urllib.parse import urlparse, urlunparse
16
16
  import websockets
17
17
 
18
18
  from datachain.config import Config
19
- from datachain.dataset import DatasetStats
20
19
  from datachain.error import DataChainError
21
20
  from datachain.utils import STUDIO_URL, retry_with_backoff
22
21
 
23
22
  T = TypeVar("T")
24
23
  LsData = Optional[list[dict[str, Any]]]
25
24
  DatasetInfoData = Optional[dict[str, Any]]
26
- DatasetStatsData = Optional[DatasetStats]
27
25
  DatasetRowsData = Optional[Iterable[dict[str, Any]]]
28
26
  DatasetJobVersionsData = Optional[dict[str, Any]]
29
27
  DatasetExportStatus = Optional[dict[str, Any]]
@@ -309,7 +307,7 @@ class StudioClient:
309
307
  "datachain/datasets",
310
308
  {
311
309
  "dataset_name": name,
312
- "version": version,
310
+ "dataset_version": version,
313
311
  "force": force,
314
312
  },
315
313
  method="DELETE",
@@ -347,16 +345,6 @@ class StudioClient:
347
345
  method="GET",
348
346
  )
349
347
 
350
- def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
351
- response = self._send_request(
352
- "datachain/datasets/stats",
353
- {"dataset_name": name, "dataset_version": version},
354
- method="GET",
355
- )
356
- if response.ok:
357
- response.data = DatasetStats(**response.data)
358
- return response
359
-
360
348
  def export_dataset_table(
361
349
  self, name: str, version: int
362
350
  ) -> Response[DatasetExportSignedUrls]:
@@ -1,6 +1,6 @@
1
1
  from sqlalchemy.sql.functions import GenericFunction
2
2
 
3
- from datachain.sql.types import Float, Int64
3
+ from datachain.sql.types import Boolean, Float, Int64
4
4
  from datachain.sql.utils import compiler_not_implemented
5
5
 
6
6
 
@@ -37,6 +37,17 @@ class length(GenericFunction): # noqa: N801
37
37
  inherit_cache = True
38
38
 
39
39
 
40
+ class contains(GenericFunction): # noqa: N801
41
+ """
42
+ Checks if element is in the array.
43
+ """
44
+
45
+ type = Boolean()
46
+ package = "array"
47
+ name = "contains"
48
+ inherit_cache = True
49
+
50
+
40
51
  class sip_hash_64(GenericFunction): # noqa: N801
41
52
  """
42
53
  Computes the SipHash-64 hash of the array.
@@ -51,4 +62,5 @@ class sip_hash_64(GenericFunction): # noqa: N801
51
62
  compiler_not_implemented(cosine_distance)
52
63
  compiler_not_implemented(euclidean_distance)
53
64
  compiler_not_implemented(length)
65
+ compiler_not_implemented(contains)
54
66
  compiler_not_implemented(sip_hash_64)
@@ -87,6 +87,7 @@ def setup():
87
87
  compiles(sql_path.file_stem, "sqlite")(compile_path_file_stem)
88
88
  compiles(sql_path.file_ext, "sqlite")(compile_path_file_ext)
89
89
  compiles(array.length, "sqlite")(compile_array_length)
90
+ compiles(array.contains, "sqlite")(compile_array_contains)
90
91
  compiles(string.length, "sqlite")(compile_string_length)
91
92
  compiles(string.split, "sqlite")(compile_string_split)
92
93
  compiles(string.regexp_replace, "sqlite")(compile_string_regexp_replace)
@@ -269,13 +270,16 @@ def register_user_defined_sql_functions() -> None:
269
270
 
270
271
  _registered_function_creators["string_functions"] = create_string_functions
271
272
 
272
- has_json_extension = functions_exist(["json_array_length"])
273
+ has_json_extension = functions_exist(["json_array_length", "json_array_contains"])
273
274
  if not has_json_extension:
274
275
 
275
276
  def create_json_functions(conn):
276
277
  conn.create_function(
277
278
  "json_array_length", 1, py_json_array_length, deterministic=True
278
279
  )
280
+ conn.create_function(
281
+ "json_array_contains", 3, py_json_array_contains, deterministic=True
282
+ )
279
283
 
280
284
  _registered_function_creators["json_functions"] = create_json_functions
281
285
 
@@ -428,10 +432,22 @@ def py_json_array_length(arr):
428
432
  return len(orjson.loads(arr))
429
433
 
430
434
 
435
+ def py_json_array_contains(arr, value, is_json):
436
+ if is_json:
437
+ value = orjson.loads(value)
438
+ return value in orjson.loads(arr)
439
+
440
+
431
441
  def compile_array_length(element, compiler, **kwargs):
432
442
  return compiler.process(func.json_array_length(*element.clauses.clauses), **kwargs)
433
443
 
434
444
 
445
+ def compile_array_contains(element, compiler, **kwargs):
446
+ return compiler.process(
447
+ func.json_array_contains(*element.clauses.clauses), **kwargs
448
+ )
449
+
450
+
435
451
  def compile_string_length(element, compiler, **kwargs):
436
452
  return compiler.process(func.length(*element.clauses.clauses), **kwargs)
437
453
 
@@ -31,6 +31,10 @@ def adapt_array(arr):
31
31
  return orjson.dumps(arr).decode("utf-8")
32
32
 
33
33
 
34
+ def adapt_dict(dct):
35
+ return orjson.dumps(dct).decode("utf-8")
36
+
37
+
34
38
  def convert_array(arr):
35
39
  return orjson.loads(arr)
36
40
 
@@ -52,6 +56,7 @@ def adapt_np_generic(val):
52
56
 
53
57
  def register_type_converters():
54
58
  sqlite3.register_adapter(list, adapt_array)
59
+ sqlite3.register_adapter(dict, adapt_dict)
55
60
  sqlite3.register_converter("ARRAY", convert_array)
56
61
  if numpy_imported:
57
62
  sqlite3.register_adapter(np.ndarray, adapt_np_array)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.8.11
3
+ Version: 0.8.13
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -31,8 +31,8 @@ docs/overrides/main.html
31
31
  docs/references/datachain.md
32
32
  docs/references/datatype.md
33
33
  docs/references/file.md
34
+ docs/references/func.md
34
35
  docs/references/index.md
35
- docs/references/sql.md
36
36
  docs/references/torch.md
37
37
  docs/references/udf.md
38
38
  examples/computer_vision/iptc_exif_xmp_lib.py
@@ -17,7 +17,8 @@ from tests.utils import DEFAULT_TREE, skip_if_not_sqlite, tree_from_path
17
17
  def listing_stats(uri, catalog):
18
18
  list_dataset_name, _, _ = parse_listing_uri(uri, catalog.client_config)
19
19
  dataset = catalog.get_dataset(list_dataset_name)
20
- return catalog.dataset_stats(dataset.name, dataset.latest_version)
20
+ dataset_version = dataset.get_version(dataset.latest_version)
21
+ return dataset_version.num_objects, dataset_version.size
21
22
 
22
23
 
23
24
  @pytest.fixture
@@ -582,23 +583,23 @@ def test_listing_stats(cloud_test_catalog):
582
583
  listing_stats(src_uri, catalog)
583
584
 
584
585
  catalog.enlist_source(src_uri)
585
- stats = listing_stats(src_uri, catalog)
586
- assert stats.num_objects == 7
587
- assert stats.size == 36
586
+ num_objects, size = listing_stats(src_uri, catalog)
587
+ assert num_objects == 7
588
+ assert size == 36
588
589
 
589
590
  catalog.enlist_source(f"{src_uri}/dogs/", update=True)
590
- stats = listing_stats(src_uri, catalog)
591
- assert stats.num_objects == 7
592
- assert stats.size == 36
591
+ num_objects, size = listing_stats(src_uri, catalog)
592
+ assert num_objects == 7
593
+ assert size == 36
593
594
 
594
- stats = listing_stats(f"{src_uri}/dogs/", catalog)
595
- assert stats.num_objects == 4
596
- assert stats.size == 15
595
+ num_objects, size = listing_stats(f"{src_uri}/dogs/", catalog)
596
+ assert num_objects == 4
597
+ assert size == 15
597
598
 
598
599
  catalog.enlist_source(f"{src_uri}/dogs/")
599
- stats = listing_stats(src_uri, catalog)
600
- assert stats.num_objects == 7
601
- assert stats.size == 36
600
+ num_objects, size = listing_stats(src_uri, catalog)
601
+ assert num_objects == 7
602
+ assert size == 36
602
603
 
603
604
 
604
605
  @pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)
@@ -608,15 +609,15 @@ def test_enlist_source_handles_slash(cloud_test_catalog):
608
609
  src_path = f"{src_uri}/dogs"
609
610
 
610
611
  catalog.enlist_source(src_path)
611
- stats = listing_stats(src_path, catalog)
612
- assert stats.num_objects == len(DEFAULT_TREE["dogs"])
613
- assert stats.size == 15
612
+ num_objects, size = listing_stats(src_path, catalog)
613
+ assert num_objects == len(DEFAULT_TREE["dogs"])
614
+ assert size == 15
614
615
 
615
616
  src_path = f"{src_uri}/dogs"
616
617
  catalog.enlist_source(src_path, update=True)
617
- stats = listing_stats(src_path, catalog)
618
- assert stats.num_objects == len(DEFAULT_TREE["dogs"])
619
- assert stats.size == 15
618
+ num_objects, size = listing_stats(src_path, catalog)
619
+ assert num_objects == len(DEFAULT_TREE["dogs"])
620
+ assert size == 15
620
621
 
621
622
 
622
623
  @pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)
@@ -626,10 +627,10 @@ def test_enlist_source_handles_glob(cloud_test_catalog):
626
627
  src_path = f"{src_uri}/dogs/*.jpg"
627
628
 
628
629
  catalog.enlist_source(src_path)
629
- stats = listing_stats(src_path, catalog)
630
+ num_objects, size = listing_stats(src_path, catalog)
630
631
 
631
- assert stats.num_objects == len(DEFAULT_TREE["dogs"])
632
- assert stats.size == 15
632
+ assert num_objects == len(DEFAULT_TREE["dogs"])
633
+ assert size == 15
633
634
 
634
635
 
635
636
  @pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)
@@ -20,7 +20,7 @@ from sqlalchemy import Column
20
20
  from datachain import DataModel, func
21
21
  from datachain.catalog.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
22
22
  from datachain.data_storage.sqlite import SQLiteWarehouse
23
- from datachain.dataset import DatasetDependencyType, DatasetStats
23
+ from datachain.dataset import DatasetDependencyType
24
24
  from datachain.func import path as pathfunc
25
25
  from datachain.lib.dc import C, DataChain
26
26
  from datachain.lib.file import File, ImageFile
@@ -515,8 +515,9 @@ def test_from_storage_dataset_stats(tmp_dir, test_session):
515
515
  dc = DataChain.from_storage(tmp_dir.as_uri(), session=test_session).save(
516
516
  "test-data"
517
517
  )
518
- stats = test_session.catalog.dataset_stats(dc.name, dc.version)
519
- assert stats == DatasetStats(num_objects=4, size=20)
518
+ version = test_session.catalog.get_dataset(dc.name).get_version(dc.version)
519
+ assert version.num_objects == 4
520
+ assert version.size == 20
520
521
 
521
522
 
522
523
  def test_from_storage_check_rows(tmp_dir, test_session):