datachain 0.14.2__tar.gz → 0.14.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (354)
  1. {datachain-0.14.2/src/datachain.egg-info → datachain-0.14.3}/PKG-INFO +1 -1
  2. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/catalog/loader.py +4 -9
  3. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/data_storage/warehouse.py +9 -0
  4. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/query/dataset.py +39 -40
  5. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/query/dispatch.py +6 -12
  6. datachain-0.14.3/src/datachain/query/udf.py +49 -0
  7. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/utils.py +30 -4
  8. {datachain-0.14.2 → datachain-0.14.3/src/datachain.egg-info}/PKG-INFO +1 -1
  9. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_datachain.py +13 -1
  10. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_catalog_loader.py +7 -14
  11. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_utils.py +46 -10
  12. datachain-0.14.2/src/datachain/query/udf.py +0 -20
  13. {datachain-0.14.2 → datachain-0.14.3}/.cruft.json +0 -0
  14. {datachain-0.14.2 → datachain-0.14.3}/.gitattributes +0 -0
  15. {datachain-0.14.2 → datachain-0.14.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  16. {datachain-0.14.2 → datachain-0.14.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  17. {datachain-0.14.2 → datachain-0.14.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  18. {datachain-0.14.2 → datachain-0.14.3}/.github/codecov.yaml +0 -0
  19. {datachain-0.14.2 → datachain-0.14.3}/.github/dependabot.yml +0 -0
  20. {datachain-0.14.2 → datachain-0.14.3}/.github/workflows/benchmarks.yml +0 -0
  21. {datachain-0.14.2 → datachain-0.14.3}/.github/workflows/release.yml +0 -0
  22. {datachain-0.14.2 → datachain-0.14.3}/.github/workflows/tests-studio.yml +0 -0
  23. {datachain-0.14.2 → datachain-0.14.3}/.github/workflows/tests.yml +0 -0
  24. {datachain-0.14.2 → datachain-0.14.3}/.github/workflows/update-template.yaml +0 -0
  25. {datachain-0.14.2 → datachain-0.14.3}/.gitignore +0 -0
  26. {datachain-0.14.2 → datachain-0.14.3}/.pre-commit-config.yaml +0 -0
  27. {datachain-0.14.2 → datachain-0.14.3}/CODE_OF_CONDUCT.rst +0 -0
  28. {datachain-0.14.2 → datachain-0.14.3}/LICENSE +0 -0
  29. {datachain-0.14.2 → datachain-0.14.3}/README.rst +0 -0
  30. {datachain-0.14.2 → datachain-0.14.3}/docs/assets/captioned_cartoons.png +0 -0
  31. {datachain-0.14.2 → datachain-0.14.3}/docs/assets/datachain-white.svg +0 -0
  32. {datachain-0.14.2 → datachain-0.14.3}/docs/assets/datachain.svg +0 -0
  33. {datachain-0.14.2 → datachain-0.14.3}/docs/contributing.md +0 -0
  34. {datachain-0.14.2 → datachain-0.14.3}/docs/css/github-permalink-style.css +0 -0
  35. {datachain-0.14.2 → datachain-0.14.3}/docs/examples.md +0 -0
  36. {datachain-0.14.2 → datachain-0.14.3}/docs/index.md +0 -0
  37. {datachain-0.14.2 → datachain-0.14.3}/docs/overrides/main.html +0 -0
  38. {datachain-0.14.2 → datachain-0.14.3}/docs/quick-start.md +0 -0
  39. {datachain-0.14.2 → datachain-0.14.3}/docs/references/data-types/arrowrow.md +0 -0
  40. {datachain-0.14.2 → datachain-0.14.3}/docs/references/data-types/bbox.md +0 -0
  41. {datachain-0.14.2 → datachain-0.14.3}/docs/references/data-types/file.md +0 -0
  42. {datachain-0.14.2 → datachain-0.14.3}/docs/references/data-types/imagefile.md +0 -0
  43. {datachain-0.14.2 → datachain-0.14.3}/docs/references/data-types/index.md +0 -0
  44. {datachain-0.14.2 → datachain-0.14.3}/docs/references/data-types/pose.md +0 -0
  45. {datachain-0.14.2 → datachain-0.14.3}/docs/references/data-types/segment.md +0 -0
  46. {datachain-0.14.2 → datachain-0.14.3}/docs/references/data-types/tarvfile.md +0 -0
  47. {datachain-0.14.2 → datachain-0.14.3}/docs/references/data-types/textfile.md +0 -0
  48. {datachain-0.14.2 → datachain-0.14.3}/docs/references/data-types/videofile.md +0 -0
  49. {datachain-0.14.2 → datachain-0.14.3}/docs/references/datachain.md +0 -0
  50. {datachain-0.14.2 → datachain-0.14.3}/docs/references/func.md +0 -0
  51. {datachain-0.14.2 → datachain-0.14.3}/docs/references/index.md +0 -0
  52. {datachain-0.14.2 → datachain-0.14.3}/docs/references/remotes.md +0 -0
  53. {datachain-0.14.2 → datachain-0.14.3}/docs/references/toolkit.md +0 -0
  54. {datachain-0.14.2 → datachain-0.14.3}/docs/references/torch.md +0 -0
  55. {datachain-0.14.2 → datachain-0.14.3}/docs/references/udf.md +0 -0
  56. {datachain-0.14.2 → datachain-0.14.3}/docs/tutorials.md +0 -0
  57. {datachain-0.14.2 → datachain-0.14.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  58. {datachain-0.14.2 → datachain-0.14.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  59. {datachain-0.14.2 → datachain-0.14.3}/examples/computer_vision/openimage-detect.py +0 -0
  60. {datachain-0.14.2 → datachain-0.14.3}/examples/computer_vision/ultralytics-bbox.py +0 -0
  61. {datachain-0.14.2 → datachain-0.14.3}/examples/computer_vision/ultralytics-pose.py +0 -0
  62. {datachain-0.14.2 → datachain-0.14.3}/examples/computer_vision/ultralytics-segment.py +0 -0
  63. {datachain-0.14.2 → datachain-0.14.3}/examples/get_started/common_sql_functions.py +0 -0
  64. {datachain-0.14.2 → datachain-0.14.3}/examples/get_started/json-csv-reader.py +0 -0
  65. {datachain-0.14.2 → datachain-0.14.3}/examples/get_started/torch-loader.py +0 -0
  66. {datachain-0.14.2 → datachain-0.14.3}/examples/get_started/udfs/parallel.py +0 -0
  67. {datachain-0.14.2 → datachain-0.14.3}/examples/get_started/udfs/simple.py +0 -0
  68. {datachain-0.14.2 → datachain-0.14.3}/examples/get_started/udfs/stateful.py +0 -0
  69. {datachain-0.14.2 → datachain-0.14.3}/examples/llm_and_nlp/claude-query.py +0 -0
  70. {datachain-0.14.2 → datachain-0.14.3}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  71. {datachain-0.14.2 → datachain-0.14.3}/examples/multimodal/clip_inference.py +0 -0
  72. {datachain-0.14.2 → datachain-0.14.3}/examples/multimodal/hf_pipeline.py +0 -0
  73. {datachain-0.14.2 → datachain-0.14.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
  74. {datachain-0.14.2 → datachain-0.14.3}/examples/multimodal/wds.py +0 -0
  75. {datachain-0.14.2 → datachain-0.14.3}/examples/multimodal/wds_filtered.py +0 -0
  76. {datachain-0.14.2 → datachain-0.14.3}/mkdocs.yml +0 -0
  77. {datachain-0.14.2 → datachain-0.14.3}/noxfile.py +0 -0
  78. {datachain-0.14.2 → datachain-0.14.3}/pyproject.toml +0 -0
  79. {datachain-0.14.2 → datachain-0.14.3}/setup.cfg +0 -0
  80. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/__init__.py +0 -0
  81. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/__main__.py +0 -0
  82. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/asyn.py +0 -0
  83. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cache.py +0 -0
  84. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/catalog/__init__.py +0 -0
  85. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/catalog/catalog.py +0 -0
  86. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/catalog/datasource.py +0 -0
  87. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/__init__.py +0 -0
  88. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/commands/__init__.py +0 -0
  89. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/commands/datasets.py +0 -0
  90. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/commands/du.py +0 -0
  91. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/commands/index.py +0 -0
  92. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/commands/ls.py +0 -0
  93. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/commands/misc.py +0 -0
  94. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/commands/query.py +0 -0
  95. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/commands/show.py +0 -0
  96. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/parser/__init__.py +0 -0
  97. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/parser/job.py +0 -0
  98. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/parser/studio.py +0 -0
  99. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/parser/utils.py +0 -0
  100. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/cli/utils.py +0 -0
  101. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/client/__init__.py +0 -0
  102. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/client/azure.py +0 -0
  103. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/client/fileslice.py +0 -0
  104. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/client/fsspec.py +0 -0
  105. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/client/gcs.py +0 -0
  106. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/client/hf.py +0 -0
  107. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/client/local.py +0 -0
  108. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/client/s3.py +0 -0
  109. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/config.py +0 -0
  110. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/data_storage/__init__.py +0 -0
  111. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/data_storage/db_engine.py +0 -0
  112. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/data_storage/job.py +0 -0
  113. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/data_storage/metastore.py +0 -0
  114. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/data_storage/schema.py +0 -0
  115. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/data_storage/serializer.py +0 -0
  116. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/data_storage/sqlite.py +0 -0
  117. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/dataset.py +0 -0
  118. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/diff/__init__.py +0 -0
  119. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/error.py +0 -0
  120. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/fs/__init__.py +0 -0
  121. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/fs/reference.py +0 -0
  122. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/fs/utils.py +0 -0
  123. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/func/__init__.py +0 -0
  124. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/func/aggregate.py +0 -0
  125. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/func/array.py +0 -0
  126. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/func/base.py +0 -0
  127. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/func/conditional.py +0 -0
  128. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/func/func.py +0 -0
  129. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/func/numeric.py +0 -0
  130. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/func/path.py +0 -0
  131. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/func/random.py +0 -0
  132. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/func/string.py +0 -0
  133. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/func/window.py +0 -0
  134. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/job.py +0 -0
  135. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/__init__.py +0 -0
  136. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/arrow.py +0 -0
  137. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/clip.py +0 -0
  138. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/convert/__init__.py +0 -0
  139. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/convert/flatten.py +0 -0
  140. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
  141. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
  142. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/convert/unflatten.py +0 -0
  143. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  144. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/data_model.py +0 -0
  145. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dataset_info.py +0 -0
  146. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/__init__.py +0 -0
  147. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/csv.py +0 -0
  148. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/datachain.py +0 -0
  149. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/datasets.py +0 -0
  150. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/hf.py +0 -0
  151. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/json.py +0 -0
  152. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/listings.py +0 -0
  153. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/pandas.py +0 -0
  154. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/parquet.py +0 -0
  155. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/records.py +0 -0
  156. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/storage.py +0 -0
  157. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/utils.py +0 -0
  158. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/dc/values.py +0 -0
  159. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/file.py +0 -0
  160. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/hf.py +0 -0
  161. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/image.py +0 -0
  162. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/listing.py +0 -0
  163. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/listing_info.py +0 -0
  164. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/meta_formats.py +0 -0
  165. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/model_store.py +0 -0
  166. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/pytorch.py +0 -0
  167. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/settings.py +0 -0
  168. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/signal_schema.py +0 -0
  169. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/tar.py +0 -0
  170. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/text.py +0 -0
  171. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/udf.py +0 -0
  172. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/udf_signature.py +0 -0
  173. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/utils.py +0 -0
  174. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/video.py +0 -0
  175. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/webdataset.py +0 -0
  176. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/lib/webdataset_laion.py +0 -0
  177. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/listing.py +0 -0
  178. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/model/__init__.py +0 -0
  179. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/model/bbox.py +0 -0
  180. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/model/pose.py +0 -0
  181. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/model/segment.py +0 -0
  182. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/model/ultralytics/__init__.py +0 -0
  183. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/model/ultralytics/bbox.py +0 -0
  184. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/model/ultralytics/pose.py +0 -0
  185. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/model/ultralytics/segment.py +0 -0
  186. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/model/utils.py +0 -0
  187. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/node.py +0 -0
  188. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/nodes_fetcher.py +0 -0
  189. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/nodes_thread_pool.py +0 -0
  190. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/progress.py +0 -0
  191. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/py.typed +0 -0
  192. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/query/__init__.py +0 -0
  193. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/query/batch.py +0 -0
  194. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/query/metrics.py +0 -0
  195. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/query/params.py +0 -0
  196. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/query/queue.py +0 -0
  197. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/query/schema.py +0 -0
  198. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/query/session.py +0 -0
  199. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/query/utils.py +0 -0
  200. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/remote/__init__.py +0 -0
  201. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/remote/studio.py +0 -0
  202. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/script_meta.py +0 -0
  203. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/__init__.py +0 -0
  204. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/default/__init__.py +0 -0
  205. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/default/base.py +0 -0
  206. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/functions/__init__.py +0 -0
  207. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/functions/aggregate.py +0 -0
  208. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/functions/array.py +0 -0
  209. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/functions/conditional.py +0 -0
  210. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/functions/numeric.py +0 -0
  211. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/functions/path.py +0 -0
  212. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/functions/random.py +0 -0
  213. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/functions/string.py +0 -0
  214. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/selectable.py +0 -0
  215. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/sqlite/__init__.py +0 -0
  216. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/sqlite/base.py +0 -0
  217. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/sqlite/types.py +0 -0
  218. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/sqlite/vector.py +0 -0
  219. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/types.py +0 -0
  220. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/sql/utils.py +0 -0
  221. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/studio.py +0 -0
  222. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/telemetry.py +0 -0
  223. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/toolkit/__init__.py +0 -0
  224. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/toolkit/split.py +0 -0
  225. {datachain-0.14.2 → datachain-0.14.3}/src/datachain/torch/__init__.py +0 -0
  226. {datachain-0.14.2 → datachain-0.14.3}/src/datachain.egg-info/SOURCES.txt +0 -0
  227. {datachain-0.14.2 → datachain-0.14.3}/src/datachain.egg-info/dependency_links.txt +0 -0
  228. {datachain-0.14.2 → datachain-0.14.3}/src/datachain.egg-info/entry_points.txt +0 -0
  229. {datachain-0.14.2 → datachain-0.14.3}/src/datachain.egg-info/requires.txt +0 -0
  230. {datachain-0.14.2 → datachain-0.14.3}/src/datachain.egg-info/top_level.txt +0 -0
  231. {datachain-0.14.2 → datachain-0.14.3}/tests/__init__.py +0 -0
  232. {datachain-0.14.2 → datachain-0.14.3}/tests/benchmarks/__init__.py +0 -0
  233. {datachain-0.14.2 → datachain-0.14.3}/tests/benchmarks/conftest.py +0 -0
  234. {datachain-0.14.2 → datachain-0.14.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  235. {datachain-0.14.2 → datachain-0.14.3}/tests/benchmarks/datasets/.dvc/config +0 -0
  236. {datachain-0.14.2 → datachain-0.14.3}/tests/benchmarks/datasets/.gitignore +0 -0
  237. {datachain-0.14.2 → datachain-0.14.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  238. {datachain-0.14.2 → datachain-0.14.3}/tests/benchmarks/test_datachain.py +0 -0
  239. {datachain-0.14.2 → datachain-0.14.3}/tests/benchmarks/test_ls.py +0 -0
  240. {datachain-0.14.2 → datachain-0.14.3}/tests/benchmarks/test_version.py +0 -0
  241. {datachain-0.14.2 → datachain-0.14.3}/tests/conftest.py +0 -0
  242. {datachain-0.14.2 → datachain-0.14.3}/tests/data.py +0 -0
  243. {datachain-0.14.2 → datachain-0.14.3}/tests/examples/__init__.py +0 -0
  244. {datachain-0.14.2 → datachain-0.14.3}/tests/examples/test_examples.py +0 -0
  245. {datachain-0.14.2 → datachain-0.14.3}/tests/examples/test_wds_e2e.py +0 -0
  246. {datachain-0.14.2 → datachain-0.14.3}/tests/examples/wds_data.py +0 -0
  247. {datachain-0.14.2 → datachain-0.14.3}/tests/func/__init__.py +0 -0
  248. {datachain-0.14.2 → datachain-0.14.3}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  249. {datachain-0.14.2 → datachain-0.14.3}/tests/func/data/lena.jpg +0 -0
  250. {datachain-0.14.2 → datachain-0.14.3}/tests/func/fake-service-account-credentials.json +0 -0
  251. {datachain-0.14.2 → datachain-0.14.3}/tests/func/model/__init__.py +0 -0
  252. {datachain-0.14.2 → datachain-0.14.3}/tests/func/model/data/running-mask0.png +0 -0
  253. {datachain-0.14.2 → datachain-0.14.3}/tests/func/model/data/running-mask1.png +0 -0
  254. {datachain-0.14.2 → datachain-0.14.3}/tests/func/model/data/running.jpg +0 -0
  255. {datachain-0.14.2 → datachain-0.14.3}/tests/func/model/data/ships.jpg +0 -0
  256. {datachain-0.14.2 → datachain-0.14.3}/tests/func/model/test_yolo.py +0 -0
  257. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_catalog.py +0 -0
  258. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_client.py +0 -0
  259. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_cloud_transfer.py +0 -0
  260. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_data_storage.py +0 -0
  261. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_datachain.py +0 -0
  262. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_datachain_merge.py +0 -0
  263. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_dataset_query.py +0 -0
  264. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_datasets.py +0 -0
  265. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_feature_pickling.py +0 -0
  266. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_file.py +0 -0
  267. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_hf.py +0 -0
  268. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_hidden_field.py +0 -0
  269. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_image.py +0 -0
  270. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_listing.py +0 -0
  271. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_ls.py +0 -0
  272. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_meta_formats.py +0 -0
  273. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_metrics.py +0 -0
  274. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_pull.py +0 -0
  275. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_pytorch.py +0 -0
  276. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_query.py +0 -0
  277. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_session.py +0 -0
  278. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_toolkit.py +0 -0
  279. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_video.py +0 -0
  280. {datachain-0.14.2 → datachain-0.14.3}/tests/func/test_warehouse.py +0 -0
  281. {datachain-0.14.2 → datachain-0.14.3}/tests/scripts/feature_class.py +0 -0
  282. {datachain-0.14.2 → datachain-0.14.3}/tests/scripts/feature_class_exception.py +0 -0
  283. {datachain-0.14.2 → datachain-0.14.3}/tests/scripts/feature_class_parallel.py +0 -0
  284. {datachain-0.14.2 → datachain-0.14.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  285. {datachain-0.14.2 → datachain-0.14.3}/tests/scripts/name_len_slow.py +0 -0
  286. {datachain-0.14.2 → datachain-0.14.3}/tests/test_atomicity.py +0 -0
  287. {datachain-0.14.2 → datachain-0.14.3}/tests/test_cli_e2e.py +0 -0
  288. {datachain-0.14.2 → datachain-0.14.3}/tests/test_cli_studio.py +0 -0
  289. {datachain-0.14.2 → datachain-0.14.3}/tests/test_import_time.py +0 -0
  290. {datachain-0.14.2 → datachain-0.14.3}/tests/test_query_e2e.py +0 -0
  291. {datachain-0.14.2 → datachain-0.14.3}/tests/test_telemetry.py +0 -0
  292. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/__init__.py +0 -0
  293. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/__init__.py +0 -0
  294. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/conftest.py +0 -0
  295. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_arrow.py +0 -0
  296. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_clip.py +0 -0
  297. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  298. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_datachain_merge.py +0 -0
  299. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_diff.py +0 -0
  300. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_feature.py +0 -0
  301. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_feature_utils.py +0 -0
  302. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_file.py +0 -0
  303. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_hf.py +0 -0
  304. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_image.py +0 -0
  305. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_listing_info.py +0 -0
  306. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_python_to_sql.py +0 -0
  307. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_schema.py +0 -0
  308. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_signal_schema.py +0 -0
  309. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_sql_to_python.py +0 -0
  310. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_text.py +0 -0
  311. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_udf_signature.py +0 -0
  312. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_utils.py +0 -0
  313. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/lib/test_webdataset.py +0 -0
  314. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/model/__init__.py +0 -0
  315. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/model/test_bbox.py +0 -0
  316. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/model/test_pose.py +0 -0
  317. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/model/test_segment.py +0 -0
  318. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/model/test_utils.py +0 -0
  319. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/sql/__init__.py +0 -0
  320. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/sql/sqlite/__init__.py +0 -0
  321. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/sql/sqlite/test_types.py +0 -0
  322. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
  323. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/sql/test_array.py +0 -0
  324. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/sql/test_conditional.py +0 -0
  325. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/sql/test_path.py +0 -0
  326. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/sql/test_random.py +0 -0
  327. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/sql/test_selectable.py +0 -0
  328. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/sql/test_string.py +0 -0
  329. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_asyn.py +0 -0
  330. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_cache.py +0 -0
  331. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_catalog.py +0 -0
  332. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_cli_parsing.py +0 -0
  333. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_client.py +0 -0
  334. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_client_gcs.py +0 -0
  335. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_client_s3.py +0 -0
  336. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_config.py +0 -0
  337. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_data_storage.py +0 -0
  338. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_database_engine.py +0 -0
  339. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_dataset.py +0 -0
  340. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_dispatch.py +0 -0
  341. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_fileslice.py +0 -0
  342. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_func.py +0 -0
  343. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_listing.py +0 -0
  344. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_metastore.py +0 -0
  345. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_module_exports.py +0 -0
  346. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_pytorch.py +0 -0
  347. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_query.py +0 -0
  348. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_query_metrics.py +0 -0
  349. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_query_params.py +0 -0
  350. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_script_meta.py +0 -0
  351. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_serializer.py +0 -0
  352. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_session.py +0 -0
  353. {datachain-0.14.2 → datachain-0.14.3}/tests/unit/test_warehouse.py +0 -0
  354. {datachain-0.14.2 → datachain-0.14.3}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.2
3
+ Version: 0.14.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -7,6 +7,7 @@ from datachain.utils import get_envs_by_prefix
7
7
  if TYPE_CHECKING:
8
8
  from datachain.catalog import Catalog
9
9
  from datachain.data_storage import AbstractMetastore, AbstractWarehouse
10
+ from datachain.query.udf import AbstractUDFDistributor
10
11
 
11
12
  METASTORE_SERIALIZED = "DATACHAIN__METASTORE"
12
13
  METASTORE_IMPORT_PATH = "DATACHAIN_METASTORE"
@@ -15,7 +16,6 @@ WAREHOUSE_SERIALIZED = "DATACHAIN__WAREHOUSE"
15
16
  WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
16
17
  WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
17
18
  DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
18
- DISTRIBUTED_ARG_PREFIX = "DATACHAIN_DISTRIBUTED_ARG_"
19
19
 
20
20
  IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
21
21
 
@@ -100,27 +100,22 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
100
100
  return warehouse_class(**warehouse_args)
101
101
 
102
102
 
103
- def get_distributed_class(**kwargs):
103
+ def get_udf_distributor_class() -> type["AbstractUDFDistributor"]:
104
104
  distributed_import_path = os.environ.get(DISTRIBUTED_IMPORT_PATH)
105
- distributed_arg_envs = get_envs_by_prefix(DISTRIBUTED_ARG_PREFIX)
106
- # Convert env variable names to keyword argument names by lowercasing them
107
- distributed_args = {k.lower(): v for k, v in distributed_arg_envs.items()}
108
105
 
109
106
  if not distributed_import_path:
110
107
  raise RuntimeError(
111
108
  f"{DISTRIBUTED_IMPORT_PATH} import path is required "
112
109
  "for distributed UDF processing."
113
110
  )
114
- # Distributed class paths are specified as (for example):
115
- # module.classname
111
+ # Distributed class paths are specified as (for example): module.classname
116
112
  if "." not in distributed_import_path:
117
113
  raise RuntimeError(
118
114
  f"Invalid {DISTRIBUTED_IMPORT_PATH} import path: {distributed_import_path}"
119
115
  )
120
116
  module_name, _, class_name = distributed_import_path.rpartition(".")
121
117
  distributed = import_module(module_name)
122
- distributed_class = getattr(distributed, class_name)
123
- return distributed_class(**distributed_args | kwargs)
118
+ return getattr(distributed, class_name)
124
119
 
125
120
 
126
121
  def get_catalog(
@@ -199,6 +199,15 @@ class AbstractWarehouse(ABC, Serializable):
199
199
  # Query Execution
200
200
  #
201
201
 
202
+ def query_count(self, query: sa.sql.selectable.Select) -> int:
203
+ """Count the number of rows in a query."""
204
+ count_query = sa.select(func.count(1)).select_from(query.subquery())
205
+ return next(self.db.execute(count_query))[0]
206
+
207
+ def table_rows_count(self, table) -> int:
208
+ count_query = sa.select(func.count(1)).select_from(table)
209
+ return next(self.db.execute(count_query))[0]
210
+
202
211
  def dataset_select_paginated(
203
212
  self,
204
213
  query,
@@ -55,10 +55,12 @@ from datachain.lib.udf import UDFAdapter, _get_cache
55
55
  from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
56
56
  from datachain.query.schema import C, UDFParamSpec, normalize_param
57
57
  from datachain.query.session import Session
58
+ from datachain.query.udf import UdfInfo
58
59
  from datachain.sql.functions.random import rand
59
60
  from datachain.utils import (
60
61
  batched,
61
62
  determine_processes,
63
+ determine_workers,
62
64
  filtered_cloudpickle_dumps,
63
65
  get_datachain_executable,
64
66
  safe_closing,
@@ -74,7 +76,6 @@ if TYPE_CHECKING:
74
76
  from datachain.data_storage import AbstractWarehouse
75
77
  from datachain.dataset import DatasetRecord
76
78
  from datachain.lib.udf import UDFAdapter, UDFResult
77
- from datachain.query.udf import UdfInfo
78
79
 
79
80
  P = ParamSpec("P")
80
81
 
@@ -414,20 +415,15 @@ class UDFStep(Step, ABC):
414
415
  def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
415
416
  from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
416
417
 
417
- use_partitioning = self.partition_by is not None
418
- batching = self.udf.get_batching(use_partitioning)
419
- workers = self.workers
420
- if (
421
- not workers
422
- and os.environ.get("DATACHAIN_DISTRIBUTED")
423
- and os.environ.get("DATACHAIN_SETTINGS_WORKERS")
424
- ):
425
- # Enable distributed processing by default if the module is available,
426
- # and a default number of workers is provided.
427
- workers = True
418
+ rows_total = self.catalog.warehouse.query_count(query)
419
+ if rows_total == 0:
420
+ return
428
421
 
429
- processes = determine_processes(self.parallel)
422
+ workers = determine_workers(self.workers, rows_total=rows_total)
423
+ processes = determine_processes(self.parallel, rows_total=rows_total)
430
424
 
425
+ use_partitioning = self.partition_by is not None
426
+ batching = self.udf.get_batching(use_partitioning)
431
427
  udf_fields = [str(c.name) for c in query.selected_columns]
432
428
 
433
429
  prefetch = self.udf.prefetch
@@ -441,23 +437,24 @@ class UDFStep(Step, ABC):
441
437
  "distributed processing."
442
438
  )
443
439
 
444
- from datachain.catalog.loader import get_distributed_class
445
-
446
- distributor = get_distributed_class(
447
- min_task_size=self.min_task_size
448
- )
449
- distributor(
450
- self.udf,
451
- catalog,
452
- udf_table,
453
- query,
454
- workers,
455
- processes,
440
+ from datachain.catalog.loader import get_udf_distributor_class
441
+
442
+ udf_distributor_class = get_udf_distributor_class()
443
+ udf_distributor = udf_distributor_class(
444
+ catalog=catalog,
445
+ table=udf_table,
446
+ query=query,
447
+ udf_data=filtered_cloudpickle_dumps(self.udf),
448
+ batching=batching,
449
+ workers=workers,
450
+ processes=processes,
456
451
  udf_fields=udf_fields,
452
+ rows_total=rows_total,
453
+ use_cache=self.cache,
457
454
  is_generator=self.is_generator,
458
- use_partitioning=use_partitioning,
459
- cache=self.cache,
455
+ min_task_size=self.min_task_size,
460
456
  )
457
+ udf_distributor()
461
458
  elif processes:
462
459
  # Parallel processing (faster for more CPU-heavy UDFs)
463
460
  if catalog.in_memory:
@@ -465,19 +462,21 @@ class UDFStep(Step, ABC):
465
462
  "In-memory databases cannot be used "
466
463
  "with parallel processing."
467
464
  )
468
- udf_info: UdfInfo = {
469
- "udf_data": filtered_cloudpickle_dumps(self.udf),
470
- "catalog_init": catalog.get_init_params(),
471
- "metastore_clone_params": catalog.metastore.clone_params(),
472
- "warehouse_clone_params": catalog.warehouse.clone_params(),
473
- "table": udf_table,
474
- "query": query,
475
- "udf_fields": udf_fields,
476
- "batching": batching,
477
- "processes": processes,
478
- "is_generator": self.is_generator,
479
- "cache": self.cache,
480
- }
465
+
466
+ udf_info = UdfInfo(
467
+ udf_data=filtered_cloudpickle_dumps(self.udf),
468
+ catalog_init=catalog.get_init_params(),
469
+ metastore_clone_params=catalog.metastore.clone_params(),
470
+ warehouse_clone_params=catalog.warehouse.clone_params(),
471
+ table=udf_table,
472
+ query=query,
473
+ udf_fields=udf_fields,
474
+ batching=batching,
475
+ processes=processes,
476
+ is_generator=self.is_generator,
477
+ cache=self.cache,
478
+ rows_total=rows_total,
479
+ )
481
480
 
482
481
  # Run the UDFDispatcher in another process to avoid needing
483
482
  # if __name__ == '__main__': in user scripts
@@ -11,11 +11,10 @@ import multiprocess
11
11
  from cloudpickle import load, loads
12
12
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
13
13
  from multiprocess import get_context
14
- from sqlalchemy.sql import func
15
14
 
16
15
  from datachain.catalog import Catalog
17
16
  from datachain.catalog.catalog import clone_catalog_with_cache
18
- from datachain.catalog.loader import get_distributed_class
17
+ from datachain.catalog.loader import get_udf_distributor_class
19
18
  from datachain.lib.udf import _get_cache
20
19
  from datachain.query.batch import RowsOutput, RowsOutputBatch
21
20
  from datachain.query.dataset import (
@@ -59,6 +58,7 @@ def udf_entrypoint() -> int:
59
58
  dispatch = UDFDispatcher(udf_info)
60
59
 
61
60
  query = udf_info["query"]
61
+ rows_total = udf_info["rows_total"]
62
62
  batching = udf_info["batching"]
63
63
  n_workers = udf_info["processes"]
64
64
  if n_workers is True:
@@ -67,12 +67,6 @@ def udf_entrypoint() -> int:
67
67
  wh_cls, wh_args, wh_kwargs = udf_info["warehouse_clone_params"]
68
68
  warehouse: AbstractWarehouse = wh_cls(*wh_args, **wh_kwargs)
69
69
 
70
- total_rows = next(
71
- warehouse.db.execute(
72
- query.with_only_columns(func.count(query.c.sys__id)).order_by(None)
73
- )
74
- )[0]
75
-
76
70
  with contextlib.closing(
77
71
  batching(warehouse.dataset_select_paginated, query, ids_only=True)
78
72
  ) as udf_inputs:
@@ -81,7 +75,7 @@ def udf_entrypoint() -> int:
81
75
  try:
82
76
  dispatch.run_udf_parallel(
83
77
  udf_inputs,
84
- total_rows=total_rows,
78
+ rows_total=rows_total,
85
79
  n_workers=n_workers,
86
80
  processed_cb=processed_cb,
87
81
  download_cb=download_cb,
@@ -94,7 +88,7 @@ def udf_entrypoint() -> int:
94
88
 
95
89
 
96
90
  def udf_worker_entrypoint() -> int:
97
- return get_distributed_class().run_worker()
91
+ return get_udf_distributor_class().run_worker()
98
92
 
99
93
 
100
94
  class UDFDispatcher:
@@ -164,14 +158,14 @@ class UDFDispatcher:
164
158
  def run_udf_parallel( # noqa: C901, PLR0912
165
159
  self,
166
160
  input_rows: Iterable[RowsOutput],
167
- total_rows: int,
161
+ rows_total: int,
168
162
  n_workers: Optional[int] = None,
169
163
  processed_cb: Callback = DEFAULT_CALLBACK,
170
164
  download_cb: Callback = DEFAULT_CALLBACK,
171
165
  ) -> None:
172
166
  n_workers = get_n_workers_from_arg(n_workers)
173
167
 
174
- input_batch_size = total_rows // n_workers
168
+ input_batch_size = rows_total // n_workers
175
169
  if input_batch_size == 0:
176
170
  input_batch_size = 1
177
171
  elif input_batch_size > DEFAULT_BATCH_SIZE:
@@ -0,0 +1,49 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import TYPE_CHECKING, Any, Callable, Optional, TypedDict, Union
3
+
4
+ if TYPE_CHECKING:
5
+ from sqlalchemy import Select, Table
6
+
7
+ from datachain.catalog import Catalog
8
+ from datachain.query.batch import BatchingStrategy
9
+
10
+
11
+ class UdfInfo(TypedDict):
12
+ udf_data: bytes
13
+ catalog_init: dict[str, Any]
14
+ metastore_clone_params: tuple[Callable[..., Any], list[Any], dict[str, Any]]
15
+ warehouse_clone_params: tuple[Callable[..., Any], list[Any], dict[str, Any]]
16
+ table: "Table"
17
+ query: "Select"
18
+ udf_fields: list[str]
19
+ batching: "BatchingStrategy"
20
+ processes: Optional[int]
21
+ is_generator: bool
22
+ cache: bool
23
+ rows_total: int
24
+
25
+
26
+ class AbstractUDFDistributor(ABC):
27
+ @abstractmethod
28
+ def __init__(
29
+ self,
30
+ catalog: "Catalog",
31
+ table: "Table",
32
+ query: "Select",
33
+ udf_data: bytes,
34
+ batching: "BatchingStrategy",
35
+ workers: Union[bool, int],
36
+ processes: Union[bool, int],
37
+ udf_fields: list[str],
38
+ rows_total: int,
39
+ use_cache: bool,
40
+ is_generator: bool = False,
41
+ min_task_size: Optional[Union[str, int]] = None,
42
+ ) -> None: ...
43
+
44
+ @abstractmethod
45
+ def __call__(self) -> None: ...
46
+
47
+ @staticmethod
48
+ @abstractmethod
49
+ def run_worker() -> int: ...
@@ -286,15 +286,41 @@ def retry_with_backoff(retries=5, backoff_sec=1, errors=(Exception,)):
286
286
  return retry
287
287
 
288
288
 
289
- def determine_processes(parallel: Optional[Union[bool, int]]) -> Union[bool, int]:
289
+ def determine_workers(
290
+ workers: Union[bool, int],
291
+ rows_total: Optional[int] = None,
292
+ ) -> Union[bool, int]:
293
+ """Determine the number of workers to use for distributed processing."""
294
+ if rows_total is not None and rows_total <= 1:
295
+ # Disable distributed processing if there is no rows or only one row.
296
+ return False
297
+ if (
298
+ workers is False
299
+ and os.environ.get("DATACHAIN_DISTRIBUTED")
300
+ and os.environ.get("DATACHAIN_SETTINGS_WORKERS")
301
+ ):
302
+ # Enable distributed processing by default if the module is available,
303
+ # and a default number of workers is provided.
304
+ workers = int(os.environ["DATACHAIN_SETTINGS_WORKERS"])
305
+ if not workers or workers <= 0:
306
+ return False
307
+ return workers
308
+
309
+
310
+ def determine_processes(
311
+ parallel: Optional[Union[bool, int]] = None,
312
+ rows_total: Optional[int] = None,
313
+ ) -> Union[bool, int]:
314
+ """Determine the number of processes to use for parallel processing."""
315
+ if rows_total is not None and rows_total <= 1:
316
+ # Disable parallel processing if there is no rows or only one row.
317
+ return False
290
318
  if parallel is None and os.environ.get("DATACHAIN_SETTINGS_PARALLEL") is not None:
291
319
  parallel = int(os.environ["DATACHAIN_SETTINGS_PARALLEL"])
292
- if parallel is None or parallel is False:
320
+ if parallel is None or parallel is False or parallel == 0:
293
321
  return False
294
322
  if parallel is True:
295
323
  return True
296
- if parallel == 0:
297
- return False
298
324
  if parallel < 0:
299
325
  return True
300
326
  return parallel
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.2
3
+ Version: 0.14.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ import math
4
4
  import os
5
5
  import re
6
6
  from collections.abc import Generator, Iterator
7
- from unittest.mock import ANY
7
+ from unittest.mock import ANY, patch
8
8
 
9
9
  import numpy as np
10
10
  import pandas as pd
@@ -26,6 +26,7 @@ from datachain.lib.signal_schema import (
26
26
  SignalResolvingTypeError,
27
27
  SignalSchema,
28
28
  )
29
+ from datachain.lib.udf import UDFAdapter
29
30
  from datachain.lib.udf_signature import UdfSignatureError
30
31
  from datachain.lib.utils import DataChainColumnError, DataChainParamsError
31
32
  from datachain.sql.types import Float, Int64, String
@@ -270,6 +271,17 @@ def test_read_record_empty_chain_without_schema(test_session):
270
271
  )
271
272
 
272
273
 
274
+ def test_empty_chain_skip_udf_run(test_session):
275
+ # Test that UDF is not called for empty chain
276
+ with patch.object(UDFAdapter, "run") as mock_udf_run:
277
+ (
278
+ dc.read_records([], schema={"val": int}, session=test_session)
279
+ .map(lambda val: val * 2, params="val", output={"val2": int})
280
+ .exec()
281
+ )
282
+ mock_udf_run.assert_not_called()
283
+
284
+
273
285
  def test_datasets(test_session):
274
286
  ds = dc.datasets(session=test_session)
275
287
  datasets = [d for d in ds.collect("dataset") if d.name == "fibonacci"]
@@ -5,8 +5,8 @@ import pytest
5
5
 
6
6
  from datachain.catalog.loader import (
7
7
  get_catalog,
8
- get_distributed_class,
9
8
  get_metastore,
9
+ get_udf_distributor_class,
10
10
  get_warehouse,
11
11
  )
12
12
  from datachain.data_storage.sqlite import (
@@ -44,7 +44,7 @@ def test_get_metastore(sqlite_db):
44
44
  def test_get_metastore_in_memory():
45
45
  if os.environ.get("DATACHAIN_METASTORE"):
46
46
  with pytest.raises(RuntimeError):
47
- metastore = get_metastore(in_memory=True)
47
+ get_metastore(in_memory=True)
48
48
  else:
49
49
  metastore = get_metastore(in_memory=True)
50
50
  assert isinstance(metastore, SQLiteMetastore)
@@ -71,7 +71,7 @@ def test_get_warehouse(sqlite_db):
71
71
  def test_get_warehouse_in_memory():
72
72
  if os.environ.get("DATACHAIN_WAREHOUSE"):
73
73
  with pytest.raises(RuntimeError):
74
- warehouse = get_warehouse(in_memory=True)
74
+ get_warehouse(in_memory=True)
75
75
  else:
76
76
  warehouse = get_warehouse(in_memory=True)
77
77
  assert isinstance(warehouse, SQLiteWarehouse)
@@ -80,38 +80,31 @@ def test_get_warehouse_in_memory():
80
80
 
81
81
 
82
82
  def test_get_distributed_class():
83
- distributed_args = {"foo": "bar", "baz": "37", "empty": ""}
84
83
  env = {
85
84
  "DATACHAIN_DISTRIBUTED": "tests.unit.test_catalog_loader.DistributedClass",
86
- "DATACHAIN_DISTRIBUTED_ARG_FOO": "bar",
87
- "DATACHAIN_DISTRIBUTED_ARG_BAZ": "37",
88
- "DATACHAIN_DISTRIBUTED_ARG_EMPTY": "",
89
85
  }
90
86
 
91
87
  with patch.dict(os.environ, env):
92
- distributed = get_distributed_class()
93
- assert distributed
94
- assert isinstance(distributed, DistributedClass)
95
- assert distributed.kwargs == distributed_args
88
+ assert get_udf_distributor_class() == DistributedClass
96
89
 
97
90
  with patch.dict(os.environ, {"DATACHAIN_DISTRIBUTED": ""}):
98
91
  with pytest.raises(
99
92
  RuntimeError, match="DATACHAIN_DISTRIBUTED import path is required"
100
93
  ):
101
- get_distributed_class()
94
+ get_udf_distributor_class()
102
95
 
103
96
  with patch.dict(
104
97
  os.environ,
105
98
  {"DATACHAIN_DISTRIBUTED": "tests.unit.test_catalog_loader.NonExistent"},
106
99
  ):
107
100
  with pytest.raises(AttributeError, match="has no attribute 'NonExistent'"):
108
- get_distributed_class()
101
+ get_udf_distributor_class()
109
102
 
110
103
  with patch.dict(os.environ, {"DATACHAIN_DISTRIBUTED": "DistributionClass"}):
111
104
  with pytest.raises(
112
105
  RuntimeError, match="Invalid DATACHAIN_DISTRIBUTED import path"
113
106
  ):
114
- get_distributed_class()
107
+ get_udf_distributor_class()
115
108
 
116
109
 
117
110
  def test_get_catalog(sqlite_db):
@@ -5,6 +5,7 @@ import pytest
5
5
  from datachain.utils import (
6
6
  datachain_paths_join,
7
7
  determine_processes,
8
+ determine_workers,
8
9
  nested_dict_path_set,
9
10
  retry_with_backoff,
10
11
  row_to_nested_dict,
@@ -141,21 +142,56 @@ def test_retry_with_backoff():
141
142
 
142
143
 
143
144
  @pytest.mark.parametrize(
144
- "parallel,settings,expected",
145
+ "workers,rows_total,settings,expected",
145
146
  (
146
- (None, None, False),
147
- (None, "-1", True),
148
- (None, "0", False),
149
- (None, "5", 5),
150
- (-1, "5", True),
151
- (0, "5", False),
152
- (10, "5", 10),
147
+ (False, None, None, False),
148
+ (False, None, "-1", False),
149
+ (False, None, "0", False),
150
+ (False, None, "5", 5),
151
+ (-1, None, "5", False),
152
+ (0, None, "5", False),
153
+ (10, None, "5", 10),
154
+ (True, None, None, True),
155
+ (True, None, "5", True),
156
+ (10, 0, None, False),
157
+ (10, 1, None, False),
158
+ (10, 2, None, 10),
159
+ (True, 0, None, False),
160
+ (True, 1, None, False),
161
+ (True, 2, None, True),
153
162
  ),
154
163
  )
155
- def test_determine_processes(parallel, settings, expected):
164
+ def test_determine_workers(workers, rows_total, settings, expected):
165
+ if settings is not None:
166
+ os.environ["DATACHAIN_DISTRIBUTED"] = "some_defined_value"
167
+ os.environ["DATACHAIN_SETTINGS_WORKERS"] = settings
168
+ assert determine_workers(workers, rows_total=rows_total) == expected
169
+
170
+
171
+ @pytest.mark.parametrize(
172
+ "parallel,rows_total,settings,expected",
173
+ (
174
+ (None, None, None, False),
175
+ (None, None, "-1", True),
176
+ (None, None, "0", False),
177
+ (None, None, "5", 5),
178
+ (-1, None, "5", True),
179
+ (0, None, "5", False),
180
+ (10, None, "5", 10),
181
+ (True, None, None, True),
182
+ (True, None, "5", True),
183
+ (10, 0, None, False),
184
+ (10, 1, None, False),
185
+ (10, 2, None, 10),
186
+ (True, 0, None, False),
187
+ (True, 1, None, False),
188
+ (True, 2, None, True),
189
+ ),
190
+ )
191
+ def test_determine_processes(parallel, rows_total, settings, expected):
156
192
  if settings is not None:
157
193
  os.environ["DATACHAIN_SETTINGS_PARALLEL"] = settings
158
- assert determine_processes(parallel) == expected
194
+ assert determine_processes(parallel, rows_total=rows_total) == expected
159
195
 
160
196
 
161
197
  @pytest.mark.parametrize(
@@ -1,20 +0,0 @@
1
- from typing import TYPE_CHECKING, Any, Callable, Optional, TypedDict
2
-
3
- if TYPE_CHECKING:
4
- from sqlalchemy import Select, Table
5
-
6
- from datachain.query.batch import BatchingStrategy
7
-
8
-
9
- class UdfInfo(TypedDict):
10
- udf_data: bytes
11
- catalog_init: dict[str, Any]
12
- metastore_clone_params: tuple[Callable[..., Any], list[Any], dict[str, Any]]
13
- warehouse_clone_params: tuple[Callable[..., Any], list[Any], dict[str, Any]]
14
- table: "Table"
15
- query: "Select"
16
- udf_fields: list[str]
17
- batching: "BatchingStrategy"
18
- processes: Optional[int]
19
- is_generator: bool
20
- cache: bool
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes