datachain 0.14.2__tar.gz → 0.14.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (354)
  1. {datachain-0.14.2/src/datachain.egg-info → datachain-0.14.4}/PKG-INFO +1 -1
  2. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/catalog/loader.py +4 -9
  3. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/warehouse.py +9 -0
  4. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dataset_info.py +5 -0
  5. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/datasets.py +2 -0
  6. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/udf.py +3 -3
  7. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/dataset.py +39 -40
  8. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/dispatch.py +62 -58
  9. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/session.py +4 -0
  10. datachain-0.14.4/src/datachain/query/udf.py +49 -0
  11. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/utils.py +30 -4
  12. {datachain-0.14.2 → datachain-0.14.4/src/datachain.egg-info}/PKG-INFO +1 -1
  13. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_datachain.py +11 -0
  14. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_datachain.py +13 -1
  15. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_catalog_loader.py +7 -14
  16. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_session.py +12 -0
  17. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_utils.py +46 -10
  18. datachain-0.14.2/src/datachain/query/udf.py +0 -20
  19. {datachain-0.14.2 → datachain-0.14.4}/.cruft.json +0 -0
  20. {datachain-0.14.2 → datachain-0.14.4}/.gitattributes +0 -0
  21. {datachain-0.14.2 → datachain-0.14.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  22. {datachain-0.14.2 → datachain-0.14.4}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  23. {datachain-0.14.2 → datachain-0.14.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  24. {datachain-0.14.2 → datachain-0.14.4}/.github/codecov.yaml +0 -0
  25. {datachain-0.14.2 → datachain-0.14.4}/.github/dependabot.yml +0 -0
  26. {datachain-0.14.2 → datachain-0.14.4}/.github/workflows/benchmarks.yml +0 -0
  27. {datachain-0.14.2 → datachain-0.14.4}/.github/workflows/release.yml +0 -0
  28. {datachain-0.14.2 → datachain-0.14.4}/.github/workflows/tests-studio.yml +0 -0
  29. {datachain-0.14.2 → datachain-0.14.4}/.github/workflows/tests.yml +0 -0
  30. {datachain-0.14.2 → datachain-0.14.4}/.github/workflows/update-template.yaml +0 -0
  31. {datachain-0.14.2 → datachain-0.14.4}/.gitignore +0 -0
  32. {datachain-0.14.2 → datachain-0.14.4}/.pre-commit-config.yaml +0 -0
  33. {datachain-0.14.2 → datachain-0.14.4}/CODE_OF_CONDUCT.rst +0 -0
  34. {datachain-0.14.2 → datachain-0.14.4}/LICENSE +0 -0
  35. {datachain-0.14.2 → datachain-0.14.4}/README.rst +0 -0
  36. {datachain-0.14.2 → datachain-0.14.4}/docs/assets/captioned_cartoons.png +0 -0
  37. {datachain-0.14.2 → datachain-0.14.4}/docs/assets/datachain-white.svg +0 -0
  38. {datachain-0.14.2 → datachain-0.14.4}/docs/assets/datachain.svg +0 -0
  39. {datachain-0.14.2 → datachain-0.14.4}/docs/contributing.md +0 -0
  40. {datachain-0.14.2 → datachain-0.14.4}/docs/css/github-permalink-style.css +0 -0
  41. {datachain-0.14.2 → datachain-0.14.4}/docs/examples.md +0 -0
  42. {datachain-0.14.2 → datachain-0.14.4}/docs/index.md +0 -0
  43. {datachain-0.14.2 → datachain-0.14.4}/docs/overrides/main.html +0 -0
  44. {datachain-0.14.2 → datachain-0.14.4}/docs/quick-start.md +0 -0
  45. {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/arrowrow.md +0 -0
  46. {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/bbox.md +0 -0
  47. {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/file.md +0 -0
  48. {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/imagefile.md +0 -0
  49. {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/index.md +0 -0
  50. {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/pose.md +0 -0
  51. {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/segment.md +0 -0
  52. {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/tarvfile.md +0 -0
  53. {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/textfile.md +0 -0
  54. {datachain-0.14.2 → datachain-0.14.4}/docs/references/data-types/videofile.md +0 -0
  55. {datachain-0.14.2 → datachain-0.14.4}/docs/references/datachain.md +0 -0
  56. {datachain-0.14.2 → datachain-0.14.4}/docs/references/func.md +0 -0
  57. {datachain-0.14.2 → datachain-0.14.4}/docs/references/index.md +0 -0
  58. {datachain-0.14.2 → datachain-0.14.4}/docs/references/remotes.md +0 -0
  59. {datachain-0.14.2 → datachain-0.14.4}/docs/references/toolkit.md +0 -0
  60. {datachain-0.14.2 → datachain-0.14.4}/docs/references/torch.md +0 -0
  61. {datachain-0.14.2 → datachain-0.14.4}/docs/references/udf.md +0 -0
  62. {datachain-0.14.2 → datachain-0.14.4}/docs/tutorials.md +0 -0
  63. {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  64. {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  65. {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/openimage-detect.py +0 -0
  66. {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/ultralytics-bbox.py +0 -0
  67. {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/ultralytics-pose.py +0 -0
  68. {datachain-0.14.2 → datachain-0.14.4}/examples/computer_vision/ultralytics-segment.py +0 -0
  69. {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/common_sql_functions.py +0 -0
  70. {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/json-csv-reader.py +0 -0
  71. {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/torch-loader.py +0 -0
  72. {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/udfs/parallel.py +0 -0
  73. {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/udfs/simple.py +0 -0
  74. {datachain-0.14.2 → datachain-0.14.4}/examples/get_started/udfs/stateful.py +0 -0
  75. {datachain-0.14.2 → datachain-0.14.4}/examples/llm_and_nlp/claude-query.py +0 -0
  76. {datachain-0.14.2 → datachain-0.14.4}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  77. {datachain-0.14.2 → datachain-0.14.4}/examples/multimodal/clip_inference.py +0 -0
  78. {datachain-0.14.2 → datachain-0.14.4}/examples/multimodal/hf_pipeline.py +0 -0
  79. {datachain-0.14.2 → datachain-0.14.4}/examples/multimodal/openai_image_desc_lib.py +0 -0
  80. {datachain-0.14.2 → datachain-0.14.4}/examples/multimodal/wds.py +0 -0
  81. {datachain-0.14.2 → datachain-0.14.4}/examples/multimodal/wds_filtered.py +0 -0
  82. {datachain-0.14.2 → datachain-0.14.4}/mkdocs.yml +0 -0
  83. {datachain-0.14.2 → datachain-0.14.4}/noxfile.py +0 -0
  84. {datachain-0.14.2 → datachain-0.14.4}/pyproject.toml +0 -0
  85. {datachain-0.14.2 → datachain-0.14.4}/setup.cfg +0 -0
  86. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/__init__.py +0 -0
  87. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/__main__.py +0 -0
  88. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/asyn.py +0 -0
  89. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cache.py +0 -0
  90. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/catalog/__init__.py +0 -0
  91. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/catalog/catalog.py +0 -0
  92. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/catalog/datasource.py +0 -0
  93. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/__init__.py +0 -0
  94. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/__init__.py +0 -0
  95. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/datasets.py +0 -0
  96. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/du.py +0 -0
  97. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/index.py +0 -0
  98. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/ls.py +0 -0
  99. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/misc.py +0 -0
  100. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/query.py +0 -0
  101. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/commands/show.py +0 -0
  102. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/parser/__init__.py +0 -0
  103. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/parser/job.py +0 -0
  104. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/parser/studio.py +0 -0
  105. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/parser/utils.py +0 -0
  106. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/cli/utils.py +0 -0
  107. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/__init__.py +0 -0
  108. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/azure.py +0 -0
  109. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/fileslice.py +0 -0
  110. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/fsspec.py +0 -0
  111. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/gcs.py +0 -0
  112. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/hf.py +0 -0
  113. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/local.py +0 -0
  114. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/client/s3.py +0 -0
  115. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/config.py +0 -0
  116. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/__init__.py +0 -0
  117. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/db_engine.py +0 -0
  118. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/job.py +0 -0
  119. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/metastore.py +0 -0
  120. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/schema.py +0 -0
  121. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/serializer.py +0 -0
  122. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/data_storage/sqlite.py +0 -0
  123. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/dataset.py +0 -0
  124. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/diff/__init__.py +0 -0
  125. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/error.py +0 -0
  126. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/fs/__init__.py +0 -0
  127. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/fs/reference.py +0 -0
  128. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/fs/utils.py +0 -0
  129. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/__init__.py +0 -0
  130. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/aggregate.py +0 -0
  131. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/array.py +0 -0
  132. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/base.py +0 -0
  133. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/conditional.py +0 -0
  134. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/func.py +0 -0
  135. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/numeric.py +0 -0
  136. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/path.py +0 -0
  137. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/random.py +0 -0
  138. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/string.py +0 -0
  139. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/func/window.py +0 -0
  140. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/job.py +0 -0
  141. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/__init__.py +0 -0
  142. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/arrow.py +0 -0
  143. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/clip.py +0 -0
  144. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/__init__.py +0 -0
  145. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/flatten.py +0 -0
  146. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/python_to_sql.py +0 -0
  147. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/sql_to_python.py +0 -0
  148. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/unflatten.py +0 -0
  149. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  150. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/data_model.py +0 -0
  151. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/__init__.py +0 -0
  152. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/csv.py +0 -0
  153. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/datachain.py +0 -0
  154. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/hf.py +0 -0
  155. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/json.py +0 -0
  156. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/listings.py +0 -0
  157. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/pandas.py +0 -0
  158. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/parquet.py +0 -0
  159. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/records.py +0 -0
  160. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/storage.py +0 -0
  161. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/utils.py +0 -0
  162. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/dc/values.py +0 -0
  163. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/file.py +0 -0
  164. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/hf.py +0 -0
  165. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/image.py +0 -0
  166. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/listing.py +0 -0
  167. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/listing_info.py +0 -0
  168. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/meta_formats.py +0 -0
  169. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/model_store.py +0 -0
  170. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/pytorch.py +0 -0
  171. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/settings.py +0 -0
  172. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/signal_schema.py +0 -0
  173. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/tar.py +0 -0
  174. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/text.py +0 -0
  175. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/udf_signature.py +0 -0
  176. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/utils.py +0 -0
  177. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/video.py +0 -0
  178. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/webdataset.py +0 -0
  179. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/lib/webdataset_laion.py +0 -0
  180. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/listing.py +0 -0
  181. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/__init__.py +0 -0
  182. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/bbox.py +0 -0
  183. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/pose.py +0 -0
  184. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/segment.py +0 -0
  185. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/ultralytics/__init__.py +0 -0
  186. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/ultralytics/bbox.py +0 -0
  187. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/ultralytics/pose.py +0 -0
  188. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/ultralytics/segment.py +0 -0
  189. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/model/utils.py +0 -0
  190. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/node.py +0 -0
  191. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/nodes_fetcher.py +0 -0
  192. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/nodes_thread_pool.py +0 -0
  193. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/progress.py +0 -0
  194. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/py.typed +0 -0
  195. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/__init__.py +0 -0
  196. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/batch.py +0 -0
  197. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/metrics.py +0 -0
  198. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/params.py +0 -0
  199. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/queue.py +0 -0
  200. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/schema.py +0 -0
  201. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/query/utils.py +0 -0
  202. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/remote/__init__.py +0 -0
  203. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/remote/studio.py +0 -0
  204. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/script_meta.py +0 -0
  205. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/__init__.py +0 -0
  206. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/default/__init__.py +0 -0
  207. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/default/base.py +0 -0
  208. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/__init__.py +0 -0
  209. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/aggregate.py +0 -0
  210. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/array.py +0 -0
  211. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/conditional.py +0 -0
  212. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/numeric.py +0 -0
  213. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/path.py +0 -0
  214. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/random.py +0 -0
  215. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/functions/string.py +0 -0
  216. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/selectable.py +0 -0
  217. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/sqlite/__init__.py +0 -0
  218. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/sqlite/base.py +0 -0
  219. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/sqlite/types.py +0 -0
  220. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/sqlite/vector.py +0 -0
  221. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/types.py +0 -0
  222. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/sql/utils.py +0 -0
  223. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/studio.py +0 -0
  224. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/telemetry.py +0 -0
  225. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/toolkit/__init__.py +0 -0
  226. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/toolkit/split.py +0 -0
  227. {datachain-0.14.2 → datachain-0.14.4}/src/datachain/torch/__init__.py +0 -0
  228. {datachain-0.14.2 → datachain-0.14.4}/src/datachain.egg-info/SOURCES.txt +0 -0
  229. {datachain-0.14.2 → datachain-0.14.4}/src/datachain.egg-info/dependency_links.txt +0 -0
  230. {datachain-0.14.2 → datachain-0.14.4}/src/datachain.egg-info/entry_points.txt +0 -0
  231. {datachain-0.14.2 → datachain-0.14.4}/src/datachain.egg-info/requires.txt +0 -0
  232. {datachain-0.14.2 → datachain-0.14.4}/src/datachain.egg-info/top_level.txt +0 -0
  233. {datachain-0.14.2 → datachain-0.14.4}/tests/__init__.py +0 -0
  234. {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/__init__.py +0 -0
  235. {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/conftest.py +0 -0
  236. {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  237. {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/datasets/.dvc/config +0 -0
  238. {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/datasets/.gitignore +0 -0
  239. {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  240. {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/test_datachain.py +0 -0
  241. {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/test_ls.py +0 -0
  242. {datachain-0.14.2 → datachain-0.14.4}/tests/benchmarks/test_version.py +0 -0
  243. {datachain-0.14.2 → datachain-0.14.4}/tests/conftest.py +0 -0
  244. {datachain-0.14.2 → datachain-0.14.4}/tests/data.py +0 -0
  245. {datachain-0.14.2 → datachain-0.14.4}/tests/examples/__init__.py +0 -0
  246. {datachain-0.14.2 → datachain-0.14.4}/tests/examples/test_examples.py +0 -0
  247. {datachain-0.14.2 → datachain-0.14.4}/tests/examples/test_wds_e2e.py +0 -0
  248. {datachain-0.14.2 → datachain-0.14.4}/tests/examples/wds_data.py +0 -0
  249. {datachain-0.14.2 → datachain-0.14.4}/tests/func/__init__.py +0 -0
  250. {datachain-0.14.2 → datachain-0.14.4}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  251. {datachain-0.14.2 → datachain-0.14.4}/tests/func/data/lena.jpg +0 -0
  252. {datachain-0.14.2 → datachain-0.14.4}/tests/func/fake-service-account-credentials.json +0 -0
  253. {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/__init__.py +0 -0
  254. {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/data/running-mask0.png +0 -0
  255. {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/data/running-mask1.png +0 -0
  256. {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/data/running.jpg +0 -0
  257. {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/data/ships.jpg +0 -0
  258. {datachain-0.14.2 → datachain-0.14.4}/tests/func/model/test_yolo.py +0 -0
  259. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_catalog.py +0 -0
  260. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_client.py +0 -0
  261. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_cloud_transfer.py +0 -0
  262. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_data_storage.py +0 -0
  263. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_datachain_merge.py +0 -0
  264. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_dataset_query.py +0 -0
  265. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_datasets.py +0 -0
  266. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_feature_pickling.py +0 -0
  267. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_file.py +0 -0
  268. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_hf.py +0 -0
  269. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_hidden_field.py +0 -0
  270. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_image.py +0 -0
  271. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_listing.py +0 -0
  272. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_ls.py +0 -0
  273. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_meta_formats.py +0 -0
  274. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_metrics.py +0 -0
  275. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_pull.py +0 -0
  276. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_pytorch.py +0 -0
  277. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_query.py +0 -0
  278. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_session.py +0 -0
  279. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_toolkit.py +0 -0
  280. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_video.py +0 -0
  281. {datachain-0.14.2 → datachain-0.14.4}/tests/func/test_warehouse.py +0 -0
  282. {datachain-0.14.2 → datachain-0.14.4}/tests/scripts/feature_class.py +0 -0
  283. {datachain-0.14.2 → datachain-0.14.4}/tests/scripts/feature_class_exception.py +0 -0
  284. {datachain-0.14.2 → datachain-0.14.4}/tests/scripts/feature_class_parallel.py +0 -0
  285. {datachain-0.14.2 → datachain-0.14.4}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  286. {datachain-0.14.2 → datachain-0.14.4}/tests/scripts/name_len_slow.py +0 -0
  287. {datachain-0.14.2 → datachain-0.14.4}/tests/test_atomicity.py +0 -0
  288. {datachain-0.14.2 → datachain-0.14.4}/tests/test_cli_e2e.py +0 -0
  289. {datachain-0.14.2 → datachain-0.14.4}/tests/test_cli_studio.py +0 -0
  290. {datachain-0.14.2 → datachain-0.14.4}/tests/test_import_time.py +0 -0
  291. {datachain-0.14.2 → datachain-0.14.4}/tests/test_query_e2e.py +0 -0
  292. {datachain-0.14.2 → datachain-0.14.4}/tests/test_telemetry.py +0 -0
  293. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/__init__.py +0 -0
  294. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/__init__.py +0 -0
  295. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/conftest.py +0 -0
  296. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_arrow.py +0 -0
  297. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_clip.py +0 -0
  298. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  299. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_datachain_merge.py +0 -0
  300. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_diff.py +0 -0
  301. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_feature.py +0 -0
  302. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_feature_utils.py +0 -0
  303. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_file.py +0 -0
  304. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_hf.py +0 -0
  305. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_image.py +0 -0
  306. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_listing_info.py +0 -0
  307. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_python_to_sql.py +0 -0
  308. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_schema.py +0 -0
  309. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_signal_schema.py +0 -0
  310. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_sql_to_python.py +0 -0
  311. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_text.py +0 -0
  312. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_udf_signature.py +0 -0
  313. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_utils.py +0 -0
  314. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/lib/test_webdataset.py +0 -0
  315. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/model/__init__.py +0 -0
  316. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/model/test_bbox.py +0 -0
  317. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/model/test_pose.py +0 -0
  318. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/model/test_segment.py +0 -0
  319. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/model/test_utils.py +0 -0
  320. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/__init__.py +0 -0
  321. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/sqlite/__init__.py +0 -0
  322. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/sqlite/test_types.py +0 -0
  323. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/sqlite/test_utils.py +0 -0
  324. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_array.py +0 -0
  325. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_conditional.py +0 -0
  326. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_path.py +0 -0
  327. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_random.py +0 -0
  328. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_selectable.py +0 -0
  329. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/sql/test_string.py +0 -0
  330. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_asyn.py +0 -0
  331. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_cache.py +0 -0
  332. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_catalog.py +0 -0
  333. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_cli_parsing.py +0 -0
  334. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_client.py +0 -0
  335. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_client_gcs.py +0 -0
  336. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_client_s3.py +0 -0
  337. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_config.py +0 -0
  338. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_data_storage.py +0 -0
  339. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_database_engine.py +0 -0
  340. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_dataset.py +0 -0
  341. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_dispatch.py +0 -0
  342. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_fileslice.py +0 -0
  343. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_func.py +0 -0
  344. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_listing.py +0 -0
  345. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_metastore.py +0 -0
  346. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_module_exports.py +0 -0
  347. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_pytorch.py +0 -0
  348. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_query.py +0 -0
  349. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_query_metrics.py +0 -0
  350. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_query_params.py +0 -0
  351. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_script_meta.py +0 -0
  352. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_serializer.py +0 -0
  353. {datachain-0.14.2 → datachain-0.14.4}/tests/unit/test_warehouse.py +0 -0
  354. {datachain-0.14.2 → datachain-0.14.4}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.2
3
+ Version: 0.14.4
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -7,6 +7,7 @@ from datachain.utils import get_envs_by_prefix
7
7
  if TYPE_CHECKING:
8
8
  from datachain.catalog import Catalog
9
9
  from datachain.data_storage import AbstractMetastore, AbstractWarehouse
10
+ from datachain.query.udf import AbstractUDFDistributor
10
11
 
11
12
  METASTORE_SERIALIZED = "DATACHAIN__METASTORE"
12
13
  METASTORE_IMPORT_PATH = "DATACHAIN_METASTORE"
@@ -15,7 +16,6 @@ WAREHOUSE_SERIALIZED = "DATACHAIN__WAREHOUSE"
15
16
  WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
16
17
  WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
17
18
  DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
18
- DISTRIBUTED_ARG_PREFIX = "DATACHAIN_DISTRIBUTED_ARG_"
19
19
 
20
20
  IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
21
21
 
@@ -100,27 +100,22 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
100
100
  return warehouse_class(**warehouse_args)
101
101
 
102
102
 
103
- def get_distributed_class(**kwargs):
103
+ def get_udf_distributor_class() -> type["AbstractUDFDistributor"]:
104
104
  distributed_import_path = os.environ.get(DISTRIBUTED_IMPORT_PATH)
105
- distributed_arg_envs = get_envs_by_prefix(DISTRIBUTED_ARG_PREFIX)
106
- # Convert env variable names to keyword argument names by lowercasing them
107
- distributed_args = {k.lower(): v for k, v in distributed_arg_envs.items()}
108
105
 
109
106
  if not distributed_import_path:
110
107
  raise RuntimeError(
111
108
  f"{DISTRIBUTED_IMPORT_PATH} import path is required "
112
109
  "for distributed UDF processing."
113
110
  )
114
- # Distributed class paths are specified as (for example):
115
- # module.classname
111
+ # Distributed class paths are specified as (for example): module.classname
116
112
  if "." not in distributed_import_path:
117
113
  raise RuntimeError(
118
114
  f"Invalid {DISTRIBUTED_IMPORT_PATH} import path: {distributed_import_path}"
119
115
  )
120
116
  module_name, _, class_name = distributed_import_path.rpartition(".")
121
117
  distributed = import_module(module_name)
122
- distributed_class = getattr(distributed, class_name)
123
- return distributed_class(**distributed_args | kwargs)
118
+ return getattr(distributed, class_name)
124
119
 
125
120
 
126
121
  def get_catalog(
@@ -199,6 +199,15 @@ class AbstractWarehouse(ABC, Serializable):
199
199
  # Query Execution
200
200
  #
201
201
 
202
+ def query_count(self, query: sa.sql.selectable.Select) -> int:
203
+ """Count the number of rows in a query."""
204
+ count_query = sa.select(func.count(1)).select_from(query.subquery())
205
+ return next(self.db.execute(count_query))[0]
206
+
207
+ def table_rows_count(self, table) -> int:
208
+ count_query = sa.select(func.count(1)).select_from(table)
209
+ return next(self.db.execute(count_query))[0]
210
+
202
211
  def dataset_select_paginated(
203
212
  self,
204
213
  query,
@@ -12,6 +12,7 @@ from datachain.dataset import (
12
12
  )
13
13
  from datachain.job import Job
14
14
  from datachain.lib.data_model import DataModel
15
+ from datachain.query.session import Session
15
16
  from datachain.utils import TIME_ZERO
16
17
 
17
18
  if TYPE_CHECKING:
@@ -32,6 +33,10 @@ class DatasetInfo(DataModel):
32
33
  error_message: str = Field(default="")
33
34
  error_stack: str = Field(default="")
34
35
 
36
+ @property
37
+ def is_temp(self) -> bool:
38
+ return Session.is_temp_dataset(self.name)
39
+
35
40
  @staticmethod
36
41
  def _validate_dict(
37
42
  v: Optional[Union[str, dict]],
@@ -140,6 +140,8 @@ def datasets(
140
140
  )
141
141
  ]
142
142
 
143
+ datasets_values = [d for d in datasets_values if not d.is_temp]
144
+
143
145
  return read_values(
144
146
  session=session,
145
147
  settings=settings,
@@ -16,7 +16,6 @@ from datachain.lib.convert.flatten import flatten
16
16
  from datachain.lib.data_model import DataValue
17
17
  from datachain.lib.file import File
18
18
  from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
19
- from datachain.progress import CombinedDownloadCallback
20
19
  from datachain.query.batch import (
21
20
  Batch,
22
21
  BatchingStrategy,
@@ -327,8 +326,9 @@ def _prefetch_inputs(
327
326
 
328
327
  if after_prefetch is None:
329
328
  after_prefetch = noop
330
- if isinstance(download_cb, CombinedDownloadCallback):
331
- after_prefetch = download_cb.increment_file_count
329
+ if download_cb and hasattr(download_cb, "increment_file_count"):
330
+ increment_file_count: Callable[[], None] = download_cb.increment_file_count
331
+ after_prefetch = increment_file_count
332
332
 
333
333
  f = partial(_prefetch_input, download_cb=download_cb, after_prefetch=after_prefetch)
334
334
  mapper = AsyncMapper(f, prepared_inputs, workers=prefetch)
@@ -55,10 +55,12 @@ from datachain.lib.udf import UDFAdapter, _get_cache
55
55
  from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
56
56
  from datachain.query.schema import C, UDFParamSpec, normalize_param
57
57
  from datachain.query.session import Session
58
+ from datachain.query.udf import UdfInfo
58
59
  from datachain.sql.functions.random import rand
59
60
  from datachain.utils import (
60
61
  batched,
61
62
  determine_processes,
63
+ determine_workers,
62
64
  filtered_cloudpickle_dumps,
63
65
  get_datachain_executable,
64
66
  safe_closing,
@@ -74,7 +76,6 @@ if TYPE_CHECKING:
74
76
  from datachain.data_storage import AbstractWarehouse
75
77
  from datachain.dataset import DatasetRecord
76
78
  from datachain.lib.udf import UDFAdapter, UDFResult
77
- from datachain.query.udf import UdfInfo
78
79
 
79
80
  P = ParamSpec("P")
80
81
 
@@ -414,20 +415,15 @@ class UDFStep(Step, ABC):
414
415
  def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
415
416
  from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
416
417
 
417
- use_partitioning = self.partition_by is not None
418
- batching = self.udf.get_batching(use_partitioning)
419
- workers = self.workers
420
- if (
421
- not workers
422
- and os.environ.get("DATACHAIN_DISTRIBUTED")
423
- and os.environ.get("DATACHAIN_SETTINGS_WORKERS")
424
- ):
425
- # Enable distributed processing by default if the module is available,
426
- # and a default number of workers is provided.
427
- workers = True
418
+ rows_total = self.catalog.warehouse.query_count(query)
419
+ if rows_total == 0:
420
+ return
428
421
 
429
- processes = determine_processes(self.parallel)
422
+ workers = determine_workers(self.workers, rows_total=rows_total)
423
+ processes = determine_processes(self.parallel, rows_total=rows_total)
430
424
 
425
+ use_partitioning = self.partition_by is not None
426
+ batching = self.udf.get_batching(use_partitioning)
431
427
  udf_fields = [str(c.name) for c in query.selected_columns]
432
428
 
433
429
  prefetch = self.udf.prefetch
@@ -441,23 +437,24 @@ class UDFStep(Step, ABC):
441
437
  "distributed processing."
442
438
  )
443
439
 
444
- from datachain.catalog.loader import get_distributed_class
445
-
446
- distributor = get_distributed_class(
447
- min_task_size=self.min_task_size
448
- )
449
- distributor(
450
- self.udf,
451
- catalog,
452
- udf_table,
453
- query,
454
- workers,
455
- processes,
440
+ from datachain.catalog.loader import get_udf_distributor_class
441
+
442
+ udf_distributor_class = get_udf_distributor_class()
443
+ udf_distributor = udf_distributor_class(
444
+ catalog=catalog,
445
+ table=udf_table,
446
+ query=query,
447
+ udf_data=filtered_cloudpickle_dumps(self.udf),
448
+ batching=batching,
449
+ workers=workers,
450
+ processes=processes,
456
451
  udf_fields=udf_fields,
452
+ rows_total=rows_total,
453
+ use_cache=self.cache,
457
454
  is_generator=self.is_generator,
458
- use_partitioning=use_partitioning,
459
- cache=self.cache,
455
+ min_task_size=self.min_task_size,
460
456
  )
457
+ udf_distributor()
461
458
  elif processes:
462
459
  # Parallel processing (faster for more CPU-heavy UDFs)
463
460
  if catalog.in_memory:
@@ -465,19 +462,21 @@ class UDFStep(Step, ABC):
465
462
  "In-memory databases cannot be used "
466
463
  "with parallel processing."
467
464
  )
468
- udf_info: UdfInfo = {
469
- "udf_data": filtered_cloudpickle_dumps(self.udf),
470
- "catalog_init": catalog.get_init_params(),
471
- "metastore_clone_params": catalog.metastore.clone_params(),
472
- "warehouse_clone_params": catalog.warehouse.clone_params(),
473
- "table": udf_table,
474
- "query": query,
475
- "udf_fields": udf_fields,
476
- "batching": batching,
477
- "processes": processes,
478
- "is_generator": self.is_generator,
479
- "cache": self.cache,
480
- }
465
+
466
+ udf_info = UdfInfo(
467
+ udf_data=filtered_cloudpickle_dumps(self.udf),
468
+ catalog_init=catalog.get_init_params(),
469
+ metastore_clone_params=catalog.metastore.clone_params(),
470
+ warehouse_clone_params=catalog.warehouse.clone_params(),
471
+ table=udf_table,
472
+ query=query,
473
+ udf_fields=udf_fields,
474
+ batching=batching,
475
+ processes=processes,
476
+ is_generator=self.is_generator,
477
+ cache=self.cache,
478
+ rows_total=rows_total,
479
+ )
481
480
 
482
481
  # Run the UDFDispatcher in another process to avoid needing
483
482
  # if __name__ == '__main__': in user scripts
@@ -4,18 +4,16 @@ from itertools import chain
4
4
  from multiprocessing import cpu_count
5
5
  from sys import stdin
6
6
  from threading import Timer
7
- from typing import TYPE_CHECKING, Optional
7
+ from typing import TYPE_CHECKING, Literal, Optional
8
8
 
9
- import attrs
10
9
  import multiprocess
11
10
  from cloudpickle import load, loads
12
11
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
13
12
  from multiprocess import get_context
14
- from sqlalchemy.sql import func
15
13
 
16
14
  from datachain.catalog import Catalog
17
15
  from datachain.catalog.catalog import clone_catalog_with_cache
18
- from datachain.catalog.loader import get_distributed_class
16
+ from datachain.catalog.loader import get_udf_distributor_class
19
17
  from datachain.lib.udf import _get_cache
20
18
  from datachain.query.batch import RowsOutput, RowsOutputBatch
21
19
  from datachain.query.dataset import (
@@ -59,7 +57,9 @@ def udf_entrypoint() -> int:
59
57
  dispatch = UDFDispatcher(udf_info)
60
58
 
61
59
  query = udf_info["query"]
60
+ rows_total = udf_info["rows_total"]
62
61
  batching = udf_info["batching"]
62
+ is_generator = udf_info["is_generator"]
63
63
  n_workers = udf_info["processes"]
64
64
  if n_workers is True:
65
65
  n_workers = None # Use default number of CPUs (cores)
@@ -67,34 +67,31 @@ def udf_entrypoint() -> int:
67
67
  wh_cls, wh_args, wh_kwargs = udf_info["warehouse_clone_params"]
68
68
  warehouse: AbstractWarehouse = wh_cls(*wh_args, **wh_kwargs)
69
69
 
70
- total_rows = next(
71
- warehouse.db.execute(
72
- query.with_only_columns(func.count(query.c.sys__id)).order_by(None)
73
- )
74
- )[0]
75
-
76
70
  with contextlib.closing(
77
71
  batching(warehouse.dataset_select_paginated, query, ids_only=True)
78
72
  ) as udf_inputs:
79
73
  download_cb = get_download_callback()
80
74
  processed_cb = get_processed_callback()
75
+ generated_cb = get_generated_callback(is_generator)
81
76
  try:
82
77
  dispatch.run_udf_parallel(
83
78
  udf_inputs,
84
- total_rows=total_rows,
79
+ rows_total=rows_total,
85
80
  n_workers=n_workers,
86
- processed_cb=processed_cb,
87
81
  download_cb=download_cb,
82
+ processed_cb=processed_cb,
83
+ generated_cb=generated_cb,
88
84
  )
89
85
  finally:
90
86
  download_cb.close()
91
87
  processed_cb.close()
88
+ generated_cb.close()
92
89
 
93
90
  return 0
94
91
 
95
92
 
96
93
  def udf_worker_entrypoint() -> int:
97
- return get_distributed_class().run_worker()
94
+ return get_udf_distributor_class().run_worker()
98
95
 
99
96
 
100
97
  class UDFDispatcher:
@@ -134,7 +131,6 @@ class UDFDispatcher:
134
131
  self.done_queue,
135
132
  self.query,
136
133
  self.table,
137
- self.is_generator,
138
134
  self.is_batching,
139
135
  self.cache,
140
136
  self.udf_fields,
@@ -158,20 +154,18 @@ class UDFDispatcher:
158
154
  for _ in range(n_workers):
159
155
  put_into_queue(task_queue, STOP_SIGNAL)
160
156
 
161
- def create_input_queue(self):
162
- return self.ctx.Queue()
163
-
164
157
  def run_udf_parallel( # noqa: C901, PLR0912
165
158
  self,
166
159
  input_rows: Iterable[RowsOutput],
167
- total_rows: int,
160
+ rows_total: int,
168
161
  n_workers: Optional[int] = None,
169
- processed_cb: Callback = DEFAULT_CALLBACK,
170
162
  download_cb: Callback = DEFAULT_CALLBACK,
163
+ processed_cb: Callback = DEFAULT_CALLBACK,
164
+ generated_cb: Callback = DEFAULT_CALLBACK,
171
165
  ) -> None:
172
166
  n_workers = get_n_workers_from_arg(n_workers)
173
167
 
174
- input_batch_size = total_rows // n_workers
168
+ input_batch_size = rows_total // n_workers
175
169
  if input_batch_size == 0:
176
170
  input_batch_size = 1
177
171
  elif input_batch_size > DEFAULT_BATCH_SIZE:
@@ -220,6 +214,8 @@ class UDFDispatcher:
220
214
  download_cb.relative_update(downloaded)
221
215
  if processed := result.get("processed"):
222
216
  processed_cb.relative_update(processed)
217
+ if generated := result.get("generated"):
218
+ generated_cb.relative_update(generated)
223
219
 
224
220
  status = result["status"]
225
221
  if status in (OK_STATUS, NOTIFY_STATUS):
@@ -266,46 +262,61 @@ class UDFDispatcher:
266
262
  p.join()
267
263
 
268
264
 
269
- class WorkerCallback(Callback):
270
- def __init__(self, queue: "multiprocess.Queue"):
265
+ class DownloadCallback(Callback):
266
+ def __init__(self, queue: "multiprocess.Queue") -> None:
271
267
  self.queue = queue
272
268
  super().__init__()
273
269
 
274
270
  def relative_update(self, inc: int = 1) -> None:
271
+ # This callback is used to notify the size of the downloaded files
272
+ pass
273
+
274
+ def increment_file_count(self, inc: int = 1) -> None:
275
275
  put_into_queue(self.queue, {"status": NOTIFY_STATUS, "downloaded": inc})
276
276
 
277
277
 
278
278
  class ProcessedCallback(Callback):
279
- def __init__(self):
280
- self.processed_rows: Optional[int] = None
279
+ def __init__(
280
+ self,
281
+ name: Literal["processed", "generated"],
282
+ queue: "multiprocess.Queue",
283
+ ) -> None:
284
+ self.name = name
285
+ self.queue = queue
281
286
  super().__init__()
282
287
 
283
288
  def relative_update(self, inc: int = 1) -> None:
284
- self.processed_rows = inc
289
+ put_into_queue(self.queue, {"status": NOTIFY_STATUS, self.name: inc})
285
290
 
286
291
 
287
- @attrs.define
288
292
  class UDFWorker:
289
- catalog: "Catalog"
290
- udf: "UDFAdapter"
291
- task_queue: "multiprocess.Queue"
292
- done_queue: "multiprocess.Queue"
293
- query: "Select"
294
- table: "Table"
295
- is_generator: bool
296
- is_batching: bool
297
- cache: bool
298
- udf_fields: Sequence[str]
299
- cb: Callback = attrs.field()
300
-
301
- @cb.default
302
- def _default_callback(self) -> WorkerCallback:
303
- return WorkerCallback(self.done_queue)
293
+ def __init__(
294
+ self,
295
+ catalog: "Catalog",
296
+ udf: "UDFAdapter",
297
+ task_queue: "multiprocess.Queue",
298
+ done_queue: "multiprocess.Queue",
299
+ query: "Select",
300
+ table: "Table",
301
+ is_batching: bool,
302
+ cache: bool,
303
+ udf_fields: Sequence[str],
304
+ ) -> None:
305
+ self.catalog = catalog
306
+ self.udf = udf
307
+ self.task_queue = task_queue
308
+ self.done_queue = done_queue
309
+ self.query = query
310
+ self.table = table
311
+ self.is_batching = is_batching
312
+ self.cache = cache
313
+ self.udf_fields = udf_fields
314
+
315
+ self.download_cb = DownloadCallback(self.done_queue)
316
+ self.processed_cb = ProcessedCallback("processed", self.done_queue)
317
+ self.generated_cb = ProcessedCallback("generated", self.done_queue)
304
318
 
305
319
  def run(self) -> None:
306
- processed_cb = ProcessedCallback()
307
- generated_cb = get_generated_callback(self.is_generator)
308
-
309
320
  prefetch = self.udf.prefetch
310
321
  with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
311
322
  catalog = clone_catalog_with_cache(self.catalog, _cache)
@@ -314,29 +325,22 @@ class UDFWorker:
314
325
  self.get_inputs(),
315
326
  catalog,
316
327
  self.cache,
317
- download_cb=self.cb,
318
- processed_cb=processed_cb,
328
+ download_cb=self.download_cb,
329
+ processed_cb=self.processed_cb,
319
330
  )
320
331
  with safe_closing(udf_results):
321
332
  process_udf_outputs(
322
333
  catalog.warehouse,
323
334
  self.table,
324
- self.notify_and_process(udf_results, processed_cb),
335
+ self.notify_and_process(udf_results),
325
336
  self.udf,
326
- cb=generated_cb,
337
+ cb=self.generated_cb,
327
338
  )
339
+ put_into_queue(self.done_queue, {"status": FINISHED_STATUS})
328
340
 
329
- put_into_queue(
330
- self.done_queue,
331
- {"status": FINISHED_STATUS, "processed": processed_cb.processed_rows},
332
- )
333
-
334
- def notify_and_process(self, udf_results, processed_cb):
341
+ def notify_and_process(self, udf_results):
335
342
  for row in udf_results:
336
- put_into_queue(
337
- self.done_queue,
338
- {"status": OK_STATUS, "processed": processed_cb.processed_rows},
339
- )
343
+ put_into_queue(self.done_queue, {"status": OK_STATUS})
340
344
  yield row
341
345
 
342
346
  def get_inputs(self):
@@ -100,6 +100,10 @@ class Session:
100
100
  def get_temp_prefix(self) -> str:
101
101
  return f"{self.DATASET_PREFIX}{self.name}_"
102
102
 
103
+ @classmethod
104
+ def is_temp_dataset(cls, name) -> bool:
105
+ return name.startswith(cls.DATASET_PREFIX)
106
+
103
107
  def _cleanup_temp_datasets(self) -> None:
104
108
  prefix = self.get_temp_prefix()
105
109
  try:
@@ -0,0 +1,49 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import TYPE_CHECKING, Any, Callable, Optional, TypedDict, Union
3
+
4
+ if TYPE_CHECKING:
5
+ from sqlalchemy import Select, Table
6
+
7
+ from datachain.catalog import Catalog
8
+ from datachain.query.batch import BatchingStrategy
9
+
10
+
11
+ class UdfInfo(TypedDict):
12
+ udf_data: bytes
13
+ catalog_init: dict[str, Any]
14
+ metastore_clone_params: tuple[Callable[..., Any], list[Any], dict[str, Any]]
15
+ warehouse_clone_params: tuple[Callable[..., Any], list[Any], dict[str, Any]]
16
+ table: "Table"
17
+ query: "Select"
18
+ udf_fields: list[str]
19
+ batching: "BatchingStrategy"
20
+ processes: Optional[int]
21
+ is_generator: bool
22
+ cache: bool
23
+ rows_total: int
24
+
25
+
26
+ class AbstractUDFDistributor(ABC):
27
+ @abstractmethod
28
+ def __init__(
29
+ self,
30
+ catalog: "Catalog",
31
+ table: "Table",
32
+ query: "Select",
33
+ udf_data: bytes,
34
+ batching: "BatchingStrategy",
35
+ workers: Union[bool, int],
36
+ processes: Union[bool, int],
37
+ udf_fields: list[str],
38
+ rows_total: int,
39
+ use_cache: bool,
40
+ is_generator: bool = False,
41
+ min_task_size: Optional[Union[str, int]] = None,
42
+ ) -> None: ...
43
+
44
+ @abstractmethod
45
+ def __call__(self) -> None: ...
46
+
47
+ @staticmethod
48
+ @abstractmethod
49
+ def run_worker() -> int: ...
@@ -286,15 +286,41 @@ def retry_with_backoff(retries=5, backoff_sec=1, errors=(Exception,)):
286
286
  return retry
287
287
 
288
288
 
289
- def determine_processes(parallel: Optional[Union[bool, int]]) -> Union[bool, int]:
289
+ def determine_workers(
290
+ workers: Union[bool, int],
291
+ rows_total: Optional[int] = None,
292
+ ) -> Union[bool, int]:
293
+ """Determine the number of workers to use for distributed processing."""
294
+ if rows_total is not None and rows_total <= 1:
295
+ # Disable distributed processing if there is no rows or only one row.
296
+ return False
297
+ if (
298
+ workers is False
299
+ and os.environ.get("DATACHAIN_DISTRIBUTED")
300
+ and os.environ.get("DATACHAIN_SETTINGS_WORKERS")
301
+ ):
302
+ # Enable distributed processing by default if the module is available,
303
+ # and a default number of workers is provided.
304
+ workers = int(os.environ["DATACHAIN_SETTINGS_WORKERS"])
305
+ if not workers or workers <= 0:
306
+ return False
307
+ return workers
308
+
309
+
310
+ def determine_processes(
311
+ parallel: Optional[Union[bool, int]] = None,
312
+ rows_total: Optional[int] = None,
313
+ ) -> Union[bool, int]:
314
+ """Determine the number of processes to use for parallel processing."""
315
+ if rows_total is not None and rows_total <= 1:
316
+ # Disable parallel processing if there is no rows or only one row.
317
+ return False
290
318
  if parallel is None and os.environ.get("DATACHAIN_SETTINGS_PARALLEL") is not None:
291
319
  parallel = int(os.environ["DATACHAIN_SETTINGS_PARALLEL"])
292
- if parallel is None or parallel is False:
320
+ if parallel is None or parallel is False or parallel == 0:
293
321
  return False
294
322
  if parallel is True:
295
323
  return True
296
- if parallel == 0:
297
- return False
298
324
  if parallel < 0:
299
325
  return True
300
326
  return parallel
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.2
3
+ Version: 0.14.4
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -538,6 +538,17 @@ def test_show(capsys, test_session):
538
538
  assert f"{i} {first_name[i]}" in normalized_output
539
539
 
540
540
 
541
+ def test_show_without_temp_datasets(capsys, test_session):
542
+ dc.read_values(
543
+ key=[1, 2, 3, 4], session=test_session
544
+ ).save() # creates temp dataset
545
+ dc.datasets().show()
546
+ captured = capsys.readouterr()
547
+ normalized_output = re.sub(r"\s+", " ", captured.out)
548
+ print(normalized_output)
549
+ assert "Empty result" in normalized_output
550
+
551
+
541
552
  def test_class_method_deprecated(capsys, test_session):
542
553
  with pytest.warns(DeprecationWarning):
543
554
  dc.DataChain.from_values(key=["a", "b", "c"], session=test_session)
@@ -4,7 +4,7 @@ import math
4
4
  import os
5
5
  import re
6
6
  from collections.abc import Generator, Iterator
7
- from unittest.mock import ANY
7
+ from unittest.mock import ANY, patch
8
8
 
9
9
  import numpy as np
10
10
  import pandas as pd
@@ -26,6 +26,7 @@ from datachain.lib.signal_schema import (
26
26
  SignalResolvingTypeError,
27
27
  SignalSchema,
28
28
  )
29
+ from datachain.lib.udf import UDFAdapter
29
30
  from datachain.lib.udf_signature import UdfSignatureError
30
31
  from datachain.lib.utils import DataChainColumnError, DataChainParamsError
31
32
  from datachain.sql.types import Float, Int64, String
@@ -270,6 +271,17 @@ def test_read_record_empty_chain_without_schema(test_session):
270
271
  )
271
272
 
272
273
 
274
+ def test_empty_chain_skip_udf_run(test_session):
275
+ # Test that UDF is not called for empty chain
276
+ with patch.object(UDFAdapter, "run") as mock_udf_run:
277
+ (
278
+ dc.read_records([], schema={"val": int}, session=test_session)
279
+ .map(lambda val: val * 2, params="val", output={"val2": int})
280
+ .exec()
281
+ )
282
+ mock_udf_run.assert_not_called()
283
+
284
+
273
285
  def test_datasets(test_session):
274
286
  ds = dc.datasets(session=test_session)
275
287
  datasets = [d for d in ds.collect("dataset") if d.name == "fibonacci"]