datachain-0.28.0.tar.gz → datachain-0.28.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. See the package's registry page for more details.

Files changed (407)
  1. {datachain-0.28.0 → datachain-0.28.2}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.28.0 → datachain-0.28.2}/PKG-INFO +1 -1
  3. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/datachain.py +45 -17
  4. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/records.py +4 -2
  5. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/file.py +53 -1
  6. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/settings.py +23 -0
  7. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/udf.py +27 -4
  8. datachain-0.28.2/src/datachain/lib/utils.py +155 -0
  9. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/query/dataset.py +18 -20
  10. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/utils.py +37 -22
  11. {datachain-0.28.0 → datachain-0.28.2}/src/datachain.egg-info/PKG-INFO +1 -1
  12. {datachain-0.28.0 → datachain-0.28.2}/src/datachain.egg-info/SOURCES.txt +1 -0
  13. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_datachain.py +36 -6
  14. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_hf.py +1 -0
  15. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_file.py +47 -1
  16. datachain-0.28.2/tests/unit/lib/test_settings.py +61 -0
  17. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_utils.py +70 -1
  18. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_utils.py +19 -0
  19. datachain-0.28.0/src/datachain/lib/utils.py +0 -59
  20. {datachain-0.28.0 → datachain-0.28.2}/.cruft.json +0 -0
  21. {datachain-0.28.0 → datachain-0.28.2}/.gitattributes +0 -0
  22. {datachain-0.28.0 → datachain-0.28.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  23. {datachain-0.28.0 → datachain-0.28.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  24. {datachain-0.28.0 → datachain-0.28.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  25. {datachain-0.28.0 → datachain-0.28.2}/.github/codecov.yaml +0 -0
  26. {datachain-0.28.0 → datachain-0.28.2}/.github/dependabot.yml +0 -0
  27. {datachain-0.28.0 → datachain-0.28.2}/.github/workflows/benchmarks.yml +0 -0
  28. {datachain-0.28.0 → datachain-0.28.2}/.github/workflows/release.yml +0 -0
  29. {datachain-0.28.0 → datachain-0.28.2}/.github/workflows/tests-studio.yml +0 -0
  30. {datachain-0.28.0 → datachain-0.28.2}/.github/workflows/tests.yml +0 -0
  31. {datachain-0.28.0 → datachain-0.28.2}/.github/workflows/update-template.yaml +0 -0
  32. {datachain-0.28.0 → datachain-0.28.2}/.gitignore +0 -0
  33. {datachain-0.28.0 → datachain-0.28.2}/CODE_OF_CONDUCT.rst +0 -0
  34. {datachain-0.28.0 → datachain-0.28.2}/LICENSE +0 -0
  35. {datachain-0.28.0 → datachain-0.28.2}/README.rst +0 -0
  36. {datachain-0.28.0 → datachain-0.28.2}/docs/assets/captioned_cartoons.png +0 -0
  37. {datachain-0.28.0 → datachain-0.28.2}/docs/assets/datachain-white.svg +0 -0
  38. {datachain-0.28.0 → datachain-0.28.2}/docs/assets/datachain.svg +0 -0
  39. {datachain-0.28.0 → datachain-0.28.2}/docs/commands/auth/login.md +0 -0
  40. {datachain-0.28.0 → datachain-0.28.2}/docs/commands/auth/logout.md +0 -0
  41. {datachain-0.28.0 → datachain-0.28.2}/docs/commands/auth/team.md +0 -0
  42. {datachain-0.28.0 → datachain-0.28.2}/docs/commands/auth/token.md +0 -0
  43. {datachain-0.28.0 → datachain-0.28.2}/docs/commands/index.md +0 -0
  44. {datachain-0.28.0 → datachain-0.28.2}/docs/commands/job/cancel.md +0 -0
  45. {datachain-0.28.0 → datachain-0.28.2}/docs/commands/job/clusters.md +0 -0
  46. {datachain-0.28.0 → datachain-0.28.2}/docs/commands/job/logs.md +0 -0
  47. {datachain-0.28.0 → datachain-0.28.2}/docs/commands/job/ls.md +0 -0
  48. {datachain-0.28.0 → datachain-0.28.2}/docs/commands/job/run.md +0 -0
  49. {datachain-0.28.0 → datachain-0.28.2}/docs/contributing.md +0 -0
  50. {datachain-0.28.0 → datachain-0.28.2}/docs/css/github-permalink-style.css +0 -0
  51. {datachain-0.28.0 → datachain-0.28.2}/docs/examples.md +0 -0
  52. {datachain-0.28.0 → datachain-0.28.2}/docs/guide/db_migrations.md +0 -0
  53. {datachain-0.28.0 → datachain-0.28.2}/docs/guide/delta.md +0 -0
  54. {datachain-0.28.0 → datachain-0.28.2}/docs/guide/env.md +0 -0
  55. {datachain-0.28.0 → datachain-0.28.2}/docs/guide/index.md +0 -0
  56. {datachain-0.28.0 → datachain-0.28.2}/docs/guide/namespaces.md +0 -0
  57. {datachain-0.28.0 → datachain-0.28.2}/docs/guide/processing.md +0 -0
  58. {datachain-0.28.0 → datachain-0.28.2}/docs/guide/remotes.md +0 -0
  59. {datachain-0.28.0 → datachain-0.28.2}/docs/guide/retry.md +0 -0
  60. {datachain-0.28.0 → datachain-0.28.2}/docs/index.md +0 -0
  61. {datachain-0.28.0 → datachain-0.28.2}/docs/overrides/main.html +0 -0
  62. {datachain-0.28.0 → datachain-0.28.2}/docs/quick-start.md +0 -0
  63. {datachain-0.28.0 → datachain-0.28.2}/docs/references/data-types/arrowrow.md +0 -0
  64. {datachain-0.28.0 → datachain-0.28.2}/docs/references/data-types/bbox.md +0 -0
  65. {datachain-0.28.0 → datachain-0.28.2}/docs/references/data-types/file.md +0 -0
  66. {datachain-0.28.0 → datachain-0.28.2}/docs/references/data-types/imagefile.md +0 -0
  67. {datachain-0.28.0 → datachain-0.28.2}/docs/references/data-types/index.md +0 -0
  68. {datachain-0.28.0 → datachain-0.28.2}/docs/references/data-types/pose.md +0 -0
  69. {datachain-0.28.0 → datachain-0.28.2}/docs/references/data-types/segment.md +0 -0
  70. {datachain-0.28.0 → datachain-0.28.2}/docs/references/data-types/tarvfile.md +0 -0
  71. {datachain-0.28.0 → datachain-0.28.2}/docs/references/data-types/textfile.md +0 -0
  72. {datachain-0.28.0 → datachain-0.28.2}/docs/references/data-types/videofile.md +0 -0
  73. {datachain-0.28.0 → datachain-0.28.2}/docs/references/datachain.md +0 -0
  74. {datachain-0.28.0 → datachain-0.28.2}/docs/references/func.md +0 -0
  75. {datachain-0.28.0 → datachain-0.28.2}/docs/references/index.md +0 -0
  76. {datachain-0.28.0 → datachain-0.28.2}/docs/references/toolkit.md +0 -0
  77. {datachain-0.28.0 → datachain-0.28.2}/docs/references/torch.md +0 -0
  78. {datachain-0.28.0 → datachain-0.28.2}/docs/references/udf.md +0 -0
  79. {datachain-0.28.0 → datachain-0.28.2}/docs/tutorials.md +0 -0
  80. {datachain-0.28.0 → datachain-0.28.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  81. {datachain-0.28.0 → datachain-0.28.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  82. {datachain-0.28.0 → datachain-0.28.2}/examples/computer_vision/openimage-detect.py +0 -0
  83. {datachain-0.28.0 → datachain-0.28.2}/examples/computer_vision/ultralytics-bbox.py +0 -0
  84. {datachain-0.28.0 → datachain-0.28.2}/examples/computer_vision/ultralytics-pose.py +0 -0
  85. {datachain-0.28.0 → datachain-0.28.2}/examples/computer_vision/ultralytics-segment.py +0 -0
  86. {datachain-0.28.0 → datachain-0.28.2}/examples/get_started/common_sql_functions.py +0 -0
  87. {datachain-0.28.0 → datachain-0.28.2}/examples/get_started/json-csv-reader.py +0 -0
  88. {datachain-0.28.0 → datachain-0.28.2}/examples/get_started/torch-loader.py +0 -0
  89. {datachain-0.28.0 → datachain-0.28.2}/examples/get_started/udfs/parallel.py +0 -0
  90. {datachain-0.28.0 → datachain-0.28.2}/examples/get_started/udfs/simple.py +0 -0
  91. {datachain-0.28.0 → datachain-0.28.2}/examples/get_started/udfs/stateful.py +0 -0
  92. {datachain-0.28.0 → datachain-0.28.2}/examples/incremental_processing/delta.py +0 -0
  93. {datachain-0.28.0 → datachain-0.28.2}/examples/incremental_processing/retry.py +0 -0
  94. {datachain-0.28.0 → datachain-0.28.2}/examples/incremental_processing/utils.py +0 -0
  95. {datachain-0.28.0 → datachain-0.28.2}/examples/llm_and_nlp/claude-query.py +0 -0
  96. {datachain-0.28.0 → datachain-0.28.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  97. {datachain-0.28.0 → datachain-0.28.2}/examples/multimodal/audio-to-text.py +0 -0
  98. {datachain-0.28.0 → datachain-0.28.2}/examples/multimodal/clip_inference.py +0 -0
  99. {datachain-0.28.0 → datachain-0.28.2}/examples/multimodal/hf_pipeline.py +0 -0
  100. {datachain-0.28.0 → datachain-0.28.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
  101. {datachain-0.28.0 → datachain-0.28.2}/examples/multimodal/wds.py +0 -0
  102. {datachain-0.28.0 → datachain-0.28.2}/examples/multimodal/wds_filtered.py +0 -0
  103. {datachain-0.28.0 → datachain-0.28.2}/mkdocs.yml +0 -0
  104. {datachain-0.28.0 → datachain-0.28.2}/noxfile.py +0 -0
  105. {datachain-0.28.0 → datachain-0.28.2}/pyproject.toml +0 -0
  106. {datachain-0.28.0 → datachain-0.28.2}/setup.cfg +0 -0
  107. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/__init__.py +0 -0
  108. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/__main__.py +0 -0
  109. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/asyn.py +0 -0
  110. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cache.py +0 -0
  111. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/catalog/__init__.py +0 -0
  112. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/catalog/catalog.py +0 -0
  113. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/catalog/datasource.py +0 -0
  114. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/catalog/loader.py +0 -0
  115. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/__init__.py +0 -0
  116. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/commands/__init__.py +0 -0
  117. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/commands/datasets.py +0 -0
  118. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/commands/du.py +0 -0
  119. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/commands/index.py +0 -0
  120. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/commands/ls.py +0 -0
  121. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/commands/misc.py +0 -0
  122. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/commands/query.py +0 -0
  123. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/commands/show.py +0 -0
  124. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/parser/__init__.py +0 -0
  125. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/parser/job.py +0 -0
  126. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/parser/studio.py +0 -0
  127. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/parser/utils.py +0 -0
  128. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/cli/utils.py +0 -0
  129. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/client/__init__.py +0 -0
  130. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/client/azure.py +0 -0
  131. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/client/fileslice.py +0 -0
  132. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/client/fsspec.py +0 -0
  133. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/client/gcs.py +0 -0
  134. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/client/hf.py +0 -0
  135. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/client/local.py +0 -0
  136. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/client/s3.py +0 -0
  137. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/config.py +0 -0
  138. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/data_storage/__init__.py +0 -0
  139. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/data_storage/db_engine.py +0 -0
  140. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/data_storage/job.py +0 -0
  141. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/data_storage/metastore.py +0 -0
  142. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/data_storage/schema.py +0 -0
  143. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/data_storage/serializer.py +0 -0
  144. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/data_storage/sqlite.py +0 -0
  145. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/data_storage/warehouse.py +0 -0
  146. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/dataset.py +0 -0
  147. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/delta.py +0 -0
  148. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/diff/__init__.py +0 -0
  149. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/error.py +0 -0
  150. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/fs/__init__.py +0 -0
  151. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/fs/reference.py +0 -0
  152. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/fs/utils.py +0 -0
  153. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/func/__init__.py +0 -0
  154. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/func/aggregate.py +0 -0
  155. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/func/array.py +0 -0
  156. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/func/base.py +0 -0
  157. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/func/conditional.py +0 -0
  158. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/func/func.py +0 -0
  159. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/func/numeric.py +0 -0
  160. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/func/path.py +0 -0
  161. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/func/random.py +0 -0
  162. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/func/string.py +0 -0
  163. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/func/window.py +0 -0
  164. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/job.py +0 -0
  165. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/__init__.py +0 -0
  166. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/arrow.py +0 -0
  167. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/audio.py +0 -0
  168. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/clip.py +0 -0
  169. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/convert/__init__.py +0 -0
  170. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/convert/flatten.py +0 -0
  171. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
  172. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
  173. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/convert/unflatten.py +0 -0
  174. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  175. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/data_model.py +0 -0
  176. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dataset_info.py +0 -0
  177. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/__init__.py +0 -0
  178. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/csv.py +0 -0
  179. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/database.py +0 -0
  180. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/datasets.py +0 -0
  181. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/hf.py +0 -0
  182. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/json.py +0 -0
  183. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/listings.py +0 -0
  184. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/pandas.py +0 -0
  185. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/parquet.py +0 -0
  186. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/storage.py +0 -0
  187. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/utils.py +0 -0
  188. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/dc/values.py +0 -0
  189. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/hf.py +0 -0
  190. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/image.py +0 -0
  191. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/listing.py +0 -0
  192. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/listing_info.py +0 -0
  193. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/meta_formats.py +0 -0
  194. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/model_store.py +0 -0
  195. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/namespaces.py +0 -0
  196. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/projects.py +0 -0
  197. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/pytorch.py +0 -0
  198. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/signal_schema.py +0 -0
  199. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/tar.py +0 -0
  200. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/text.py +0 -0
  201. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/udf_signature.py +0 -0
  202. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/video.py +0 -0
  203. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/webdataset.py +0 -0
  204. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/lib/webdataset_laion.py +0 -0
  205. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/listing.py +0 -0
  206. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/model/__init__.py +0 -0
  207. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/model/bbox.py +0 -0
  208. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/model/pose.py +0 -0
  209. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/model/segment.py +0 -0
  210. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/model/ultralytics/__init__.py +0 -0
  211. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/model/ultralytics/bbox.py +0 -0
  212. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/model/ultralytics/pose.py +0 -0
  213. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/model/ultralytics/segment.py +0 -0
  214. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/model/utils.py +0 -0
  215. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/namespace.py +0 -0
  216. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/node.py +0 -0
  217. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/nodes_fetcher.py +0 -0
  218. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/nodes_thread_pool.py +0 -0
  219. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/progress.py +0 -0
  220. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/project.py +0 -0
  221. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/py.typed +0 -0
  222. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/query/__init__.py +0 -0
  223. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/query/batch.py +0 -0
  224. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/query/dispatch.py +0 -0
  225. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/query/metrics.py +0 -0
  226. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/query/params.py +0 -0
  227. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/query/queue.py +0 -0
  228. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/query/schema.py +0 -0
  229. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/query/session.py +0 -0
  230. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/query/udf.py +0 -0
  231. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/query/utils.py +0 -0
  232. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/remote/__init__.py +0 -0
  233. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/remote/studio.py +0 -0
  234. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/script_meta.py +0 -0
  235. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/semver.py +0 -0
  236. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/__init__.py +0 -0
  237. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/default/__init__.py +0 -0
  238. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/default/base.py +0 -0
  239. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/functions/__init__.py +0 -0
  240. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/functions/aggregate.py +0 -0
  241. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/functions/array.py +0 -0
  242. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/functions/conditional.py +0 -0
  243. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/functions/numeric.py +0 -0
  244. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/functions/path.py +0 -0
  245. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/functions/random.py +0 -0
  246. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/functions/string.py +0 -0
  247. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/selectable.py +0 -0
  248. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/sqlite/__init__.py +0 -0
  249. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/sqlite/base.py +0 -0
  250. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/sqlite/types.py +0 -0
  251. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/sqlite/vector.py +0 -0
  252. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/types.py +0 -0
  253. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/sql/utils.py +0 -0
  254. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/studio.py +0 -0
  255. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/telemetry.py +0 -0
  256. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/toolkit/__init__.py +0 -0
  257. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/toolkit/split.py +0 -0
  258. {datachain-0.28.0 → datachain-0.28.2}/src/datachain/torch/__init__.py +0 -0
  259. {datachain-0.28.0 → datachain-0.28.2}/src/datachain.egg-info/dependency_links.txt +0 -0
  260. {datachain-0.28.0 → datachain-0.28.2}/src/datachain.egg-info/entry_points.txt +0 -0
  261. {datachain-0.28.0 → datachain-0.28.2}/src/datachain.egg-info/requires.txt +0 -0
  262. {datachain-0.28.0 → datachain-0.28.2}/src/datachain.egg-info/top_level.txt +0 -0
  263. {datachain-0.28.0 → datachain-0.28.2}/tests/__init__.py +0 -0
  264. {datachain-0.28.0 → datachain-0.28.2}/tests/benchmarks/__init__.py +0 -0
  265. {datachain-0.28.0 → datachain-0.28.2}/tests/benchmarks/conftest.py +0 -0
  266. {datachain-0.28.0 → datachain-0.28.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  267. {datachain-0.28.0 → datachain-0.28.2}/tests/benchmarks/datasets/.dvc/config +0 -0
  268. {datachain-0.28.0 → datachain-0.28.2}/tests/benchmarks/datasets/.gitignore +0 -0
  269. {datachain-0.28.0 → datachain-0.28.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  270. {datachain-0.28.0 → datachain-0.28.2}/tests/benchmarks/test_datachain.py +0 -0
  271. {datachain-0.28.0 → datachain-0.28.2}/tests/benchmarks/test_ls.py +0 -0
  272. {datachain-0.28.0 → datachain-0.28.2}/tests/benchmarks/test_version.py +0 -0
  273. {datachain-0.28.0 → datachain-0.28.2}/tests/conftest.py +0 -0
  274. {datachain-0.28.0 → datachain-0.28.2}/tests/data.py +0 -0
  275. {datachain-0.28.0 → datachain-0.28.2}/tests/examples/__init__.py +0 -0
  276. {datachain-0.28.0 → datachain-0.28.2}/tests/examples/test_examples.py +0 -0
  277. {datachain-0.28.0 → datachain-0.28.2}/tests/examples/test_wds_e2e.py +0 -0
  278. {datachain-0.28.0 → datachain-0.28.2}/tests/examples/wds_data.py +0 -0
  279. {datachain-0.28.0 → datachain-0.28.2}/tests/func/__init__.py +0 -0
  280. {datachain-0.28.0 → datachain-0.28.2}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  281. {datachain-0.28.0 → datachain-0.28.2}/tests/func/data/lena.jpg +0 -0
  282. {datachain-0.28.0 → datachain-0.28.2}/tests/func/fake-service-account-credentials.json +0 -0
  283. {datachain-0.28.0 → datachain-0.28.2}/tests/func/functions/__init__.py +0 -0
  284. {datachain-0.28.0 → datachain-0.28.2}/tests/func/functions/test_aggregate.py +0 -0
  285. {datachain-0.28.0 → datachain-0.28.2}/tests/func/functions/test_array.py +0 -0
  286. {datachain-0.28.0 → datachain-0.28.2}/tests/func/functions/test_conditional.py +0 -0
  287. {datachain-0.28.0 → datachain-0.28.2}/tests/func/functions/test_numeric.py +0 -0
  288. {datachain-0.28.0 → datachain-0.28.2}/tests/func/functions/test_path.py +0 -0
  289. {datachain-0.28.0 → datachain-0.28.2}/tests/func/functions/test_random.py +0 -0
  290. {datachain-0.28.0 → datachain-0.28.2}/tests/func/functions/test_string.py +0 -0
  291. {datachain-0.28.0 → datachain-0.28.2}/tests/func/model/__init__.py +0 -0
  292. {datachain-0.28.0 → datachain-0.28.2}/tests/func/model/data/running-mask0.png +0 -0
  293. {datachain-0.28.0 → datachain-0.28.2}/tests/func/model/data/running-mask1.png +0 -0
  294. {datachain-0.28.0 → datachain-0.28.2}/tests/func/model/data/running.jpg +0 -0
  295. {datachain-0.28.0 → datachain-0.28.2}/tests/func/model/data/ships.jpg +0 -0
  296. {datachain-0.28.0 → datachain-0.28.2}/tests/func/model/test_yolo.py +0 -0
  297. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_audio.py +0 -0
  298. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_batching.py +0 -0
  299. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_catalog.py +0 -0
  300. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_client.py +0 -0
  301. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_cloud_transfer.py +0 -0
  302. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_data_storage.py +0 -0
  303. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_datachain_merge.py +0 -0
  304. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_dataset_query.py +0 -0
  305. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_datasets.py +0 -0
  306. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_delta.py +0 -0
  307. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_feature_pickling.py +0 -0
  308. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_file.py +0 -0
  309. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_hidden_field.py +0 -0
  310. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_image.py +0 -0
  311. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_listing.py +0 -0
  312. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_ls.py +0 -0
  313. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_meta_formats.py +0 -0
  314. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_metastore.py +0 -0
  315. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_metrics.py +0 -0
  316. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_pull.py +0 -0
  317. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_pytorch.py +0 -0
  318. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_query.py +0 -0
  319. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_read_database.py +0 -0
  320. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_read_dataset_remote.py +0 -0
  321. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  322. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_retry.py +0 -0
  323. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_session.py +0 -0
  324. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_studio_datetime_parsing.py +0 -0
  325. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_toolkit.py +0 -0
  326. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_video.py +0 -0
  327. {datachain-0.28.0 → datachain-0.28.2}/tests/func/test_warehouse.py +0 -0
  328. {datachain-0.28.0 → datachain-0.28.2}/tests/scripts/feature_class.py +0 -0
  329. {datachain-0.28.0 → datachain-0.28.2}/tests/scripts/feature_class_exception.py +0 -0
  330. {datachain-0.28.0 → datachain-0.28.2}/tests/scripts/feature_class_parallel.py +0 -0
  331. {datachain-0.28.0 → datachain-0.28.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  332. {datachain-0.28.0 → datachain-0.28.2}/tests/scripts/name_len_slow.py +0 -0
  333. {datachain-0.28.0 → datachain-0.28.2}/tests/test_atomicity.py +0 -0
  334. {datachain-0.28.0 → datachain-0.28.2}/tests/test_cli_e2e.py +0 -0
  335. {datachain-0.28.0 → datachain-0.28.2}/tests/test_cli_studio.py +0 -0
  336. {datachain-0.28.0 → datachain-0.28.2}/tests/test_import_time.py +0 -0
  337. {datachain-0.28.0 → datachain-0.28.2}/tests/test_query_e2e.py +0 -0
  338. {datachain-0.28.0 → datachain-0.28.2}/tests/test_telemetry.py +0 -0
  339. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/__init__.py +0 -0
  340. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/__init__.py +0 -0
  341. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/conftest.py +0 -0
  342. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_arrow.py +0 -0
  343. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_audio.py +0 -0
  344. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_clip.py +0 -0
  345. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_datachain.py +0 -0
  346. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  347. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_datachain_merge.py +0 -0
  348. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_diff.py +0 -0
  349. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_feature.py +0 -0
  350. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_feature_utils.py +0 -0
  351. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_hf.py +0 -0
  352. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_image.py +0 -0
  353. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_listing_info.py +0 -0
  354. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_namespace.py +0 -0
  355. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_partition_by.py +0 -0
  356. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_project.py +0 -0
  357. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_python_to_sql.py +0 -0
  358. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_schema.py +0 -0
  359. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_signal_schema.py +0 -0
  360. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_sql_to_python.py +0 -0
  361. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_text.py +0 -0
  362. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_udf.py +0 -0
  363. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_udf_signature.py +0 -0
  364. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/lib/test_webdataset.py +0 -0
  365. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/model/__init__.py +0 -0
  366. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/model/test_bbox.py +0 -0
  367. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/model/test_pose.py +0 -0
  368. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/model/test_segment.py +0 -0
  369. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/model/test_utils.py +0 -0
  370. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/sql/__init__.py +0 -0
  371. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/sql/sqlite/__init__.py +0 -0
  372. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/sql/sqlite/test_types.py +0 -0
  373. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
  374. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/sql/test_array.py +0 -0
  375. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/sql/test_conditional.py +0 -0
  376. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/sql/test_path.py +0 -0
  377. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/sql/test_random.py +0 -0
  378. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/sql/test_selectable.py +0 -0
  379. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/sql/test_string.py +0 -0
  380. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_asyn.py +0 -0
  381. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_cache.py +0 -0
  382. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_catalog.py +0 -0
  383. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_catalog_loader.py +0 -0
  384. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_cli_parsing.py +0 -0
  385. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_client.py +0 -0
  386. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_client_gcs.py +0 -0
  387. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_client_s3.py +0 -0
  388. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_config.py +0 -0
  389. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_data_storage.py +0 -0
  390. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_database_engine.py +0 -0
  391. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_dataset.py +0 -0
  392. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_dispatch.py +0 -0
  393. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_fileslice.py +0 -0
  394. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_func.py +0 -0
  395. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_listing.py +0 -0
  396. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_metastore.py +0 -0
  397. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_module_exports.py +0 -0
  398. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_pytorch.py +0 -0
  399. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_query.py +0 -0
  400. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_query_metrics.py +0 -0
  401. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_query_params.py +0 -0
  402. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_script_meta.py +0 -0
  403. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_semver.py +0 -0
  404. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_serializer.py +0 -0
  405. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_session.py +0 -0
  406. {datachain-0.28.0 → datachain-0.28.2}/tests/unit/test_warehouse.py +0 -0
  407. {datachain-0.28.0 → datachain-0.28.2}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.12.4'
27
+ rev: 'v0.12.7'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.28.0
3
+ Version: 0.28.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -324,6 +324,7 @@ class DataChain:
324
324
  sys: Optional[bool] = None,
325
325
  namespace: Optional[str] = None,
326
326
  project: Optional[str] = None,
327
+ batch_rows: Optional[int] = None,
327
328
  ) -> "Self":
328
329
  """Change settings for chain.
329
330
 
@@ -331,22 +332,24 @@ class DataChain:
331
332
  It returns chain, so, it can be chained later with next operation.
332
333
 
333
334
  Parameters:
334
- cache : data caching (default=False)
335
+ cache : data caching. (default=False)
335
336
  parallel : number of thread for processors. True is a special value to
336
- enable all available CPUs (default=1)
337
+ enable all available CPUs. (default=1)
337
338
  workers : number of distributed workers. Only for Studio mode. (default=1)
338
- min_task_size : minimum number of tasks (default=1)
339
- prefetch: number of workers to use for downloading files in advance.
339
+ min_task_size : minimum number of tasks. (default=1)
340
+ prefetch : number of workers to use for downloading files in advance.
340
341
  This is enabled by default and uses 2 workers.
341
342
  To disable prefetching, set it to 0.
342
- namespace: namespace name.
343
- project: project name.
343
+ namespace : namespace name.
344
+ project : project name.
345
+ batch_rows : row limit per insert to balance speed and memory usage.
346
+ (default=2000)
344
347
 
345
348
  Example:
346
349
  ```py
347
350
  chain = (
348
351
  chain
349
- .settings(cache=True, parallel=8)
352
+ .settings(cache=True, parallel=8, batch_rows=300)
350
353
  .map(laion=process_webdataset(spec=WDSLaion), params="file")
351
354
  )
352
355
  ```
@@ -356,7 +359,14 @@ class DataChain:
356
359
  settings = copy.copy(self._settings)
357
360
  settings.add(
358
361
  Settings(
359
- cache, parallel, workers, min_task_size, prefetch, namespace, project
362
+ cache,
363
+ parallel,
364
+ workers,
365
+ min_task_size,
366
+ prefetch,
367
+ namespace,
368
+ project,
369
+ batch_rows,
360
370
  )
361
371
  )
362
372
  return self._evolve(settings=settings, _sys=sys)
@@ -711,7 +721,7 @@ class DataChain:
711
721
 
712
722
  return self._evolve(
713
723
  query=self._query.add_signals(
714
- udf_obj.to_udf_wrapper(),
724
+ udf_obj.to_udf_wrapper(self._settings.batch_rows),
715
725
  **self._settings.to_dict(),
716
726
  ),
717
727
  signal_schema=self.signals_schema | udf_obj.output,
@@ -749,7 +759,7 @@ class DataChain:
749
759
  udf_obj.prefetch = prefetch
750
760
  return self._evolve(
751
761
  query=self._query.generate(
752
- udf_obj.to_udf_wrapper(),
762
+ udf_obj.to_udf_wrapper(self._settings.batch_rows),
753
763
  **self._settings.to_dict(),
754
764
  ),
755
765
  signal_schema=udf_obj.output,
@@ -885,7 +895,7 @@ class DataChain:
885
895
  udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
886
896
  return self._evolve(
887
897
  query=self._query.generate(
888
- udf_obj.to_udf_wrapper(),
898
+ udf_obj.to_udf_wrapper(self._settings.batch_rows),
889
899
  partition_by=processed_partition_by,
890
900
  **self._settings.to_dict(),
891
901
  ),
@@ -917,11 +927,24 @@ class DataChain:
917
927
  )
918
928
  chain.save("new_dataset")
919
929
  ```
930
+
931
+ .. deprecated:: 0.29.0
932
+ This method is deprecated and will be removed in a future version.
933
+ Use `agg()` instead, which provides the similar functionality.
920
934
  """
935
+ import warnings
936
+
937
+ warnings.warn(
938
+ "batch_map() is deprecated and will be removed in a future version. "
939
+ "Use agg() instead, which provides the similar functionality.",
940
+ DeprecationWarning,
941
+ stacklevel=2,
942
+ )
921
943
  udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
944
+
922
945
  return self._evolve(
923
946
  query=self._query.add_signals(
924
- udf_obj.to_udf_wrapper(batch),
947
+ udf_obj.to_udf_wrapper(self._settings.batch_rows, batch=batch),
925
948
  **self._settings.to_dict(),
926
949
  ),
927
950
  signal_schema=self.signals_schema | udf_obj.output,
@@ -2340,7 +2363,7 @@ class DataChain:
2340
2363
  def setup(self, **kwargs) -> "Self":
2341
2364
  """Setup variables to pass to UDF functions.
2342
2365
 
2343
- Use before running map/gen/agg/batch_map to save an object and pass it as an
2366
+ Use before running map/gen/agg to save an object and pass it as an
2344
2367
  argument to the UDF.
2345
2368
 
2346
2369
  The value must be a callable (a `lambda: <value>` syntax can be used to quickly
@@ -2419,9 +2442,11 @@ class DataChain:
2419
2442
  ds.to_storage("gs://mybucket", placement="filename")
2420
2443
  ```
2421
2444
  """
2445
+ chain = self.persist()
2446
+ count = chain.count()
2447
+
2422
2448
  if placement == "filename" and (
2423
- self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
2424
- != self._query.count()
2449
+ chain._query.distinct(pathfunc.name(C(f"{signal}__path"))).count() != count
2425
2450
  ):
2426
2451
  raise ValueError("Files with the same name found")
2427
2452
 
@@ -2433,7 +2458,7 @@ class DataChain:
2433
2458
  unit=" files",
2434
2459
  unit_scale=True,
2435
2460
  unit_divisor=10,
2436
- total=self.count(),
2461
+ total=count,
2437
2462
  leave=False,
2438
2463
  )
2439
2464
  file_exporter = FileExporter(
@@ -2444,7 +2469,10 @@ class DataChain:
2444
2469
  max_threads=num_threads or 1,
2445
2470
  client_config=client_config,
2446
2471
  )
2447
- file_exporter.run(self.to_values(signal), progress_bar)
2472
+ file_exporter.run(
2473
+ (rows[0] for rows in chain.to_iter(signal)),
2474
+ progress_bar,
2475
+ )
2448
2476
 
2449
2477
  def shuffle(self) -> "Self":
2450
2478
  """Shuffle the rows of the chain deterministically."""
@@ -15,6 +15,8 @@ if TYPE_CHECKING:
15
15
 
16
16
  P = ParamSpec("P")
17
17
 
18
+ READ_RECORDS_BATCH_SIZE = 10000
19
+
18
20
 
19
21
  def read_records(
20
22
  to_insert: Optional[Union[dict, Iterable[dict]]],
@@ -41,7 +43,7 @@ def read_records(
41
43
  Notes:
42
44
  This call blocks until all records are inserted.
43
45
  """
44
- from datachain.query.dataset import INSERT_BATCH_SIZE, adjust_outputs, get_col_types
46
+ from datachain.query.dataset import adjust_outputs, get_col_types
45
47
  from datachain.sql.types import SQLType
46
48
  from datachain.utils import batched
47
49
 
@@ -94,7 +96,7 @@ def read_records(
94
96
  {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
95
97
  )
96
98
  records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
97
- for chunk in batched(records, INSERT_BATCH_SIZE):
99
+ for chunk in batched(records, READ_RECORDS_BATCH_SIZE):
98
100
  warehouse.insert_rows(table, chunk)
99
101
  warehouse.insert_rows_done(table)
100
102
  return read_dataset(name=dsr.full_name, session=session, settings=settings)
@@ -23,7 +23,7 @@ from pydantic import Field, field_validator
23
23
 
24
24
  from datachain.client.fileslice import FileSlice
25
25
  from datachain.lib.data_model import DataModel
26
- from datachain.lib.utils import DataChainError
26
+ from datachain.lib.utils import DataChainError, rebase_path
27
27
  from datachain.nodes_thread_pool import NodesThreadPool
28
28
  from datachain.sql.types import JSON, Boolean, DateTime, Int, String
29
29
  from datachain.utils import TIME_ZERO
@@ -634,6 +634,40 @@ class File(DataModel):
634
634
  location=self.location,
635
635
  )
636
636
 
637
def rebase(
    self,
    old_base: str,
    new_base: str,
    suffix: str = "",
    extension: str = "",
) -> str:
    """
    Rebase the file's URI from one base directory to another.

    Thin wrapper that passes ``self.get_uri()`` and the arguments unchanged
    to ``rebase_path``.

    Args:
        old_base: Base directory to remove from the file's URI
        new_base: New base directory to prepend
        suffix: Optional suffix to add before file extension
        extension: Optional new file extension (without dot)

    Returns:
        str: Rebased URI with new base directory

    Raises:
        ValueError: If old_base is not found in the file's URI

    Examples:
        >>> file = File(source="s3://bucket", path="data/2025-05-27/file.wav")
        >>> file.rebase(
        ...     "s3://bucket/data",
        ...     "s3://output-bucket/processed",
        ...     extension="mp3",
        ... )
        's3://output-bucket/processed/2025-05-27/file.mp3'

        >>> file = File(source="s3://bucket", path="data/audio/file.wav")
        >>> file.rebase(
        ...     "data/audio", "/local/output", suffix="_ch1", extension="npy"
        ... )
        '/local/output/file_ch1.npy'
    """
    return rebase_path(self.get_uri(), old_base, new_base, suffix, extension)
670
+
637
671
 
638
672
  def resolve(file: File) -> File:
639
673
  """
@@ -1219,6 +1253,24 @@ class Audio(DataModel):
1219
1253
  codec: str = Field(default="")
1220
1254
  bit_rate: int = Field(default=-1)
1221
1255
 
1256
@staticmethod
def get_channel_name(num_channels: int, channel_idx: int) -> str:
    """Return a readable label for one channel of an audio layout.

    Known layouts (1, 2, 4, 6 and 8 channels) map each index to a
    conventional name. Any other channel count, or an index outside the
    layout, falls back to the generic one-based label ``Ch<N>``.

    Args:
        num_channels: Total number of channels in the stream.
        channel_idx: Zero-based index of the channel to name.

    Returns:
        The layout-specific name, or ``f"Ch{channel_idx + 1}"``.
    """
    layouts = {
        1: ["Mono"],
        2: ["Left", "Right"],
        4: ["W", "X", "Y", "Z"],  # First-order Ambisonics
        6: ["FL", "FR", "FC", "LFE", "BL", "BR"],  # 5.1 surround
        8: ["FL", "FR", "FC", "LFE", "BL", "BR", "SL", "SR"],  # 7.1 surround
    }

    # An unknown layout behaves like an empty one: every index is out of range.
    names = layouts.get(num_channels, [])
    if 0 <= channel_idx < len(names):
        return names[channel_idx]

    return f"Ch{channel_idx + 1}"
1273
+
1222
1274
 
1223
1275
  class ArrowRow(DataModel):
1224
1276
  """`DataModel` for reading row from Arrow-supported file."""
@@ -1,4 +1,5 @@
1
1
  from datachain.lib.utils import DataChainParamsError
2
+ from datachain.utils import DEFAULT_CHUNK_ROWS
2
3
 
3
4
 
4
5
  class SettingsError(DataChainParamsError):
@@ -16,6 +17,7 @@ class Settings:
16
17
  prefetch=None,
17
18
  namespace=None,
18
19
  project=None,
20
+ batch_rows=None,
19
21
  ):
20
22
  self._cache = cache
21
23
  self.parallel = parallel
@@ -24,6 +26,7 @@ class Settings:
24
26
  self.prefetch = prefetch
25
27
  self.namespace = namespace
26
28
  self.project = project
29
+ self._chunk_rows = batch_rows
27
30
 
28
31
  if not isinstance(cache, bool) and cache is not None:
29
32
  raise SettingsError(
@@ -53,6 +56,18 @@ class Settings:
53
56
  f", {min_task_size.__class__.__name__} was given"
54
57
  )
55
58
 
59
+ if batch_rows is not None and not isinstance(batch_rows, int):
60
+ raise SettingsError(
61
+ "'batch_rows' argument must be int or None"
62
+ f", {batch_rows.__class__.__name__} was given"
63
+ )
64
+
65
+ if batch_rows is not None and batch_rows <= 0:
66
+ raise SettingsError(
67
+ "'batch_rows' argument must be positive integer"
68
+ f", {batch_rows} was given"
69
+ )
70
+
56
71
  @property
57
72
  def cache(self):
58
73
  return self._cache if self._cache is not None else False
@@ -61,6 +76,10 @@ class Settings:
61
76
  def workers(self):
62
77
  return self._workers if self._workers is not None else False
63
78
 
79
@property
def batch_rows(self):
    """Row limit per insert: the explicit setting, else ``DEFAULT_CHUNK_ROWS``."""
    if self._chunk_rows is None:
        return DEFAULT_CHUNK_ROWS
    return self._chunk_rows
82
+
64
83
  def to_dict(self):
65
84
  res = {}
66
85
  if self._cache is not None:
@@ -75,6 +94,8 @@ class Settings:
75
94
  res["namespace"] = self.namespace
76
95
  if self.project is not None:
77
96
  res["project"] = self.project
97
+ if self._chunk_rows is not None:
98
+ res["batch_rows"] = self._chunk_rows
78
99
  return res
79
100
 
80
101
  def add(self, settings: "Settings"):
@@ -86,3 +107,5 @@ class Settings:
86
107
  self.project = settings.project or self.project
87
108
  if settings.prefetch is not None:
88
109
  self.prefetch = settings.prefetch
110
+ if settings._chunk_rows is not None:
111
+ self._chunk_rows = settings._chunk_rows
@@ -62,19 +62,21 @@ class UDFProperties:
62
62
  return self.udf.get_batching(use_partitioning)
63
63
 
64
64
  @property
65
- def batch(self):
66
- return self.udf.batch
65
+ def batch_rows(self):
66
+ return self.udf.batch_rows
67
67
 
68
68
 
69
69
  @attrs.define(slots=False)
70
70
  class UDFAdapter:
71
71
  inner: "UDFBase"
72
72
  output: UDFOutputSpec
73
+ batch_rows: Optional[int] = None
73
74
  batch: int = 1
74
75
 
75
76
  def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
76
77
  if use_partitioning:
77
78
  return Partition()
79
+
78
80
  if self.batch == 1:
79
81
  return NoBatching()
80
82
  if self.batch > 1:
@@ -233,10 +235,15 @@ class UDFBase(AbstractUDF):
233
235
  def signal_names(self) -> Iterable[str]:
234
236
  return self.output.to_udf_spec().keys()
235
237
 
236
- def to_udf_wrapper(self, batch: int = 1) -> UDFAdapter:
238
+ def to_udf_wrapper(
239
+ self,
240
+ batch_rows: Optional[int] = None,
241
+ batch: int = 1,
242
+ ) -> UDFAdapter:
237
243
  return UDFAdapter(
238
244
  self,
239
245
  self.output.to_udf_spec(),
246
+ batch_rows,
240
247
  batch,
241
248
  )
242
249
 
@@ -418,11 +425,27 @@ class Mapper(UDFBase):
418
425
 
419
426
 
420
427
  class BatchMapper(UDFBase):
421
- """Inherit from this class to pass to `DataChain.batch_map()`."""
428
+ """Inherit from this class to pass to `DataChain.batch_map()`.
429
+
430
+ .. deprecated:: 0.29.0
431
+ This class is deprecated and will be removed in a future version.
432
+ Use `Aggregator` instead, which provides the similar functionality.
433
+ """
422
434
 
423
435
  is_input_batched = True
424
436
  is_output_batched = True
425
437
 
438
def __init__(self):
    """Emit a ``DeprecationWarning`` and then run the parent initializer."""
    import warnings

    message = (
        "BatchMapper is deprecated and will be removed in a future version. "
        "Use Aggregator instead, which provides the similar functionality."
    )
    # stacklevel=2 attributes the warning to the code instantiating the class.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    super().__init__()
448
+
426
449
  def run(
427
450
  self,
428
451
  udf_fields: Sequence[str],
@@ -0,0 +1,155 @@
1
+ import re
2
+ from abc import ABC, abstractmethod
3
+ from collections.abc import Sequence
4
+ from pathlib import PurePosixPath
5
+ from urllib.parse import urlparse
6
+
7
+
8
class AbstractUDF(ABC):
    """Abstract interface that requires ``process``, ``setup`` and ``teardown``.

    A subclass cannot be instantiated until it overrides all three methods.
    """

    @abstractmethod
    def process(self, *args, **kwargs):
        """Abstract: must be overridden by subclasses."""

    @abstractmethod
    def setup(self):
        """Abstract: must be overridden by subclasses."""

    @abstractmethod
    def teardown(self):
        """Abstract: must be overridden by subclasses."""
20
+
21
+
22
class DataChainError(Exception):
    """Root of the exception hierarchy defined in this module."""


class DataChainParamsError(DataChainError):
    """Specialization of ``DataChainError``; parent of ``DataChainColumnError``."""


class DataChainColumnError(DataChainParamsError):
    """Parameter error tied to one column; the message names that column."""

    def __init__(self, col_name: str, msg: str):
        message = f"Error for column {col_name}: {msg}"
        super().__init__(message)
33
+
34
+
35
def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
    """Map identifier-safe column names back to the original names.

    Each name is lower-cased, runs of characters outside ``[0-9a-z]`` are
    collapsed to ``_``, and leading/trailing underscores are stripped. A
    result that is not a valid identifier, was already produced, or would
    shadow a *different* original column is replaced by ``c<N>_<name>``
    (or ``c<N>`` when nothing is left), where ``N`` is a counter shared by
    the whole call.

    Args:
        col_names: Original column names, in order.

    Returns:
        Dict of ``normalized_name -> original_name``.
    """
    originals = set(col_names)
    normalized: dict[str, str] = {}
    counter = 0

    def _needs_rename(candidate: str, source: str) -> bool:
        if not candidate.isidentifier():
            return True
        if candidate in normalized:
            return True
        # Never take a name that literally belongs to another input column.
        return candidate != source and candidate in originals

    for source in col_names:
        base = re.sub("[^0-9a-z]+", "_", source.lower()).strip("_")

        candidate = base
        while _needs_rename(candidate, source):
            prefix = f"c{counter}"
            candidate = f"{prefix}_{base}" if base else prefix
            counter += 1

        normalized[candidate] = source

    return normalized
62
+
63
+
64
def _split_scheme(uri: str) -> tuple[str, str]:
    """Split ``uri`` into ``(scheme, rest)`` without dropping ``?``/``#`` parts."""
    scheme = urlparse(uri).scheme
    if not scheme:
        return "", uri
    # Keep everything after "scheme:" verbatim. urlparse().path would cut the
    # text at "?" or "#", which are legal characters in object keys.
    rest = uri.partition(":")[2]
    if rest.startswith("//"):
        rest = rest[2:]
    return scheme, rest


def _find_base(path: str, base: str) -> int:
    """Return the first index where ``base`` occurs in ``path`` on component
    boundaries, or -1 if there is no such occurrence."""
    start = path.find(base)
    while start != -1:
        end = start + len(base)
        starts_clean = start == 0 or base.startswith("/") or path[start - 1] == "/"
        ends_clean = end == len(path) or base.endswith("/") or path[end] == "/"
        if starts_clean and ends_clean:
            return start
        start = path.find(base, start + 1)
    return -1


def rebase_path(
    src_path: str,
    old_base: str,
    new_base: str,
    suffix: str = "",
    extension: str = "",
) -> str:
    """
    Rebase a file path from one base directory to another.

    ``old_base`` may appear anywhere in ``src_path`` but must line up with
    whole path components: ``"data"`` matches ``bucket/data/f.wav`` and does
    not match ``bucket/mydata/f.wav``. The first such occurrence is used.

    Args:
        src_path: Source file path (can include URI scheme like s3://)
        old_base: Base directory to remove from src_path
        new_base: New base directory to prepend
        suffix: Optional suffix to add before file extension
        extension: Optional new file extension (without dot)

    Returns:
        str: Rebased path with new base directory

    Raises:
        ValueError: If old_base is not found in src_path
    """
    # Only the part after the scheme takes part in matching.
    _, src_path_only = _split_scheme(src_path)
    _, old_base_only = _split_scheme(old_base)
    new_scheme, new_base_only = _split_scheme(new_base)

    # Normalize paths (collapses duplicate and trailing slashes)
    src_path_norm = PurePosixPath(src_path_only).as_posix()
    old_base_norm = PurePosixPath(old_base_only).as_posix()

    idx = _find_base(src_path_norm, old_base_norm)
    if idx == -1:
        raise ValueError(f"old_base '{old_base}' not found in src_path")

    # Everything after old_base, minus the separating slash
    relative_path = src_path_norm[idx + len(old_base_norm) :]
    if relative_path.startswith("/"):
        relative_path = relative_path[1:]

    # Apply suffix and extension changes to the file name
    path_obj = PurePosixPath(relative_path)
    new_ext = f".{extension}" if extension else path_obj.suffix
    new_name = f"{path_obj.stem}{suffix}{new_ext}"

    # Reconstruct path with new base
    rebased = PurePosixPath(new_base_only) / path_obj.parent / new_name
    if new_scheme:
        return f"{new_scheme}://{rebased}"
    return str(rebased)
@@ -333,32 +333,24 @@ def process_udf_outputs(
333
333
  udf_table: "Table",
334
334
  udf_results: Iterator[Iterable["UDFResult"]],
335
335
  udf: "UDFAdapter",
336
- batch_size: int = INSERT_BATCH_SIZE,
337
336
  cb: Callback = DEFAULT_CALLBACK,
338
337
  ) -> None:
339
- import psutil
340
-
341
- rows: list[UDFResult] = []
342
338
  # Optimization: Compute row types once, rather than for every row.
343
339
  udf_col_types = get_col_types(warehouse, udf.output)
340
+ batch_rows = udf.batch_rows or INSERT_BATCH_SIZE
344
341
 
345
- for udf_output in udf_results:
346
- if not udf_output:
347
- continue
348
- with safe_closing(udf_output):
349
- for row in udf_output:
350
- cb.relative_update()
351
- rows.append(adjust_outputs(warehouse, row, udf_col_types))
352
- if len(rows) >= batch_size or (
353
- len(rows) % 10 == 0 and psutil.virtual_memory().percent > 80
354
- ):
355
- for row_chunk in batched(rows, batch_size):
356
- warehouse.insert_rows(udf_table, row_chunk)
357
- rows.clear()
342
+ def _insert_rows():
343
+ for udf_output in udf_results:
344
+ if not udf_output:
345
+ continue
346
+
347
+ with safe_closing(udf_output):
348
+ for row in udf_output:
349
+ cb.relative_update()
350
+ yield adjust_outputs(warehouse, row, udf_col_types)
358
351
 
359
- if rows:
360
- for row_chunk in batched(rows, batch_size):
361
- warehouse.insert_rows(udf_table, row_chunk)
352
+ for row_chunk in batched(_insert_rows(), batch_rows):
353
+ warehouse.insert_rows(udf_table, row_chunk)
362
354
 
363
355
  warehouse.insert_rows_done(udf_table)
364
356
 
@@ -401,6 +393,7 @@ class UDFStep(Step, ABC):
401
393
  min_task_size: Optional[int] = None
402
394
  is_generator = False
403
395
  cache: bool = False
396
+ batch_rows: Optional[int] = None
404
397
 
405
398
  @abstractmethod
406
399
  def create_udf_table(self, query: Select) -> "Table":
@@ -602,6 +595,7 @@ class UDFStep(Step, ABC):
602
595
  parallel=self.parallel,
603
596
  workers=self.workers,
604
597
  min_task_size=self.min_task_size,
598
+ batch_rows=self.batch_rows,
605
599
  )
606
600
  return self.__class__(self.udf, self.catalog)
607
601
 
@@ -1633,6 +1627,7 @@ class DatasetQuery:
1633
1627
  min_task_size: Optional[int] = None,
1634
1628
  partition_by: Optional[PartitionByType] = None,
1635
1629
  cache: bool = False,
1630
+ batch_rows: Optional[int] = None,
1636
1631
  ) -> "Self":
1637
1632
  """
1638
1633
  Adds one or more signals based on the results from the provided UDF.
@@ -1658,6 +1653,7 @@ class DatasetQuery:
1658
1653
  workers=workers,
1659
1654
  min_task_size=min_task_size,
1660
1655
  cache=cache,
1656
+ batch_rows=batch_rows,
1661
1657
  )
1662
1658
  )
1663
1659
  return query
@@ -1679,6 +1675,7 @@ class DatasetQuery:
1679
1675
  namespace: Optional[str] = None,
1680
1676
  project: Optional[str] = None,
1681
1677
  cache: bool = False,
1678
+ batch_rows: Optional[int] = None,
1682
1679
  ) -> "Self":
1683
1680
  query = self.clone()
1684
1681
  steps = query.steps
@@ -1691,6 +1688,7 @@ class DatasetQuery:
1691
1688
  workers=workers,
1692
1689
  min_task_size=min_task_size,
1693
1690
  cache=cache,
1691
+ batch_rows=batch_rows,
1694
1692
  )
1695
1693
  )
1696
1694
  return query