datachain 0.5.1__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (251)
  1. {datachain-0.5.1 → datachain-0.6.0}/.pre-commit-config.yaml +2 -2
  2. {datachain-0.5.1/src/datachain.egg-info → datachain-0.6.0}/PKG-INFO +1 -1
  3. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/values_to_tuples.py +2 -2
  4. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/data_model.py +1 -1
  5. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/dc.py +18 -2
  6. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/signal_schema.py +6 -6
  7. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/udf.py +177 -151
  8. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/batch.py +0 -10
  9. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/dataset.py +1 -1
  10. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/dispatch.py +0 -12
  11. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/string.py +12 -0
  12. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/sqlite/base.py +10 -5
  13. {datachain-0.5.1 → datachain-0.6.0/src/datachain.egg-info}/PKG-INFO +1 -1
  14. datachain-0.6.0/tests/scripts/feature_class_exception.py +11 -0
  15. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_datachain.py +11 -0
  16. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_string.py +15 -0
  17. datachain-0.5.1/tests/scripts/feature_class_exception.py +0 -24
  18. {datachain-0.5.1 → datachain-0.6.0}/.cruft.json +0 -0
  19. {datachain-0.5.1 → datachain-0.6.0}/.gitattributes +0 -0
  20. {datachain-0.5.1 → datachain-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  21. {datachain-0.5.1 → datachain-0.6.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  22. {datachain-0.5.1 → datachain-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  23. {datachain-0.5.1 → datachain-0.6.0}/.github/codecov.yaml +0 -0
  24. {datachain-0.5.1 → datachain-0.6.0}/.github/dependabot.yml +0 -0
  25. {datachain-0.5.1 → datachain-0.6.0}/.github/workflows/benchmarks.yml +0 -0
  26. {datachain-0.5.1 → datachain-0.6.0}/.github/workflows/release.yml +0 -0
  27. {datachain-0.5.1 → datachain-0.6.0}/.github/workflows/tests-studio.yml +0 -0
  28. {datachain-0.5.1 → datachain-0.6.0}/.github/workflows/tests.yml +0 -0
  29. {datachain-0.5.1 → datachain-0.6.0}/.github/workflows/update-template.yaml +0 -0
  30. {datachain-0.5.1 → datachain-0.6.0}/.gitignore +0 -0
  31. {datachain-0.5.1 → datachain-0.6.0}/CODE_OF_CONDUCT.rst +0 -0
  32. {datachain-0.5.1 → datachain-0.6.0}/CONTRIBUTING.rst +0 -0
  33. {datachain-0.5.1 → datachain-0.6.0}/LICENSE +0 -0
  34. {datachain-0.5.1 → datachain-0.6.0}/README.rst +0 -0
  35. {datachain-0.5.1 → datachain-0.6.0}/docs/assets/captioned_cartoons.png +0 -0
  36. {datachain-0.5.1 → datachain-0.6.0}/docs/assets/datachain-white.svg +0 -0
  37. {datachain-0.5.1 → datachain-0.6.0}/docs/assets/datachain.svg +0 -0
  38. {datachain-0.5.1 → datachain-0.6.0}/docs/assets/flowchart.png +0 -0
  39. {datachain-0.5.1 → datachain-0.6.0}/docs/index.md +0 -0
  40. {datachain-0.5.1 → datachain-0.6.0}/docs/references/datachain.md +0 -0
  41. {datachain-0.5.1 → datachain-0.6.0}/docs/references/datatype.md +0 -0
  42. {datachain-0.5.1 → datachain-0.6.0}/docs/references/file.md +0 -0
  43. {datachain-0.5.1 → datachain-0.6.0}/docs/references/index.md +0 -0
  44. {datachain-0.5.1 → datachain-0.6.0}/docs/references/sql.md +0 -0
  45. {datachain-0.5.1 → datachain-0.6.0}/docs/references/torch.md +0 -0
  46. {datachain-0.5.1 → datachain-0.6.0}/docs/references/udf.md +0 -0
  47. {datachain-0.5.1 → datachain-0.6.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  48. {datachain-0.5.1 → datachain-0.6.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  49. {datachain-0.5.1 → datachain-0.6.0}/examples/computer_vision/openimage-detect.py +0 -0
  50. {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/common_sql_functions.py +0 -0
  51. {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/json-csv-reader.py +0 -0
  52. {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/torch-loader.py +0 -0
  53. {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/udfs/parallel.py +0 -0
  54. {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/udfs/simple.py +0 -0
  55. {datachain-0.5.1 → datachain-0.6.0}/examples/get_started/udfs/stateful.py +0 -0
  56. {datachain-0.5.1 → datachain-0.6.0}/examples/llm_and_nlp/claude-query.py +0 -0
  57. {datachain-0.5.1 → datachain-0.6.0}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  58. {datachain-0.5.1 → datachain-0.6.0}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  59. {datachain-0.5.1 → datachain-0.6.0}/examples/multimodal/clip_inference.py +0 -0
  60. {datachain-0.5.1 → datachain-0.6.0}/examples/multimodal/hf_pipeline.py +0 -0
  61. {datachain-0.5.1 → datachain-0.6.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  62. {datachain-0.5.1 → datachain-0.6.0}/examples/multimodal/wds.py +0 -0
  63. {datachain-0.5.1 → datachain-0.6.0}/examples/multimodal/wds_filtered.py +0 -0
  64. {datachain-0.5.1 → datachain-0.6.0}/mkdocs.yml +0 -0
  65. {datachain-0.5.1 → datachain-0.6.0}/noxfile.py +0 -0
  66. {datachain-0.5.1 → datachain-0.6.0}/overrides/main.html +0 -0
  67. {datachain-0.5.1 → datachain-0.6.0}/pyproject.toml +0 -0
  68. {datachain-0.5.1 → datachain-0.6.0}/setup.cfg +0 -0
  69. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/__init__.py +0 -0
  70. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/__main__.py +0 -0
  71. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/asyn.py +0 -0
  72. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/cache.py +0 -0
  73. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/catalog/__init__.py +0 -0
  74. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/catalog/catalog.py +0 -0
  75. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/catalog/datasource.py +0 -0
  76. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/catalog/loader.py +0 -0
  77. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/cli.py +0 -0
  78. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/cli_utils.py +0 -0
  79. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/__init__.py +0 -0
  80. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/azure.py +0 -0
  81. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/fileslice.py +0 -0
  82. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/fsspec.py +0 -0
  83. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/gcs.py +0 -0
  84. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/hf.py +0 -0
  85. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/local.py +0 -0
  86. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/client/s3.py +0 -0
  87. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/config.py +0 -0
  88. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/__init__.py +0 -0
  89. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/db_engine.py +0 -0
  90. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/id_generator.py +0 -0
  91. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/job.py +0 -0
  92. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/metastore.py +0 -0
  93. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/schema.py +0 -0
  94. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/serializer.py +0 -0
  95. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/sqlite.py +0 -0
  96. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/data_storage/warehouse.py +0 -0
  97. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/dataset.py +0 -0
  98. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/error.py +0 -0
  99. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/job.py +0 -0
  100. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/__init__.py +0 -0
  101. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/arrow.py +0 -0
  102. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/clip.py +0 -0
  103. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/__init__.py +0 -0
  104. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/flatten.py +0 -0
  105. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  106. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  107. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/unflatten.py +0 -0
  108. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/dataset_info.py +0 -0
  109. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/file.py +0 -0
  110. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/hf.py +0 -0
  111. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/image.py +0 -0
  112. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/listing.py +0 -0
  113. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/listing_info.py +0 -0
  114. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/meta_formats.py +0 -0
  115. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/model_store.py +0 -0
  116. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/pytorch.py +0 -0
  117. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/settings.py +0 -0
  118. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/tar.py +0 -0
  119. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/text.py +0 -0
  120. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/udf_signature.py +0 -0
  121. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/utils.py +0 -0
  122. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/vfile.py +0 -0
  123. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/webdataset.py +0 -0
  124. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/webdataset_laion.py +0 -0
  125. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/listing.py +0 -0
  126. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/node.py +0 -0
  127. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/nodes_fetcher.py +0 -0
  128. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/nodes_thread_pool.py +0 -0
  129. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/progress.py +0 -0
  130. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/py.typed +0 -0
  131. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/__init__.py +0 -0
  132. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/metrics.py +0 -0
  133. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/params.py +0 -0
  134. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/queue.py +0 -0
  135. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/schema.py +0 -0
  136. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/session.py +0 -0
  137. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/remote/__init__.py +0 -0
  138. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/remote/studio.py +0 -0
  139. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/__init__.py +0 -0
  140. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/default/__init__.py +0 -0
  141. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/default/base.py +0 -0
  142. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/__init__.py +0 -0
  143. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/array.py +0 -0
  144. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/conditional.py +0 -0
  145. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/path.py +0 -0
  146. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/random.py +0 -0
  147. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/selectable.py +0 -0
  148. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  149. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/sqlite/types.py +0 -0
  150. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/sqlite/vector.py +0 -0
  151. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/types.py +0 -0
  152. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/utils.py +0 -0
  153. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/storage.py +0 -0
  154. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/telemetry.py +0 -0
  155. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/torch/__init__.py +0 -0
  156. {datachain-0.5.1 → datachain-0.6.0}/src/datachain/utils.py +0 -0
  157. {datachain-0.5.1 → datachain-0.6.0}/src/datachain.egg-info/SOURCES.txt +0 -0
  158. {datachain-0.5.1 → datachain-0.6.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  159. {datachain-0.5.1 → datachain-0.6.0}/src/datachain.egg-info/entry_points.txt +0 -0
  160. {datachain-0.5.1 → datachain-0.6.0}/src/datachain.egg-info/requires.txt +0 -0
  161. {datachain-0.5.1 → datachain-0.6.0}/src/datachain.egg-info/top_level.txt +0 -0
  162. {datachain-0.5.1 → datachain-0.6.0}/tests/__init__.py +0 -0
  163. {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/__init__.py +0 -0
  164. {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/conftest.py +0 -0
  165. {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  166. {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  167. {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/datasets/.gitignore +0 -0
  168. {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  169. {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/test_datachain.py +0 -0
  170. {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/test_ls.py +0 -0
  171. {datachain-0.5.1 → datachain-0.6.0}/tests/benchmarks/test_version.py +0 -0
  172. {datachain-0.5.1 → datachain-0.6.0}/tests/conftest.py +0 -0
  173. {datachain-0.5.1 → datachain-0.6.0}/tests/data.py +0 -0
  174. {datachain-0.5.1 → datachain-0.6.0}/tests/examples/__init__.py +0 -0
  175. {datachain-0.5.1 → datachain-0.6.0}/tests/examples/test_examples.py +0 -0
  176. {datachain-0.5.1 → datachain-0.6.0}/tests/examples/test_wds_e2e.py +0 -0
  177. {datachain-0.5.1 → datachain-0.6.0}/tests/examples/wds_data.py +0 -0
  178. {datachain-0.5.1 → datachain-0.6.0}/tests/func/__init__.py +0 -0
  179. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_catalog.py +0 -0
  180. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_client.py +0 -0
  181. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_datachain.py +0 -0
  182. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_dataset_query.py +0 -0
  183. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_datasets.py +0 -0
  184. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_feature_pickling.py +0 -0
  185. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_listing.py +0 -0
  186. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_ls.py +0 -0
  187. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_meta_formats.py +0 -0
  188. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_metrics.py +0 -0
  189. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_pull.py +0 -0
  190. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_pytorch.py +0 -0
  191. {datachain-0.5.1 → datachain-0.6.0}/tests/func/test_query.py +0 -0
  192. {datachain-0.5.1 → datachain-0.6.0}/tests/scripts/feature_class.py +0 -0
  193. {datachain-0.5.1 → datachain-0.6.0}/tests/scripts/feature_class_parallel.py +0 -0
  194. {datachain-0.5.1 → datachain-0.6.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  195. {datachain-0.5.1 → datachain-0.6.0}/tests/scripts/name_len_slow.py +0 -0
  196. {datachain-0.5.1 → datachain-0.6.0}/tests/test_atomicity.py +0 -0
  197. {datachain-0.5.1 → datachain-0.6.0}/tests/test_cli_e2e.py +0 -0
  198. {datachain-0.5.1 → datachain-0.6.0}/tests/test_query_e2e.py +0 -0
  199. {datachain-0.5.1 → datachain-0.6.0}/tests/test_telemetry.py +0 -0
  200. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/__init__.py +0 -0
  201. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/__init__.py +0 -0
  202. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/conftest.py +0 -0
  203. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_arrow.py +0 -0
  204. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_clip.py +0 -0
  205. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  206. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  207. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_feature.py +0 -0
  208. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_feature_utils.py +0 -0
  209. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_file.py +0 -0
  210. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_hf.py +0 -0
  211. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_image.py +0 -0
  212. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_schema.py +0 -0
  213. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_signal_schema.py +0 -0
  214. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  215. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_text.py +0 -0
  216. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_udf_signature.py +0 -0
  217. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_utils.py +0 -0
  218. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/lib/test_webdataset.py +0 -0
  219. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/__init__.py +0 -0
  220. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  221. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  222. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_array.py +0 -0
  223. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_conditional.py +0 -0
  224. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_path.py +0 -0
  225. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_random.py +0 -0
  226. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/sql/test_selectable.py +0 -0
  227. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_asyn.py +0 -0
  228. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_cache.py +0 -0
  229. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_catalog.py +0 -0
  230. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_catalog_loader.py +0 -0
  231. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_cli_parsing.py +0 -0
  232. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_client.py +0 -0
  233. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_client_s3.py +0 -0
  234. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_data_storage.py +0 -0
  235. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_database_engine.py +0 -0
  236. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_dataset.py +0 -0
  237. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_dispatch.py +0 -0
  238. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_fileslice.py +0 -0
  239. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_id_generator.py +0 -0
  240. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_listing.py +0 -0
  241. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_metastore.py +0 -0
  242. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_module_exports.py +0 -0
  243. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_query.py +0 -0
  244. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_query_metrics.py +0 -0
  245. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_query_params.py +0 -0
  246. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_serializer.py +0 -0
  247. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_session.py +0 -0
  248. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_storage.py +0 -0
  249. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_utils.py +0 -0
  250. {datachain-0.5.1 → datachain-0.6.0}/tests/unit/test_warehouse.py +0 -0
  251. {datachain-0.5.1 → datachain-0.6.0}/tests/utils.py +0 -0
@@ -4,7 +4,7 @@ ci:
4
4
  skip: [mypy]
5
5
  repos:
6
6
  - repo: https://github.com/pre-commit/pre-commit-hooks
7
- rev: v4.6.0
7
+ rev: v5.0.0
8
8
  hooks:
9
9
  - id: check-added-large-files
10
10
  exclude: '^tests/examples/data/'
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.6.8'
27
+ rev: 'v0.6.9'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -4,7 +4,7 @@ from typing import Any, Union
4
4
  from datachain.lib.data_model import (
5
5
  DataType,
6
6
  DataTypeNames,
7
- DataValuesType,
7
+ DataValue,
8
8
  is_chain_type,
9
9
  )
10
10
  from datachain.lib.utils import DataChainParamsError
@@ -20,7 +20,7 @@ class ValuesToTupleError(DataChainParamsError):
20
20
  def values_to_tuples( # noqa: C901, PLR0912
21
21
  ds_name: str = "",
22
22
  output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
23
- **fr_map: Sequence[DataValuesType],
23
+ **fr_map: Sequence[DataValue],
24
24
  ) -> tuple[Any, Any, Any]:
25
25
  if output:
26
26
  if not isinstance(output, (Sequence, str, dict)):
@@ -18,7 +18,7 @@ StandardType = Union[
18
18
  ]
19
19
  DataType = Union[type[BaseModel], StandardType]
20
20
  DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
21
- DataValuesType = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
21
+ DataValue = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
22
22
 
23
23
 
24
24
  class DataModel(BaseModel):
@@ -62,6 +62,7 @@ from datachain.telemetry import telemetry
62
62
  from datachain.utils import batched_it, inside_notebook
63
63
 
64
64
  if TYPE_CHECKING:
65
+ from pyarrow import DataType as ArrowDataType
65
66
  from typing_extensions import Concatenate, ParamSpec, Self
66
67
 
67
68
  from datachain.lib.hf import HFDatasetType
@@ -1024,7 +1025,7 @@ class DataChain:
1024
1025
  The supported functions:
1025
1026
  Numerical: +, -, *, /, rand(), avg(), count(), func(),
1026
1027
  greatest(), least(), max(), min(), sum()
1027
- String: length(), split()
1028
+ String: length(), split(), replace(), regexp_replace()
1028
1029
  Filename: name(), parent(), file_stem(), file_ext()
1029
1030
  Array: length(), sip_hash_64(), euclidean_distance(),
1030
1031
  cosine_distance()
@@ -1709,6 +1710,7 @@ class DataChain:
1709
1710
  nrows=None,
1710
1711
  session: Optional[Session] = None,
1711
1712
  settings: Optional[dict] = None,
1713
+ column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
1712
1714
  **kwargs,
1713
1715
  ) -> "DataChain":
1714
1716
  """Generate chain from csv files.
@@ -1727,6 +1729,9 @@ class DataChain:
1727
1729
  nrows : Optional row limit.
1728
1730
  session : Session to use for the chain.
1729
1731
  settings : Settings to use for the chain.
1732
+ column_types : Dictionary of column names and their corresponding types.
1733
+ It is passed to CSV reader and for each column specified type auto
1734
+ inference is disabled.
1730
1735
 
1731
1736
  Example:
1732
1737
  Reading a csv file:
@@ -1742,6 +1747,15 @@ class DataChain:
1742
1747
  from pandas.io.parsers.readers import STR_NA_VALUES
1743
1748
  from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
1744
1749
  from pyarrow.dataset import CsvFileFormat
1750
+ from pyarrow.lib import type_for_alias
1751
+
1752
+ if column_types:
1753
+ column_types = {
1754
+ name: type_for_alias(typ) if isinstance(typ, str) else typ
1755
+ for name, typ in column_types.items()
1756
+ }
1757
+ else:
1758
+ column_types = {}
1745
1759
 
1746
1760
  chain = DataChain.from_storage(
1747
1761
  path, session=session, settings=settings, **kwargs
@@ -1767,7 +1781,9 @@ class DataChain:
1767
1781
  parse_options = ParseOptions(delimiter=delimiter)
1768
1782
  read_options = ReadOptions(column_names=column_names)
1769
1783
  convert_options = ConvertOptions(
1770
- strings_can_be_null=True, null_values=STR_NA_VALUES
1784
+ strings_can_be_null=True,
1785
+ null_values=STR_NA_VALUES,
1786
+ column_types=column_types,
1771
1787
  )
1772
1788
  format = CsvFileFormat(
1773
1789
  parse_options=parse_options,
@@ -25,7 +25,7 @@ from typing_extensions import Literal as LiteralEx
25
25
  from datachain.lib.convert.python_to_sql import python_to_sql
26
26
  from datachain.lib.convert.sql_to_python import sql_to_python
27
27
  from datachain.lib.convert.unflatten import unflatten_to_json_pos
28
- from datachain.lib.data_model import DataModel, DataType
28
+ from datachain.lib.data_model import DataModel, DataType, DataValue
29
29
  from datachain.lib.file import File
30
30
  from datachain.lib.model_store import ModelStore
31
31
  from datachain.lib.utils import DataChainParamsError
@@ -110,7 +110,7 @@ class SignalSchema:
110
110
  values: dict[str, DataType]
111
111
  tree: dict[str, Any]
112
112
  setup_func: dict[str, Callable]
113
- setup_values: Optional[dict[str, Callable]]
113
+ setup_values: Optional[dict[str, Any]]
114
114
 
115
115
  def __init__(
116
116
  self,
@@ -333,21 +333,21 @@ class SignalSchema:
333
333
  res[db_name] = python_to_sql(type_)
334
334
  return res
335
335
 
336
- def row_to_objs(self, row: Sequence[Any]) -> list[DataType]:
336
+ def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]:
337
337
  self._init_setup_values()
338
338
 
339
- objs = []
339
+ objs: list[DataValue] = []
340
340
  pos = 0
341
341
  for name, fr_type in self.values.items():
342
342
  if self.setup_values and (val := self.setup_values.get(name, None)):
343
343
  objs.append(val)
344
344
  elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
345
345
  j, pos = unflatten_to_json_pos(fr, row, pos)
346
- objs.append(fr(**j)) # type: ignore[arg-type]
346
+ objs.append(fr(**j))
347
347
  else:
348
348
  objs.append(row[pos])
349
349
  pos += 1
350
- return objs # type: ignore[return-value]
350
+ return objs
351
351
 
352
352
  def contains_file(self) -> bool:
353
353
  for type_ in self.values.values():
@@ -1,14 +1,15 @@
1
1
  import sys
2
2
  import traceback
3
3
  from collections.abc import Iterable, Iterator, Mapping, Sequence
4
- from dataclasses import dataclass
5
4
  from typing import TYPE_CHECKING, Any, Callable, Optional
6
5
 
6
+ import attrs
7
7
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
8
8
  from pydantic import BaseModel
9
9
 
10
10
  from datachain.dataset import RowDict
11
11
  from datachain.lib.convert.flatten import flatten
12
+ from datachain.lib.data_model import DataValue
12
13
  from datachain.lib.file import File
13
14
  from datachain.lib.signal_schema import SignalSchema
14
15
  from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
@@ -18,16 +19,14 @@ from datachain.query.batch import (
18
19
  NoBatching,
19
20
  Partition,
20
21
  RowsOutputBatch,
21
- UDFInputBatch,
22
22
  )
23
- from datachain.query.schema import ColumnParameter, UDFParameter
24
23
 
25
24
  if TYPE_CHECKING:
26
25
  from typing_extensions import Self
27
26
 
28
27
  from datachain.catalog import Catalog
29
28
  from datachain.lib.udf_signature import UdfSignature
30
- from datachain.query.batch import RowsOutput, UDFInput
29
+ from datachain.query.batch import RowsOutput
31
30
 
32
31
 
33
32
  class UdfError(DataChainParamsError):
@@ -45,11 +44,21 @@ UDFOutputSpec = Mapping[str, ColumnType]
45
44
  UDFResult = dict[str, Any]
46
45
 
47
46
 
48
- @dataclass
47
+ @attrs.define
49
48
  class UDFProperties:
50
- """Container for basic UDF properties."""
49
+ udf: "UDFAdapter"
51
50
 
52
- params: list[UDFParameter]
51
+ def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
52
+ return self.udf.get_batching(use_partitioning)
53
+
54
+ @property
55
+ def batch(self):
56
+ return self.udf.batch
57
+
58
+
59
+ @attrs.define(slots=False)
60
+ class UDFAdapter:
61
+ inner: "UDFBase"
53
62
  output: UDFOutputSpec
54
63
  batch: int = 1
55
64
 
@@ -62,20 +71,10 @@ class UDFProperties:
62
71
  return Batch(self.batch)
63
72
  raise ValueError(f"invalid batch size {self.batch}")
64
73
 
65
- def signal_names(self) -> Iterable[str]:
66
- return self.output.keys()
67
-
68
-
69
- class UDFAdapter:
70
- def __init__(
71
- self,
72
- inner: "UDFBase",
73
- properties: UDFProperties,
74
- ):
75
- self.inner = inner
76
- self.properties = properties
77
- self.signal_names = properties.signal_names()
78
- self.output = properties.output
74
+ @property
75
+ def properties(self):
76
+ # For backwards compatibility.
77
+ return UDFProperties(self)
79
78
 
80
79
  def run(
81
80
  self,
@@ -87,72 +86,14 @@ class UDFAdapter:
87
86
  download_cb: Callback = DEFAULT_CALLBACK,
88
87
  processed_cb: Callback = DEFAULT_CALLBACK,
89
88
  ) -> Iterator[Iterable[UDFResult]]:
90
- self.inner.catalog = catalog
91
- if hasattr(self.inner, "setup") and callable(self.inner.setup):
92
- self.inner.setup()
93
-
94
- for batch in udf_inputs:
95
- if isinstance(batch, RowsOutputBatch):
96
- n_rows = len(batch.rows)
97
- inputs: UDFInput = UDFInputBatch(
98
- [RowDict(zip(udf_fields, row)) for row in batch.rows]
99
- )
100
- else:
101
- n_rows = 1
102
- inputs = RowDict(zip(udf_fields, batch))
103
- output = self.run_once(catalog, inputs, is_generator, cache, cb=download_cb)
104
- processed_cb.relative_update(n_rows)
105
- yield output
106
-
107
- if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
108
- self.inner.teardown()
109
-
110
- def run_once(
111
- self,
112
- catalog: "Catalog",
113
- arg: "UDFInput",
114
- is_generator: bool = False,
115
- cache: bool = False,
116
- cb: Callback = DEFAULT_CALLBACK,
117
- ) -> Iterable[UDFResult]:
118
- if isinstance(arg, UDFInputBatch):
119
- udf_inputs = [
120
- self.bind_parameters(catalog, row, cache=cache, cb=cb)
121
- for row in arg.rows
122
- ]
123
- udf_outputs = self.inner.run_once(udf_inputs, cache=cache, download_cb=cb)
124
- return self._process_results(arg.rows, udf_outputs, is_generator)
125
- if isinstance(arg, RowDict):
126
- udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
127
- udf_outputs = self.inner.run_once(udf_inputs, cache=cache, download_cb=cb)
128
- if not is_generator:
129
- # udf_outputs is generator already if is_generator=True
130
- udf_outputs = [udf_outputs]
131
- return self._process_results([arg], udf_outputs, is_generator)
132
- raise ValueError(f"Unexpected UDF argument: {arg}")
133
-
134
- def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
135
- return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]
136
-
137
- def _process_results(
138
- self,
139
- rows: Sequence["RowDict"],
140
- results: Sequence[Sequence[Any]],
141
- is_generator=False,
142
- ) -> Iterable[UDFResult]:
143
- """Create a list of dictionaries representing UDF results."""
144
-
145
- # outputting rows
146
- if is_generator:
147
- # each row in results is a tuple of column values
148
- return (dict(zip(self.signal_names, row)) for row in results)
149
-
150
- # outputting signals
151
- row_ids = [row["sys__id"] for row in rows]
152
- return [
153
- {"sys__id": row_id} | dict(zip(self.signal_names, signals))
154
- for row_id, signals in zip(row_ids, results)
155
- ]
89
+ yield from self.inner.run(
90
+ udf_fields,
91
+ udf_inputs,
92
+ catalog,
93
+ cache,
94
+ download_cb,
95
+ processed_cb,
96
+ )
156
97
 
157
98
 
158
99
  class UDFBase(AbstractUDF):
@@ -203,17 +144,12 @@ class UDFBase(AbstractUDF):
203
144
  ```
204
145
  """
205
146
 
206
- is_input_batched = False
207
147
  is_output_batched = False
208
- is_input_grouped = False
209
- params_spec: Optional[list[str]]
210
148
  catalog: "Optional[Catalog]"
211
149
 
212
150
  def __init__(self):
213
- self.params = None
151
+ self.params: Optional[SignalSchema] = None
214
152
  self.output = None
215
- self.params_spec = None
216
- self.output_spec = None
217
153
  self.catalog = None
218
154
  self._func = None
219
155
 
@@ -241,11 +177,6 @@ class UDFBase(AbstractUDF):
241
177
  ):
242
178
  self.params = params
243
179
  self.output = sign.output_schema
244
-
245
- params_spec = self.params.to_udf_spec()
246
- self.params_spec = list(params_spec.keys())
247
- self.output_spec = self.output.to_udf_spec()
248
-
249
180
  self._func = func
250
181
 
251
182
  @classmethod
@@ -273,48 +204,27 @@ class UDFBase(AbstractUDF):
273
204
  def name(self):
274
205
  return self.__class__.__name__
275
206
 
207
+ @property
208
+ def signal_names(self) -> Iterable[str]:
209
+ return self.output.to_udf_spec().keys()
210
+
276
211
  def to_udf_wrapper(self, batch: int = 1) -> UDFAdapter:
277
- assert self.params_spec is not None
278
- properties = UDFProperties(
279
- [ColumnParameter(p) for p in self.params_spec], self.output_spec, batch
212
+ return UDFAdapter(
213
+ self,
214
+ self.output.to_udf_spec(),
215
+ batch,
280
216
  )
281
- return UDFAdapter(self, properties)
282
-
283
- def validate_results(self, results, *args, **kwargs):
284
- return results
285
217
 
286
- def run_once(self, rows, cache, download_cb):
287
- if self.is_input_batched:
288
- objs = zip(*self._parse_rows(rows, cache, download_cb))
289
- else:
290
- objs = self._parse_rows([rows], cache, download_cb)[0]
291
-
292
- result_objs = self.process_safe(objs)
293
-
294
- if not self.is_output_batched:
295
- result_objs = [result_objs]
296
-
297
- # Generator expression is required, otherwise the value will be materialized
298
- res = (self._flatten_row(row) for row in result_objs)
299
-
300
- if not self.is_output_batched:
301
- res = list(res)
302
- assert (
303
- len(res) == 1
304
- ), f"{self.name} returns {len(res)} rows while it's not batched"
305
- if isinstance(res[0], tuple):
306
- res = res[0]
307
- elif (
308
- self.is_input_batched
309
- and self.is_output_batched
310
- and not self.is_input_grouped
311
- ):
312
- res = list(res)
313
- assert len(res) == len(
314
- rows
315
- ), f"{self.name} returns {len(res)} rows while {len(rows)} expected"
316
-
317
- return res
218
+ def run(
219
+ self,
220
+ udf_fields: "Sequence[str]",
221
+ udf_inputs: "Iterable[Any]",
222
+ catalog: "Catalog",
223
+ cache: bool,
224
+ download_cb: Callback = DEFAULT_CALLBACK,
225
+ processed_cb: Callback = DEFAULT_CALLBACK,
226
+ ) -> Iterator[Iterable[UDFResult]]:
227
+ raise NotImplementedError
318
228
 
319
229
  def _flatten_row(self, row):
320
230
  if len(self.output.values) > 1 and not isinstance(row, BaseModel):
@@ -328,17 +238,28 @@ class UDFBase(AbstractUDF):
328
238
  def _obj_to_list(obj):
329
239
  return flatten(obj) if isinstance(obj, BaseModel) else [obj]
330
240
 
331
- def _parse_rows(self, rows, cache, download_cb):
332
- objs = []
333
- for row in rows:
334
- obj_row = self.params.row_to_objs(row)
335
- for obj in obj_row:
336
- if isinstance(obj, File):
337
- obj._set_stream(
338
- self.catalog, caching_enabled=cache, download_cb=download_cb
339
- )
340
- objs.append(obj_row)
341
- return objs
241
+ def _parse_row(
242
+ self, row_dict: RowDict, cache: bool, download_cb: Callback
243
+ ) -> list[DataValue]:
244
+ assert self.params
245
+ row = [row_dict[p] for p in self.params.to_udf_spec()]
246
+ obj_row = self.params.row_to_objs(row)
247
+ for obj in obj_row:
248
+ if isinstance(obj, File):
249
+ assert self.catalog is not None
250
+ obj._set_stream(
251
+ self.catalog, caching_enabled=cache, download_cb=download_cb
252
+ )
253
+ return obj_row
254
+
255
+ def _prepare_row(self, row, udf_fields, cache, download_cb):
256
+ row_dict = RowDict(zip(udf_fields, row))
257
+ return self._parse_row(row_dict, cache, download_cb)
258
+
259
+ def _prepare_row_and_id(self, row, udf_fields, cache, download_cb):
260
+ row_dict = RowDict(zip(udf_fields, row))
261
+ udf_input = self._parse_row(row_dict, cache, download_cb)
262
+ return row_dict["sys__id"], *udf_input
342
263
 
343
264
  def process_safe(self, obj_rows):
344
265
  try:
@@ -358,23 +279,128 @@ class UDFBase(AbstractUDF):
358
279
  class Mapper(UDFBase):
359
280
  """Inherit from this class to pass to `DataChain.map()`."""
360
281
 
282
+ def run(
283
+ self,
284
+ udf_fields: "Sequence[str]",
285
+ udf_inputs: "Iterable[Sequence[Any]]",
286
+ catalog: "Catalog",
287
+ cache: bool,
288
+ download_cb: Callback = DEFAULT_CALLBACK,
289
+ processed_cb: Callback = DEFAULT_CALLBACK,
290
+ ) -> Iterator[Iterable[UDFResult]]:
291
+ self.catalog = catalog
292
+ self.setup()
293
+
294
+ for row in udf_inputs:
295
+ id_, *udf_args = self._prepare_row_and_id(
296
+ row, udf_fields, cache, download_cb
297
+ )
298
+ result_objs = self.process_safe(udf_args)
299
+ udf_output = self._flatten_row(result_objs)
300
+ output = [{"sys__id": id_} | dict(zip(self.signal_names, udf_output))]
301
+ processed_cb.relative_update(1)
302
+ yield output
303
+
304
+ self.teardown()
305
+
361
306
 
362
307
  class BatchMapper(UDFBase):
363
308
  """Inherit from this class to pass to `DataChain.batch_map()`."""
364
309
 
365
- is_input_batched = True
366
310
  is_output_batched = True
367
311
 
312
+ def run(
313
+ self,
314
+ udf_fields: Sequence[str],
315
+ udf_inputs: Iterable[RowsOutputBatch],
316
+ catalog: "Catalog",
317
+ cache: bool,
318
+ download_cb: Callback = DEFAULT_CALLBACK,
319
+ processed_cb: Callback = DEFAULT_CALLBACK,
320
+ ) -> Iterator[Iterable[UDFResult]]:
321
+ self.catalog = catalog
322
+ self.setup()
323
+
324
+ for batch in udf_inputs:
325
+ n_rows = len(batch.rows)
326
+ row_ids, *udf_args = zip(
327
+ *[
328
+ self._prepare_row_and_id(row, udf_fields, cache, download_cb)
329
+ for row in batch.rows
330
+ ]
331
+ )
332
+ result_objs = list(self.process_safe(udf_args))
333
+ n_objs = len(result_objs)
334
+ assert (
335
+ n_objs == n_rows
336
+ ), f"{self.name} returns {n_objs} rows, but {n_rows} were expected"
337
+ udf_outputs = (self._flatten_row(row) for row in result_objs)
338
+ output = [
339
+ {"sys__id": row_id} | dict(zip(self.signal_names, signals))
340
+ for row_id, signals in zip(row_ids, udf_outputs)
341
+ ]
342
+ processed_cb.relative_update(n_rows)
343
+ yield output
344
+
345
+ self.teardown()
346
+
368
347
 
369
348
  class Generator(UDFBase):
370
349
  """Inherit from this class to pass to `DataChain.gen()`."""
371
350
 
372
351
  is_output_batched = True
373
352
 
353
+ def run(
354
+ self,
355
+ udf_fields: "Sequence[str]",
356
+ udf_inputs: "Iterable[Sequence[Any]]",
357
+ catalog: "Catalog",
358
+ cache: bool,
359
+ download_cb: Callback = DEFAULT_CALLBACK,
360
+ processed_cb: Callback = DEFAULT_CALLBACK,
361
+ ) -> Iterator[Iterable[UDFResult]]:
362
+ self.catalog = catalog
363
+ self.setup()
364
+
365
+ for row in udf_inputs:
366
+ udf_args = self._prepare_row(row, udf_fields, cache, download_cb)
367
+ result_objs = self.process_safe(udf_args)
368
+ udf_outputs = (self._flatten_row(row) for row in result_objs)
369
+ output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
370
+ processed_cb.relative_update(1)
371
+ yield output
372
+
373
+ self.teardown()
374
+
374
375
 
375
376
  class Aggregator(UDFBase):
376
377
  """Inherit from this class to pass to `DataChain.agg()`."""
377
378
 
378
- is_input_batched = True
379
379
  is_output_batched = True
380
- is_input_grouped = True
380
+
381
+ def run(
382
+ self,
383
+ udf_fields: "Sequence[str]",
384
+ udf_inputs: Iterable[RowsOutputBatch],
385
+ catalog: "Catalog",
386
+ cache: bool,
387
+ download_cb: Callback = DEFAULT_CALLBACK,
388
+ processed_cb: Callback = DEFAULT_CALLBACK,
389
+ ) -> Iterator[Iterable[UDFResult]]:
390
+ self.catalog = catalog
391
+ self.setup()
392
+
393
+ for batch in udf_inputs:
394
+ udf_args = zip(
395
+ *[
396
+ self._prepare_row(row, udf_fields, cache, download_cb)
397
+ for row in batch.rows
398
+ ]
399
+ )
400
+ result_objs = self.process_safe(udf_args)
401
+ udf_outputs = (self._flatten_row(row) for row in result_objs)
402
+ output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
403
+ processed_cb.relative_update(len(batch.rows))
404
+ yield output
405
+
406
+ self.teardown()
@@ -11,8 +11,6 @@ from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
11
11
  if TYPE_CHECKING:
12
12
  from sqlalchemy import Select
13
13
 
14
- from datachain.dataset import RowDict
15
-
16
14
 
17
15
  @dataclass
18
16
  class RowsOutputBatch:
@@ -22,14 +20,6 @@ class RowsOutputBatch:
22
20
  RowsOutput = Union[Sequence, RowsOutputBatch]
23
21
 
24
22
 
25
- @dataclass
26
- class UDFInputBatch:
27
- rows: Sequence["RowDict"]
28
-
29
-
30
- UDFInput = Union["RowDict", UDFInputBatch]
31
-
32
-
33
23
  class BatchingStrategy(ABC):
34
24
  """BatchingStrategy provides means of batching UDF executions."""
35
25
 
@@ -392,7 +392,7 @@ class UDFStep(Step, ABC):
392
392
 
393
393
  def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
394
394
  use_partitioning = self.partition_by is not None
395
- batching = self.udf.properties.get_batching(use_partitioning)
395
+ batching = self.udf.get_batching(use_partitioning)
396
396
  workers = self.workers
397
397
  if (
398
398
  not workers
@@ -114,7 +114,6 @@ class UDFDispatcher:
114
114
  catalog: Optional[Catalog] = None
115
115
  task_queue: Optional[multiprocess.Queue] = None
116
116
  done_queue: Optional[multiprocess.Queue] = None
117
- _batch_size: Optional[int] = None
118
117
 
119
118
  def __init__(
120
119
  self,
@@ -154,17 +153,6 @@ class UDFDispatcher:
154
153
  self.done_queue = None
155
154
  self.ctx = get_context("spawn")
156
155
 
157
- @property
158
- def batch_size(self):
159
- if self._batch_size is None:
160
- if hasattr(self.udf, "properties") and hasattr(
161
- self.udf.properties, "batch"
162
- ):
163
- self._batch_size = self.udf.properties.batch
164
- else:
165
- self._batch_size = 1
166
- return self._batch_size
167
-
168
156
  def _create_worker(self) -> "UDFWorker":
169
157
  if not self.catalog:
170
158
  id_generator = self.id_generator_class(
@@ -37,6 +37,18 @@ class regexp_replace(GenericFunction): # noqa: N801
37
37
  inherit_cache = True
38
38
 
39
39
 
40
+ class replace(GenericFunction): # noqa: N801
41
+ """
42
+ Replaces substring with another string.
43
+ """
44
+
45
+ type = String()
46
+ package = "string"
47
+ name = "replace"
48
+ inherit_cache = True
49
+
50
+
40
51
  compiler_not_implemented(length)
41
52
  compiler_not_implemented(split)
42
53
  compiler_not_implemented(regexp_replace)
54
+ compiler_not_implemented(replace)