datachain 0.6.10__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (274):
  1. {datachain-0.6.10 → datachain-0.7.0}/.github/workflows/tests.yml +1 -1
  2. {datachain-0.6.10 → datachain-0.7.0}/.pre-commit-config.yaml +1 -1
  3. {datachain-0.6.10/src/datachain.egg-info → datachain-0.7.0}/PKG-INFO +1 -1
  4. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/asyn.py +36 -4
  5. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/data_storage/warehouse.py +4 -1
  6. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/dc.py +6 -1
  7. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/file.py +5 -0
  8. datachain-0.7.0/src/datachain/lib/models/__init__.py +5 -0
  9. datachain-0.7.0/src/datachain/lib/models/bbox.py +45 -0
  10. datachain-0.7.0/src/datachain/lib/models/pose.py +37 -0
  11. datachain-0.7.0/src/datachain/lib/models/yolo.py +39 -0
  12. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/settings.py +11 -1
  13. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/udf.py +45 -18
  14. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/node.py +1 -1
  15. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/query/dataset.py +25 -27
  16. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/sqlite/types.py +8 -1
  17. {datachain-0.6.10 → datachain-0.7.0/src/datachain.egg-info}/PKG-INFO +1 -1
  18. {datachain-0.6.10 → datachain-0.7.0}/src/datachain.egg-info/SOURCES.txt +2 -5
  19. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_datachain.py +5 -2
  20. {datachain-0.6.10 → datachain-0.7.0}/tests/scripts/name_len_slow.py +1 -1
  21. datachain-0.7.0/tests/unit/lib/test_models.py +50 -0
  22. datachain-0.7.0/tests/unit/sql/sqlite/test_types.py +19 -0
  23. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_asyn.py +32 -0
  24. datachain-0.6.10/src/datachain/lib/models/__init__.py +0 -6
  25. datachain-0.6.10/src/datachain/lib/models/bbox.py +0 -116
  26. datachain-0.6.10/src/datachain/lib/models/pose.py +0 -108
  27. datachain-0.6.10/src/datachain/lib/models/segment.py +0 -53
  28. datachain-0.6.10/src/datachain/lib/models/ultralytics/__init__.py +0 -14
  29. datachain-0.6.10/src/datachain/lib/models/ultralytics/bbox.py +0 -189
  30. datachain-0.6.10/src/datachain/lib/models/ultralytics/pose.py +0 -126
  31. datachain-0.6.10/src/datachain/lib/models/ultralytics/segment.py +0 -121
  32. datachain-0.6.10/tests/unit/lib/test_models.py +0 -142
  33. {datachain-0.6.10 → datachain-0.7.0}/.cruft.json +0 -0
  34. {datachain-0.6.10 → datachain-0.7.0}/.gitattributes +0 -0
  35. {datachain-0.6.10 → datachain-0.7.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  36. {datachain-0.6.10 → datachain-0.7.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  37. {datachain-0.6.10 → datachain-0.7.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  38. {datachain-0.6.10 → datachain-0.7.0}/.github/codecov.yaml +0 -0
  39. {datachain-0.6.10 → datachain-0.7.0}/.github/dependabot.yml +0 -0
  40. {datachain-0.6.10 → datachain-0.7.0}/.github/workflows/benchmarks.yml +0 -0
  41. {datachain-0.6.10 → datachain-0.7.0}/.github/workflows/release.yml +0 -0
  42. {datachain-0.6.10 → datachain-0.7.0}/.github/workflows/tests-studio.yml +0 -0
  43. {datachain-0.6.10 → datachain-0.7.0}/.github/workflows/update-template.yaml +0 -0
  44. {datachain-0.6.10 → datachain-0.7.0}/.gitignore +0 -0
  45. {datachain-0.6.10 → datachain-0.7.0}/CODE_OF_CONDUCT.rst +0 -0
  46. {datachain-0.6.10 → datachain-0.7.0}/CONTRIBUTING.rst +0 -0
  47. {datachain-0.6.10 → datachain-0.7.0}/LICENSE +0 -0
  48. {datachain-0.6.10 → datachain-0.7.0}/README.rst +0 -0
  49. {datachain-0.6.10 → datachain-0.7.0}/docs/assets/captioned_cartoons.png +0 -0
  50. {datachain-0.6.10 → datachain-0.7.0}/docs/assets/datachain-white.svg +0 -0
  51. {datachain-0.6.10 → datachain-0.7.0}/docs/assets/datachain.svg +0 -0
  52. {datachain-0.6.10 → datachain-0.7.0}/docs/index.md +0 -0
  53. {datachain-0.6.10 → datachain-0.7.0}/docs/overrides/main.html +0 -0
  54. {datachain-0.6.10 → datachain-0.7.0}/docs/references/datachain.md +0 -0
  55. {datachain-0.6.10 → datachain-0.7.0}/docs/references/datatype.md +0 -0
  56. {datachain-0.6.10 → datachain-0.7.0}/docs/references/file.md +0 -0
  57. {datachain-0.6.10 → datachain-0.7.0}/docs/references/index.md +0 -0
  58. {datachain-0.6.10 → datachain-0.7.0}/docs/references/sql.md +0 -0
  59. {datachain-0.6.10 → datachain-0.7.0}/docs/references/torch.md +0 -0
  60. {datachain-0.6.10 → datachain-0.7.0}/docs/references/udf.md +0 -0
  61. {datachain-0.6.10 → datachain-0.7.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  62. {datachain-0.6.10 → datachain-0.7.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  63. {datachain-0.6.10 → datachain-0.7.0}/examples/computer_vision/openimage-detect.py +0 -0
  64. {datachain-0.6.10 → datachain-0.7.0}/examples/get_started/common_sql_functions.py +0 -0
  65. {datachain-0.6.10 → datachain-0.7.0}/examples/get_started/json-csv-reader.py +0 -0
  66. {datachain-0.6.10 → datachain-0.7.0}/examples/get_started/torch-loader.py +0 -0
  67. {datachain-0.6.10 → datachain-0.7.0}/examples/get_started/udfs/parallel.py +0 -0
  68. {datachain-0.6.10 → datachain-0.7.0}/examples/get_started/udfs/simple.py +0 -0
  69. {datachain-0.6.10 → datachain-0.7.0}/examples/get_started/udfs/stateful.py +0 -0
  70. {datachain-0.6.10 → datachain-0.7.0}/examples/llm_and_nlp/claude-query.py +0 -0
  71. {datachain-0.6.10 → datachain-0.7.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  72. {datachain-0.6.10 → datachain-0.7.0}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  73. {datachain-0.6.10 → datachain-0.7.0}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  74. {datachain-0.6.10 → datachain-0.7.0}/examples/multimodal/clip_inference.py +0 -0
  75. {datachain-0.6.10 → datachain-0.7.0}/examples/multimodal/hf_pipeline.py +0 -0
  76. {datachain-0.6.10 → datachain-0.7.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  77. {datachain-0.6.10 → datachain-0.7.0}/examples/multimodal/wds.py +0 -0
  78. {datachain-0.6.10 → datachain-0.7.0}/examples/multimodal/wds_filtered.py +0 -0
  79. {datachain-0.6.10 → datachain-0.7.0}/mkdocs.yml +0 -0
  80. {datachain-0.6.10 → datachain-0.7.0}/noxfile.py +0 -0
  81. {datachain-0.6.10 → datachain-0.7.0}/pyproject.toml +0 -0
  82. {datachain-0.6.10 → datachain-0.7.0}/setup.cfg +0 -0
  83. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/__init__.py +0 -0
  84. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/__main__.py +0 -0
  85. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/cache.py +0 -0
  86. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/catalog/__init__.py +0 -0
  87. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/catalog/catalog.py +0 -0
  88. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/catalog/datasource.py +0 -0
  89. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/catalog/loader.py +0 -0
  90. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/cli.py +0 -0
  91. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/cli_utils.py +0 -0
  92. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/client/__init__.py +0 -0
  93. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/client/azure.py +0 -0
  94. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/client/fileslice.py +0 -0
  95. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/client/fsspec.py +0 -0
  96. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/client/gcs.py +0 -0
  97. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/client/hf.py +0 -0
  98. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/client/local.py +0 -0
  99. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/client/s3.py +0 -0
  100. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/config.py +0 -0
  101. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/data_storage/__init__.py +0 -0
  102. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/data_storage/db_engine.py +0 -0
  103. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/data_storage/id_generator.py +0 -0
  104. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/data_storage/job.py +0 -0
  105. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/data_storage/metastore.py +0 -0
  106. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/data_storage/schema.py +0 -0
  107. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/data_storage/serializer.py +0 -0
  108. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/data_storage/sqlite.py +0 -0
  109. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/dataset.py +0 -0
  110. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/error.py +0 -0
  111. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/job.py +0 -0
  112. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/__init__.py +0 -0
  113. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/arrow.py +0 -0
  114. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/clip.py +0 -0
  115. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/convert/__init__.py +0 -0
  116. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/convert/flatten.py +0 -0
  117. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  118. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  119. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/convert/unflatten.py +0 -0
  120. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  121. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/data_model.py +0 -0
  122. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/dataset_info.py +0 -0
  123. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/func/__init__.py +0 -0
  124. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/func/aggregate.py +0 -0
  125. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/func/func.py +0 -0
  126. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/hf.py +0 -0
  127. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/image.py +0 -0
  128. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/listing.py +0 -0
  129. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/listing_info.py +0 -0
  130. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/meta_formats.py +0 -0
  131. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/model_store.py +0 -0
  132. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/pytorch.py +0 -0
  133. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/signal_schema.py +0 -0
  134. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/tar.py +0 -0
  135. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/text.py +0 -0
  136. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/udf_signature.py +0 -0
  137. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/utils.py +0 -0
  138. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/vfile.py +0 -0
  139. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/webdataset.py +0 -0
  140. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/lib/webdataset_laion.py +0 -0
  141. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/listing.py +0 -0
  142. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/nodes_fetcher.py +0 -0
  143. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/nodes_thread_pool.py +0 -0
  144. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/progress.py +0 -0
  145. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/py.typed +0 -0
  146. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/query/__init__.py +0 -0
  147. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/query/batch.py +0 -0
  148. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/query/dispatch.py +0 -0
  149. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/query/metrics.py +0 -0
  150. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/query/params.py +0 -0
  151. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/query/queue.py +0 -0
  152. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/query/schema.py +0 -0
  153. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/query/session.py +0 -0
  154. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/remote/__init__.py +0 -0
  155. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/remote/studio.py +0 -0
  156. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/__init__.py +0 -0
  157. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/default/__init__.py +0 -0
  158. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/default/base.py +0 -0
  159. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/functions/__init__.py +0 -0
  160. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/functions/aggregate.py +0 -0
  161. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/functions/array.py +0 -0
  162. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/functions/conditional.py +0 -0
  163. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/functions/path.py +0 -0
  164. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/functions/random.py +0 -0
  165. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/functions/string.py +0 -0
  166. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/selectable.py +0 -0
  167. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  168. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/sqlite/base.py +0 -0
  169. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/sqlite/vector.py +0 -0
  170. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/types.py +0 -0
  171. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/sql/utils.py +0 -0
  172. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/studio.py +0 -0
  173. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/telemetry.py +0 -0
  174. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/toolkit/__init__.py +0 -0
  175. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/toolkit/split.py +0 -0
  176. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/torch/__init__.py +0 -0
  177. {datachain-0.6.10 → datachain-0.7.0}/src/datachain/utils.py +0 -0
  178. {datachain-0.6.10 → datachain-0.7.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  179. {datachain-0.6.10 → datachain-0.7.0}/src/datachain.egg-info/entry_points.txt +0 -0
  180. {datachain-0.6.10 → datachain-0.7.0}/src/datachain.egg-info/requires.txt +0 -0
  181. {datachain-0.6.10 → datachain-0.7.0}/src/datachain.egg-info/top_level.txt +0 -0
  182. {datachain-0.6.10 → datachain-0.7.0}/tests/__init__.py +0 -0
  183. {datachain-0.6.10 → datachain-0.7.0}/tests/benchmarks/__init__.py +0 -0
  184. {datachain-0.6.10 → datachain-0.7.0}/tests/benchmarks/conftest.py +0 -0
  185. {datachain-0.6.10 → datachain-0.7.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  186. {datachain-0.6.10 → datachain-0.7.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  187. {datachain-0.6.10 → datachain-0.7.0}/tests/benchmarks/datasets/.gitignore +0 -0
  188. {datachain-0.6.10 → datachain-0.7.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  189. {datachain-0.6.10 → datachain-0.7.0}/tests/benchmarks/test_datachain.py +0 -0
  190. {datachain-0.6.10 → datachain-0.7.0}/tests/benchmarks/test_ls.py +0 -0
  191. {datachain-0.6.10 → datachain-0.7.0}/tests/benchmarks/test_version.py +0 -0
  192. {datachain-0.6.10 → datachain-0.7.0}/tests/conftest.py +0 -0
  193. {datachain-0.6.10 → datachain-0.7.0}/tests/data.py +0 -0
  194. {datachain-0.6.10 → datachain-0.7.0}/tests/examples/__init__.py +0 -0
  195. {datachain-0.6.10 → datachain-0.7.0}/tests/examples/test_examples.py +0 -0
  196. {datachain-0.6.10 → datachain-0.7.0}/tests/examples/test_wds_e2e.py +0 -0
  197. {datachain-0.6.10 → datachain-0.7.0}/tests/examples/wds_data.py +0 -0
  198. {datachain-0.6.10 → datachain-0.7.0}/tests/func/__init__.py +0 -0
  199. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_catalog.py +0 -0
  200. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_client.py +0 -0
  201. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_dataset_query.py +0 -0
  202. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_datasets.py +0 -0
  203. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_feature_pickling.py +0 -0
  204. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_listing.py +0 -0
  205. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_ls.py +0 -0
  206. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_meta_formats.py +0 -0
  207. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_metrics.py +0 -0
  208. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_pull.py +0 -0
  209. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_pytorch.py +0 -0
  210. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_query.py +0 -0
  211. {datachain-0.6.10 → datachain-0.7.0}/tests/func/test_toolkit.py +0 -0
  212. {datachain-0.6.10 → datachain-0.7.0}/tests/scripts/feature_class.py +0 -0
  213. {datachain-0.6.10 → datachain-0.7.0}/tests/scripts/feature_class_exception.py +0 -0
  214. {datachain-0.6.10 → datachain-0.7.0}/tests/scripts/feature_class_parallel.py +0 -0
  215. {datachain-0.6.10 → datachain-0.7.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  216. {datachain-0.6.10 → datachain-0.7.0}/tests/test_atomicity.py +0 -0
  217. {datachain-0.6.10 → datachain-0.7.0}/tests/test_cli_e2e.py +0 -0
  218. {datachain-0.6.10 → datachain-0.7.0}/tests/test_cli_studio.py +0 -0
  219. {datachain-0.6.10 → datachain-0.7.0}/tests/test_query_e2e.py +0 -0
  220. {datachain-0.6.10 → datachain-0.7.0}/tests/test_telemetry.py +0 -0
  221. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/__init__.py +0 -0
  222. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/__init__.py +0 -0
  223. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/conftest.py +0 -0
  224. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_arrow.py +0 -0
  225. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_clip.py +0 -0
  226. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_datachain.py +0 -0
  227. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  228. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  229. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_feature.py +0 -0
  230. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_feature_utils.py +0 -0
  231. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_file.py +0 -0
  232. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_hf.py +0 -0
  233. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_image.py +0 -0
  234. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_listing_info.py +0 -0
  235. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_schema.py +0 -0
  236. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_signal_schema.py +0 -0
  237. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  238. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_text.py +0 -0
  239. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_udf_signature.py +0 -0
  240. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_utils.py +0 -0
  241. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/lib/test_webdataset.py +0 -0
  242. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/sql/__init__.py +0 -0
  243. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  244. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  245. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/sql/test_array.py +0 -0
  246. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/sql/test_conditional.py +0 -0
  247. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/sql/test_path.py +0 -0
  248. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/sql/test_random.py +0 -0
  249. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/sql/test_selectable.py +0 -0
  250. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/sql/test_string.py +0 -0
  251. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_cache.py +0 -0
  252. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_catalog.py +0 -0
  253. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_catalog_loader.py +0 -0
  254. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_cli_parsing.py +0 -0
  255. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_client.py +0 -0
  256. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_client_s3.py +0 -0
  257. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_config.py +0 -0
  258. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_data_storage.py +0 -0
  259. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_database_engine.py +0 -0
  260. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_dataset.py +0 -0
  261. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_dispatch.py +0 -0
  262. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_fileslice.py +0 -0
  263. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_id_generator.py +0 -0
  264. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_listing.py +0 -0
  265. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_metastore.py +0 -0
  266. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_module_exports.py +0 -0
  267. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_query.py +0 -0
  268. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_query_metrics.py +0 -0
  269. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_query_params.py +0 -0
  270. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_serializer.py +0 -0
  271. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_session.py +0 -0
  272. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_utils.py +0 -0
  273. {datachain-0.6.10 → datachain-0.7.0}/tests/unit/test_warehouse.py +0 -0
  274. {datachain-0.6.10 → datachain-0.7.0}/tests/utils.py +0 -0
@@ -104,7 +104,7 @@ jobs:
104
104
  shell: bash
105
105
 
106
106
  - name: Upload coverage report
107
- uses: codecov/codecov-action@v4
107
+ uses: codecov/codecov-action@v5
108
108
  with:
109
109
  token: ${{ secrets.CODECOV_TOKEN }}
110
110
  files: coverage.xml
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.7.3'
27
+ rev: 'v0.7.4'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.10
3
+ Version: 0.7.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -1,5 +1,13 @@
1
1
  import asyncio
2
- from collections.abc import AsyncIterable, Awaitable, Coroutine, Iterable, Iterator
2
+ import threading
3
+ from collections.abc import (
4
+ AsyncIterable,
5
+ Awaitable,
6
+ Coroutine,
7
+ Generator,
8
+ Iterable,
9
+ Iterator,
10
+ )
3
11
  from concurrent.futures import ThreadPoolExecutor
4
12
  from heapq import heappop, heappush
5
13
  from typing import Any, Callable, Generic, Optional, TypeVar
@@ -47,6 +55,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
47
55
  self.loop = get_loop() if loop is None else loop
48
56
  self.pool = ThreadPoolExecutor(workers)
49
57
  self._tasks: set[asyncio.Task] = set()
58
+ self._shutdown_producer = threading.Event()
50
59
 
51
60
  def start_task(self, coro: Coroutine) -> asyncio.Task:
52
61
  task = self.loop.create_task(coro)
@@ -54,9 +63,31 @@ class AsyncMapper(Generic[InputT, ResultT]):
54
63
  task.add_done_callback(self._tasks.discard)
55
64
  return task
56
65
 
57
- async def produce(self) -> None:
66
+ def _produce(self) -> None:
58
67
  for item in self.iterable:
59
- await self.work_queue.put(item)
68
+ if self._shutdown_producer.is_set():
69
+ return
70
+ fut = asyncio.run_coroutine_threadsafe(self.work_queue.put(item), self.loop)
71
+ fut.result() # wait until the item is in the queue
72
+
73
+ async def produce(self) -> None:
74
+ await self.to_thread(self._produce)
75
+
76
+ def shutdown_producer(self) -> None:
77
+ """
78
+ Signal the producer to stop and drain any remaining items from the work_queue.
79
+
80
+ This method sets an internal event, `_shutdown_producer`, which tells the
81
+ producer that it should stop adding items to the queue. To ensure that the
82
+ producer notices this signal promptly, we also attempt to drain any items
83
+ currently in the queue, clearing it so that the event can be checked without
84
+ delay.
85
+ """
86
+ self._shutdown_producer.set()
87
+ q = self.work_queue
88
+ while not q.empty():
89
+ q.get_nowait()
90
+ q.task_done()
60
91
 
61
92
  async def worker(self) -> None:
62
93
  while (item := await self.work_queue.get()) is not None:
@@ -132,7 +163,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
132
163
  self.result_queue.get_nowait()
133
164
  await self.result_queue.put(None)
134
165
 
135
- def iterate(self, timeout=None) -> Iterable[ResultT]:
166
+ def iterate(self, timeout=None) -> Generator[ResultT, None, None]:
136
167
  init = asyncio.run_coroutine_threadsafe(self.init(), self.loop)
137
168
  init.result(timeout=1)
138
169
  async_run = asyncio.run_coroutine_threadsafe(self.run(), self.loop)
@@ -145,6 +176,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
145
176
  if exc := async_run.exception():
146
177
  raise exc
147
178
  finally:
179
+ self.shutdown_producer()
148
180
  if not async_run.done():
149
181
  async_run.cancel()
150
182
 
@@ -232,7 +232,10 @@ class AbstractWarehouse(ABC, Serializable):
232
232
  if limit < page_size:
233
233
  paginated_query = paginated_query.limit(None).limit(limit)
234
234
 
235
- results = self.dataset_rows_select(paginated_query.offset(offset))
235
+ # Ensure we're using a thread-local connection
236
+ with self.clone() as wh:
237
+ # Cursor results are not thread-safe, so we convert them to a list
238
+ results = list(wh.dataset_rows_select(paginated_query.offset(offset)))
236
239
 
237
240
  processed = False
238
241
  for row in results:
@@ -334,6 +334,7 @@ class DataChain:
334
334
  parallel=None,
335
335
  workers=None,
336
336
  min_task_size=None,
337
+ prefetch: Optional[int] = None,
337
338
  sys: Optional[bool] = None,
338
339
  ) -> "Self":
339
340
  """Change settings for chain.
@@ -360,7 +361,7 @@ class DataChain:
360
361
  if sys is None:
361
362
  sys = self._sys
362
363
  settings = copy.copy(self._settings)
363
- settings.add(Settings(cache, parallel, workers, min_task_size))
364
+ settings.add(Settings(cache, parallel, workers, min_task_size, prefetch))
364
365
  return self._evolve(settings=settings, _sys=sys)
365
366
 
366
367
  def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
@@ -882,6 +883,8 @@ class DataChain:
882
883
  ```
883
884
  """
884
885
  udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)
886
+ if (prefetch := self._settings.prefetch) is not None:
887
+ udf_obj.prefetch = prefetch
885
888
 
886
889
  return self._evolve(
887
890
  query=self._query.add_signals(
@@ -919,6 +922,8 @@ class DataChain:
919
922
  ```
920
923
  """
921
924
  udf_obj = self._udf_to_obj(Generator, func, params, output, signal_map)
925
+ if (prefetch := self._settings.prefetch) is not None:
926
+ udf_obj.prefetch = prefetch
922
927
  return self._evolve(
923
928
  query=self._query.generate(
924
929
  udf_obj.to_udf_wrapper(),
@@ -268,6 +268,11 @@ class File(DataModel):
268
268
  client = self._catalog.get_client(self.source)
269
269
  client.download(self, callback=self._download_cb)
270
270
 
271
+ async def _prefetch(self) -> None:
272
+ if self._caching_enabled:
273
+ client = self._catalog.get_client(self.source)
274
+ await client._download(self, callback=self._download_cb)
275
+
271
276
  def get_local_path(self) -> Optional[str]:
272
277
  """Return path to a file in a local cache.
273
278
 
@@ -0,0 +1,5 @@
1
+ from . import yolo
2
+ from .bbox import BBox
3
+ from .pose import Pose, Pose3D
4
+
5
+ __all__ = ["BBox", "Pose", "Pose3D", "yolo"]
@@ -0,0 +1,45 @@
1
+ from typing import Optional
2
+
3
+ from pydantic import Field
4
+
5
+ from datachain.lib.data_model import DataModel
6
+
7
+
8
+ class BBox(DataModel):
9
+ """
10
+ A data model for representing bounding boxes.
11
+
12
+ Attributes:
13
+ title (str): The title of the bounding box.
14
+ x1 (float): The x-coordinate of the top-left corner of the bounding box.
15
+ y1 (float): The y-coordinate of the top-left corner of the bounding box.
16
+ x2 (float): The x-coordinate of the bottom-right corner of the bounding box.
17
+ y2 (float): The y-coordinate of the bottom-right corner of the bounding box.
18
+
19
+ The bounding box is defined by two points:
20
+ - (x1, y1): The top-left corner of the box.
21
+ - (x2, y2): The bottom-right corner of the box.
22
+ """
23
+
24
+ title: str = Field(default="")
25
+ x1: float = Field(default=0)
26
+ y1: float = Field(default=0)
27
+ x2: float = Field(default=0)
28
+ y2: float = Field(default=0)
29
+
30
+ @staticmethod
31
+ def from_xywh(bbox: list[float], title: Optional[str] = None) -> "BBox":
32
+ """
33
+ Converts a bounding box in (x, y, width, height) format
34
+ to a BBox data model instance.
35
+
36
+ Args:
37
+ bbox (list[float]): A bounding box, represented as a list
38
+ of four floats [x, y, width, height].
39
+
40
+ Returns:
41
+ BBox2D: An instance of the BBox data model.
42
+ """
43
+ assert len(bbox) == 4, f"Bounding box must have 4 elements, got f{len(bbox)}"
44
+ x, y, w, h = bbox
45
+ return BBox(title=title or "", x1=x, y1=y, x2=x + w, y2=y + h)
@@ -0,0 +1,37 @@
1
+ from pydantic import Field
2
+
3
+ from datachain.lib.data_model import DataModel
4
+
5
+
6
+ class Pose(DataModel):
7
+ """
8
+ A data model for representing pose keypoints.
9
+
10
+ Attributes:
11
+ x (list[float]): The x-coordinates of the keypoints.
12
+ y (list[float]): The y-coordinates of the keypoints.
13
+
14
+ The keypoints are represented as lists of x and y coordinates, where each index
15
+ corresponds to a specific body part.
16
+ """
17
+
18
+ x: list[float] = Field(default=None)
19
+ y: list[float] = Field(default=None)
20
+
21
+
22
+ class Pose3D(DataModel):
23
+ """
24
+ A data model for representing 3D pose keypoints.
25
+
26
+ Attributes:
27
+ x (list[float]): The x-coordinates of the keypoints.
28
+ y (list[float]): The y-coordinates of the keypoints.
29
+ visible (list[float]): The visibility of the keypoints.
30
+
31
+ The keypoints are represented as lists of x, y, and visibility values,
32
+ where each index corresponds to a specific body part.
33
+ """
34
+
35
+ x: list[float] = Field(default=None)
36
+ y: list[float] = Field(default=None)
37
+ visible: list[float] = Field(default=None)
@@ -0,0 +1,39 @@
1
+ """
2
+ This module contains the YOLO models.
3
+
4
+ YOLO stands for "You Only Look Once", a family of object detection models that
5
+ are designed to be fast and accurate. The models are trained to detect objects
6
+ in images by dividing the image into a grid and predicting the bounding boxes
7
+ and class probabilities for each grid cell.
8
+
9
+ More information about YOLO can be found here:
10
+ - https://pjreddie.com/darknet/yolo/
11
+ - https://docs.ultralytics.com/
12
+ """
13
+
14
+
15
+ class PoseBodyPart:
16
+ """
17
+ An enumeration of body parts for YOLO pose keypoints.
18
+
19
+ More information about the body parts can be found here:
20
+ https://docs.ultralytics.com/tasks/pose/
21
+ """
22
+
23
+ nose = 0
24
+ left_eye = 1
25
+ right_eye = 2
26
+ left_ear = 3
27
+ right_ear = 4
28
+ left_shoulder = 5
29
+ right_shoulder = 6
30
+ left_elbow = 7
31
+ right_elbow = 8
32
+ left_wrist = 9
33
+ right_wrist = 10
34
+ left_hip = 11
35
+ right_hip = 12
36
+ left_knee = 13
37
+ right_knee = 14
38
+ left_ankle = 15
39
+ right_ankle = 16
@@ -7,11 +7,19 @@ class SettingsError(DataChainParamsError):
7
7
 
8
8
 
9
9
  class Settings:
10
- def __init__(self, cache=None, parallel=None, workers=None, min_task_size=None):
10
+ def __init__(
11
+ self,
12
+ cache=None,
13
+ parallel=None,
14
+ workers=None,
15
+ min_task_size=None,
16
+ prefetch=None,
17
+ ):
11
18
  self._cache = cache
12
19
  self.parallel = parallel
13
20
  self._workers = workers
14
21
  self.min_task_size = min_task_size
22
+ self.prefetch = prefetch
15
23
 
16
24
  if not isinstance(cache, bool) and cache is not None:
17
25
  raise SettingsError(
@@ -66,3 +74,5 @@ class Settings:
66
74
  self.parallel = settings.parallel or self.parallel
67
75
  self._workers = settings._workers or self._workers
68
76
  self.min_task_size = settings.min_task_size or self.min_task_size
77
+ if settings.prefetch is not None:
78
+ self.prefetch = settings.prefetch
@@ -1,3 +1,4 @@
1
+ import contextlib
1
2
  import sys
2
3
  import traceback
3
4
  from collections.abc import Iterable, Iterator, Mapping, Sequence
@@ -7,6 +8,7 @@ import attrs
7
8
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
8
9
  from pydantic import BaseModel
9
10
 
11
+ from datachain.asyn import AsyncMapper
10
12
  from datachain.dataset import RowDict
11
13
  from datachain.lib.convert.flatten import flatten
12
14
  from datachain.lib.data_model import DataValue
@@ -21,6 +23,8 @@ from datachain.query.batch import (
21
23
  )
22
24
 
23
25
  if TYPE_CHECKING:
26
+ from collections import abc
27
+
24
28
  from typing_extensions import Self
25
29
 
26
30
  from datachain.catalog import Catalog
@@ -276,9 +280,18 @@ class UDFBase(AbstractUDF):
276
280
  return result_objs
277
281
 
278
282
 
283
async def _prefetch_input(row):
    """
    Prefetch every `File` object found in `row`, then hand the row back.

    The `File` entries are awaited one at a time, in the order they appear in
    the row; entries of any other type are left untouched. The very same `row`
    object is returned so the coroutine can be used as a pass-through mapping
    step.
    """
    # Lazy filter: each isinstance check happens right before its own await,
    # exactly as in a plain loop with an inner `if`.
    file_objects = (item for item in row if isinstance(item, File))
    for file_obj in file_objects:
        await file_obj._prefetch()
    return row
288
+
289
+
279
290
  class Mapper(UDFBase):
280
291
  """Inherit from this class to pass to `DataChain.map()`."""
281
292
 
293
+ prefetch: int = 2
294
+
282
295
  def run(
283
296
  self,
284
297
  udf_fields: "Sequence[str]",
@@ -290,16 +303,22 @@ class Mapper(UDFBase):
290
303
  ) -> Iterator[Iterable[UDFResult]]:
291
304
  self.catalog = catalog
292
305
  self.setup()
293
-
294
- for row in udf_inputs:
295
- id_, *udf_args = self._prepare_row_and_id(
296
- row, udf_fields, cache, download_cb
297
- )
298
- result_objs = self.process_safe(udf_args)
299
- udf_output = self._flatten_row(result_objs)
300
- output = [{"sys__id": id_} | dict(zip(self.signal_names, udf_output))]
301
- processed_cb.relative_update(1)
302
- yield output
306
+ prepared_inputs: abc.Generator[Sequence[Any], None, None] = (
307
+ self._prepare_row_and_id(row, udf_fields, cache, download_cb)
308
+ for row in udf_inputs
309
+ )
310
+ if self.prefetch > 0:
311
+ prepared_inputs = AsyncMapper(
312
+ _prefetch_input, prepared_inputs, workers=self.prefetch
313
+ ).iterate()
314
+
315
+ with contextlib.closing(prepared_inputs):
316
+ for id_, *udf_args in prepared_inputs:
317
+ result_objs = self.process_safe(udf_args)
318
+ udf_output = self._flatten_row(result_objs)
319
+ output = [{"sys__id": id_} | dict(zip(self.signal_names, udf_output))]
320
+ processed_cb.relative_update(1)
321
+ yield output
303
322
 
304
323
  self.teardown()
305
324
 
@@ -349,6 +368,7 @@ class Generator(UDFBase):
349
368
  """Inherit from this class to pass to `DataChain.gen()`."""
350
369
 
351
370
  is_output_batched = True
371
+ prefetch: int = 2
352
372
 
353
373
  def run(
354
374
  self,
@@ -361,14 +381,21 @@ class Generator(UDFBase):
361
381
  ) -> Iterator[Iterable[UDFResult]]:
362
382
  self.catalog = catalog
363
383
  self.setup()
364
-
365
- for row in udf_inputs:
366
- udf_args = self._prepare_row(row, udf_fields, cache, download_cb)
367
- result_objs = self.process_safe(udf_args)
368
- udf_outputs = (self._flatten_row(row) for row in result_objs)
369
- output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
370
- processed_cb.relative_update(1)
371
- yield output
384
+ prepared_inputs: abc.Generator[Sequence[Any], None, None] = (
385
+ self._prepare_row(row, udf_fields, cache, download_cb) for row in udf_inputs
386
+ )
387
+ if self.prefetch > 0:
388
+ prepared_inputs = AsyncMapper(
389
+ _prefetch_input, prepared_inputs, workers=self.prefetch
390
+ ).iterate()
391
+
392
+ with contextlib.closing(prepared_inputs):
393
+ for row in prepared_inputs:
394
+ result_objs = self.process_safe(row)
395
+ udf_outputs = (self._flatten_row(row) for row in result_objs)
396
+ output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
397
+ processed_cb.relative_update(1)
398
+ yield output
372
399
 
373
400
  self.teardown()
374
401
 
@@ -55,7 +55,7 @@ class Node:
55
55
  last_modified: Optional[datetime] = None
56
56
  size: int = 0
57
57
  location: Optional[str] = None
58
- source: StorageURI = StorageURI("")
58
+ source: StorageURI = StorageURI("") # noqa: RUF009
59
59
  dir_type: int = DirType.FILE
60
60
 
61
61
  @property
@@ -473,33 +473,31 @@ class UDFStep(Step, ABC):
473
473
  # Otherwise process single-threaded (faster for smaller UDFs)
474
474
  warehouse = self.catalog.warehouse
475
475
 
476
- with contextlib.closing(
477
- batching(warehouse.dataset_select_paginated, query)
478
- ) as udf_inputs:
479
- download_cb = get_download_callback()
480
- processed_cb = get_processed_callback()
481
- generated_cb = get_generated_callback(self.is_generator)
482
- try:
483
- udf_results = self.udf.run(
484
- udf_fields,
485
- udf_inputs,
486
- self.catalog,
487
- self.is_generator,
488
- self.cache,
489
- download_cb,
490
- processed_cb,
491
- )
492
- process_udf_outputs(
493
- warehouse,
494
- udf_table,
495
- udf_results,
496
- self.udf,
497
- cb=generated_cb,
498
- )
499
- finally:
500
- download_cb.close()
501
- processed_cb.close()
502
- generated_cb.close()
476
+ udf_inputs = batching(warehouse.dataset_select_paginated, query)
477
+ download_cb = get_download_callback()
478
+ processed_cb = get_processed_callback()
479
+ generated_cb = get_generated_callback(self.is_generator)
480
+ try:
481
+ udf_results = self.udf.run(
482
+ udf_fields,
483
+ udf_inputs,
484
+ self.catalog,
485
+ self.is_generator,
486
+ self.cache,
487
+ download_cb,
488
+ processed_cb,
489
+ )
490
+ process_udf_outputs(
491
+ warehouse,
492
+ udf_table,
493
+ udf_results,
494
+ self.udf,
495
+ cb=generated_cb,
496
+ )
497
+ finally:
498
+ download_cb.close()
499
+ processed_cb.close()
500
+ generated_cb.close()
503
501
 
504
502
  warehouse.insert_rows_done(udf_table)
505
503
 
@@ -36,7 +36,14 @@ def convert_array(arr):
36
36
 
37
37
 
38
38
def adapt_np_array(arr):
    """
    Encode a NumPy array as a JSON text string.

    The array is serialized with ``orjson`` using ``OPT_SERIALIZE_NUMPY``.
    When orjson cannot serialize a value natively it calls the fallback
    below, which converts any ``np.ndarray`` (for example an object-dtype
    array, or an array nested inside one) to plain Python lists.

    Args:
        arr: The array to serialize.

    Returns:
        str: The UTF-8 decoded JSON document.

    Raises:
        orjson.JSONEncodeError: If the array contains a value that is neither
            natively serializable nor an ``np.ndarray``.
    """

    def _json_serialize(obj):
        # Fallback hook for values orjson does not know how to encode.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Returning `obj` unchanged would make orjson call this hook again
        # with the same value until its recursion limit is hit, hiding the
        # real cause. Raising tells orjson the type is unsupported.
        raise TypeError(f"Type is not JSON serializable: {type(obj).__name__}")

    return orjson.dumps(
        arr, option=orjson.OPT_SERIALIZE_NUMPY, default=_json_serialize
    ).decode("utf-8")
40
47
 
41
48
 
42
49
  def adapt_np_generic(val):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.10
3
+ Version: 0.7.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -131,11 +131,7 @@ src/datachain/lib/func/func.py
131
131
  src/datachain/lib/models/__init__.py
132
132
  src/datachain/lib/models/bbox.py
133
133
  src/datachain/lib/models/pose.py
134
- src/datachain/lib/models/segment.py
135
- src/datachain/lib/models/ultralytics/__init__.py
136
- src/datachain/lib/models/ultralytics/bbox.py
137
- src/datachain/lib/models/ultralytics/pose.py
138
- src/datachain/lib/models/ultralytics/segment.py
134
+ src/datachain/lib/models/yolo.py
139
135
  src/datachain/query/__init__.py
140
136
  src/datachain/query/batch.py
141
137
  src/datachain/query/dataset.py
@@ -263,4 +259,5 @@ tests/unit/sql/test_random.py
263
259
  tests/unit/sql/test_selectable.py
264
260
  tests/unit/sql/test_string.py
265
261
  tests/unit/sql/sqlite/__init__.py
262
+ tests/unit/sql/sqlite/test_types.py
266
263
  tests/unit/sql/sqlite/test_utils.py
@@ -213,17 +213,20 @@ def test_from_storage_dependencies(cloud_test_catalog, cloud_type):
213
213
 
214
214
 
215
215
  @pytest.mark.parametrize("use_cache", [True, False])
216
- def test_map_file(cloud_test_catalog, use_cache):
216
+ @pytest.mark.parametrize("prefetch", [0, 2])
217
+ def test_map_file(cloud_test_catalog, use_cache, prefetch):
217
218
  ctc = cloud_test_catalog
218
219
 
219
220
  def new_signal(file: File) -> str:
221
+ assert bool(file.get_local_path()) is (use_cache and prefetch > 0)
220
222
  with file.open() as f:
221
223
  return file.name + " -> " + f.read().decode("utf-8")
222
224
 
223
225
  dc = (
224
226
  DataChain.from_storage(ctc.src_uri, session=ctc.session)
225
- .settings(cache=use_cache)
227
+ .settings(cache=use_cache, prefetch=prefetch)
226
228
  .map(signal=new_signal)
229
+ .save()
227
230
  )
228
231
 
229
232
  expected = {
@@ -36,5 +36,5 @@ DataChain.from_storage(
36
36
  "gs://dvcx-datalakes/dogs-and-cats/",
37
37
  anon=True,
38
38
  ).filter(C("file.path").glob("*cat*")).settings(parallel=1).map(
39
- name_len, params=["file.path"], output={"name_len": int}
39
+ name_len, params=["file"], output={"name_len": int}
40
40
  ).save("name_len")
@@ -0,0 +1,50 @@
1
+ from datachain.lib import models
2
+
3
+
4
def test_bbox():
    """A BBox built from explicit fields dumps back to exactly those fields."""
    fields = {"title": "BBox", "x1": 0.5, "y1": 1.5, "x2": 2.5, "y2": 3.5}
    bbox = models.BBox(**fields)
    assert bbox.model_dump() == fields
13
+
14
+
15
def test_bbox_from_xywh():
    """`BBox.from_xywh` turns (x, y, width, height) into corner coordinates."""
    xywh = (0.5, 1.5, 2.5, 3.5)
    # x2 = x + width and y2 = y + height for the input above.
    corners = {"x1": 0.5, "y1": 1.5, "x2": 3, "y2": 5}

    # Without a title the model falls back to an empty string.
    untitled = models.BBox.from_xywh(list(xywh))
    assert untitled.model_dump() == {"title": "", **corners}

    titled = models.BBox.from_xywh(list(xywh), title="BBox")
    assert titled.model_dump() == {"title": "BBox", **corners}
27
+
28
+
29
def test_pose():
    """Pose round-trips its coordinates and PoseBodyPart indexes into them."""
    xs = [i * 0.5 for i in range(17)]
    ys = [i * 1.5 for i in range(17)]
    pose = models.Pose(x=xs, y=ys)
    assert pose.model_dump() == {"x": xs, "y": ys}

    # Expected x-coordinate for each named body part.
    expected_x = {
        "nose": 0,
        "left_eye": 0.5,
        "right_eye": 1,
        "left_ear": 1.5,
        "right_ear": 2,
        "left_shoulder": 2.5,
        "right_shoulder": 3,
        "left_elbow": 3.5,
        "right_elbow": 4,
        "left_wrist": 4.5,
        "right_wrist": 5,
        "left_hip": 5.5,
        "right_hip": 6,
        "left_knee": 6.5,
        "right_knee": 7,
        "left_ankle": 7.5,
        "right_ankle": 8,
    }
    body_parts = models.yolo.PoseBodyPart
    for part_name, value in expected_x.items():
        assert pose.x[getattr(body_parts, part_name)] == value
@@ -0,0 +1,19 @@
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from datachain.sql.sqlite.types import adapt_np_array
5
+
6
+
7
@pytest.mark.parametrize(
    "dtype,arr,expected",
    [
        (dtype, arr, expected)
        # Every payload is checked for both a float array and an object array.
        for dtype in (float, np.dtypes.ObjectDType)
        for arr, expected in (
            ([], "[]"),
            ([0.5, 0.6], "[0.5,0.6]"),
            ([[0.5, 0.6], [0.7, 0.8]], "[[0.5,0.6],[0.7,0.8]]"),
        )
    ],
)
def test_adapt_np_array(dtype, arr, expected):
    """`adapt_np_array` emits compact JSON for empty, 1-D and 2-D arrays."""
    array = np.array(arr, dtype=dtype)
    assert adapt_np_array(array) == expected