datachain 0.3.13__tar.gz → 0.3.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (245)
  1. {datachain-0.3.13/src/datachain.egg-info → datachain-0.3.14}/PKG-INFO +1 -1
  2. {datachain-0.3.13 → datachain-0.3.14}/examples/computer_vision/iptc_exif_xmp_lib.py +7 -1
  3. {datachain-0.3.13 → datachain-0.3.14}/examples/computer_vision/llava2_image_desc_lib.py +7 -1
  4. {datachain-0.3.13 → datachain-0.3.14}/examples/get_started/json-csv-reader.py +0 -2
  5. {datachain-0.3.13 → datachain-0.3.14}/examples/get_started/torch-loader.py +6 -1
  6. {datachain-0.3.13 → datachain-0.3.14}/examples/get_started/udfs/stateful.py +2 -2
  7. {datachain-0.3.13 → datachain-0.3.14}/noxfile.py +1 -0
  8. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/asyn.py +4 -9
  9. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/catalog/catalog.py +2 -2
  10. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/client/azure.py +1 -13
  11. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/client/fsspec.py +7 -7
  12. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/client/gcs.py +2 -13
  13. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/client/hf.py +0 -10
  14. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/client/local.py +3 -12
  15. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/client/s3.py +9 -19
  16. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/data_storage/sqlite.py +10 -1
  17. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/data_storage/warehouse.py +11 -17
  18. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/listing.py +1 -2
  19. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/model_store.py +2 -2
  20. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/pytorch.py +32 -26
  21. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/signal_schema.py +146 -58
  22. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/listing.py +6 -8
  23. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/node.py +0 -43
  24. {datachain-0.3.13 → datachain-0.3.14/src/datachain.egg-info}/PKG-INFO +1 -1
  25. {datachain-0.3.13 → datachain-0.3.14}/tests/conftest.py +35 -0
  26. {datachain-0.3.13 → datachain-0.3.14}/tests/data.py +11 -11
  27. {datachain-0.3.13 → datachain-0.3.14}/tests/examples/test_wds_e2e.py +10 -8
  28. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_catalog.py +28 -3
  29. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_datachain.py +164 -3
  30. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_dataset_query.py +6 -186
  31. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_feature_pickling.py +66 -1
  32. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_pull.py +1 -2
  33. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_query.py +3 -0
  34. {datachain-0.3.13 → datachain-0.3.14}/tests/test_cli_e2e.py +10 -3
  35. {datachain-0.3.13 → datachain-0.3.14}/tests/test_query_e2e.py +10 -3
  36. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_datachain.py +1 -0
  37. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_signal_schema.py +244 -8
  38. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_id_generator.py +3 -1
  39. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_listing.py +3 -2
  40. {datachain-0.3.13 → datachain-0.3.14}/.cruft.json +0 -0
  41. {datachain-0.3.13 → datachain-0.3.14}/.gitattributes +0 -0
  42. {datachain-0.3.13 → datachain-0.3.14}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  43. {datachain-0.3.13 → datachain-0.3.14}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  44. {datachain-0.3.13 → datachain-0.3.14}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  45. {datachain-0.3.13 → datachain-0.3.14}/.github/codecov.yaml +0 -0
  46. {datachain-0.3.13 → datachain-0.3.14}/.github/dependabot.yml +0 -0
  47. {datachain-0.3.13 → datachain-0.3.14}/.github/workflows/benchmarks.yml +0 -0
  48. {datachain-0.3.13 → datachain-0.3.14}/.github/workflows/release.yml +0 -0
  49. {datachain-0.3.13 → datachain-0.3.14}/.github/workflows/tests-studio.yml +0 -0
  50. {datachain-0.3.13 → datachain-0.3.14}/.github/workflows/tests.yml +0 -0
  51. {datachain-0.3.13 → datachain-0.3.14}/.github/workflows/update-template.yaml +0 -0
  52. {datachain-0.3.13 → datachain-0.3.14}/.gitignore +0 -0
  53. {datachain-0.3.13 → datachain-0.3.14}/.pre-commit-config.yaml +0 -0
  54. {datachain-0.3.13 → datachain-0.3.14}/CODE_OF_CONDUCT.rst +0 -0
  55. {datachain-0.3.13 → datachain-0.3.14}/CONTRIBUTING.rst +0 -0
  56. {datachain-0.3.13 → datachain-0.3.14}/LICENSE +0 -0
  57. {datachain-0.3.13 → datachain-0.3.14}/README.rst +0 -0
  58. {datachain-0.3.13 → datachain-0.3.14}/docs/assets/captioned_cartoons.png +0 -0
  59. {datachain-0.3.13 → datachain-0.3.14}/docs/assets/datachain-white.svg +0 -0
  60. {datachain-0.3.13 → datachain-0.3.14}/docs/assets/datachain.svg +0 -0
  61. {datachain-0.3.13 → datachain-0.3.14}/docs/assets/flowchart.png +0 -0
  62. {datachain-0.3.13 → datachain-0.3.14}/docs/index.md +0 -0
  63. {datachain-0.3.13 → datachain-0.3.14}/docs/references/datachain.md +0 -0
  64. {datachain-0.3.13 → datachain-0.3.14}/docs/references/datatype.md +0 -0
  65. {datachain-0.3.13 → datachain-0.3.14}/docs/references/file.md +0 -0
  66. {datachain-0.3.13 → datachain-0.3.14}/docs/references/index.md +0 -0
  67. {datachain-0.3.13 → datachain-0.3.14}/docs/references/sql.md +0 -0
  68. {datachain-0.3.13 → datachain-0.3.14}/docs/references/torch.md +0 -0
  69. {datachain-0.3.13 → datachain-0.3.14}/docs/references/udf.md +0 -0
  70. {datachain-0.3.13 → datachain-0.3.14}/examples/computer_vision/openimage-detect.py +0 -0
  71. {datachain-0.3.13 → datachain-0.3.14}/examples/get_started/common_sql_functions.py +0 -0
  72. {datachain-0.3.13 → datachain-0.3.14}/examples/get_started/udfs/parallel.py +0 -0
  73. {datachain-0.3.13 → datachain-0.3.14}/examples/get_started/udfs/simple.py +0 -0
  74. {datachain-0.3.13 → datachain-0.3.14}/examples/llm_and_nlp/claude-query.py +0 -0
  75. {datachain-0.3.13 → datachain-0.3.14}/examples/llm_and_nlp/unstructured-text.py +0 -0
  76. {datachain-0.3.13 → datachain-0.3.14}/examples/multimodal/clip_inference.py +0 -0
  77. {datachain-0.3.13 → datachain-0.3.14}/examples/multimodal/hf_pipeline.py +0 -0
  78. {datachain-0.3.13 → datachain-0.3.14}/examples/multimodal/openai_image_desc_lib.py +0 -0
  79. {datachain-0.3.13 → datachain-0.3.14}/examples/multimodal/wds.py +0 -0
  80. {datachain-0.3.13 → datachain-0.3.14}/examples/multimodal/wds_filtered.py +0 -0
  81. {datachain-0.3.13 → datachain-0.3.14}/mkdocs.yml +0 -0
  82. {datachain-0.3.13 → datachain-0.3.14}/pyproject.toml +0 -0
  83. {datachain-0.3.13 → datachain-0.3.14}/setup.cfg +0 -0
  84. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/__init__.py +0 -0
  85. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/__main__.py +0 -0
  86. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/cache.py +0 -0
  87. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/catalog/__init__.py +0 -0
  88. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/catalog/datasource.py +0 -0
  89. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/catalog/loader.py +0 -0
  90. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/cli.py +0 -0
  91. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/cli_utils.py +0 -0
  92. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/client/__init__.py +0 -0
  93. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/client/fileslice.py +0 -0
  94. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/config.py +0 -0
  95. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/data_storage/__init__.py +0 -0
  96. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/data_storage/db_engine.py +0 -0
  97. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/data_storage/id_generator.py +0 -0
  98. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/data_storage/job.py +0 -0
  99. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/data_storage/metastore.py +0 -0
  100. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/data_storage/schema.py +0 -0
  101. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/data_storage/serializer.py +0 -0
  102. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/dataset.py +0 -0
  103. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/error.py +0 -0
  104. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/job.py +0 -0
  105. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/__init__.py +0 -0
  106. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/arrow.py +0 -0
  107. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/clip.py +0 -0
  108. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/convert/__init__.py +0 -0
  109. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/convert/flatten.py +0 -0
  110. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/convert/python_to_sql.py +0 -0
  111. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/convert/sql_to_python.py +0 -0
  112. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/convert/unflatten.py +0 -0
  113. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  114. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/data_model.py +0 -0
  115. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/dataset_info.py +0 -0
  116. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/dc.py +0 -0
  117. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/file.py +0 -0
  118. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/hf.py +0 -0
  119. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/image.py +0 -0
  120. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/listing_info.py +0 -0
  121. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/meta_formats.py +0 -0
  122. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/settings.py +0 -0
  123. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/text.py +0 -0
  124. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/udf.py +0 -0
  125. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/udf_signature.py +0 -0
  126. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/utils.py +0 -0
  127. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/vfile.py +0 -0
  128. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/webdataset.py +0 -0
  129. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/lib/webdataset_laion.py +0 -0
  130. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/nodes_fetcher.py +0 -0
  131. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/nodes_thread_pool.py +0 -0
  132. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/progress.py +0 -0
  133. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/py.typed +0 -0
  134. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/query/__init__.py +0 -0
  135. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/query/batch.py +0 -0
  136. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/query/builtins.py +0 -0
  137. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/query/dataset.py +0 -0
  138. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/query/dispatch.py +0 -0
  139. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/query/metrics.py +0 -0
  140. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/query/params.py +0 -0
  141. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/query/queue.py +0 -0
  142. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/query/schema.py +0 -0
  143. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/query/session.py +0 -0
  144. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/query/udf.py +0 -0
  145. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/remote/__init__.py +0 -0
  146. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/remote/studio.py +0 -0
  147. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/__init__.py +0 -0
  148. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/default/__init__.py +0 -0
  149. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/default/base.py +0 -0
  150. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/functions/__init__.py +0 -0
  151. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/functions/array.py +0 -0
  152. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/functions/conditional.py +0 -0
  153. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/functions/path.py +0 -0
  154. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/functions/random.py +0 -0
  155. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/functions/string.py +0 -0
  156. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/selectable.py +0 -0
  157. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/sqlite/__init__.py +0 -0
  158. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/sqlite/base.py +0 -0
  159. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/sqlite/types.py +0 -0
  160. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/sqlite/vector.py +0 -0
  161. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/types.py +0 -0
  162. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/sql/utils.py +0 -0
  163. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/storage.py +0 -0
  164. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/torch/__init__.py +0 -0
  165. {datachain-0.3.13 → datachain-0.3.14}/src/datachain/utils.py +0 -0
  166. {datachain-0.3.13 → datachain-0.3.14}/src/datachain.egg-info/SOURCES.txt +0 -0
  167. {datachain-0.3.13 → datachain-0.3.14}/src/datachain.egg-info/dependency_links.txt +0 -0
  168. {datachain-0.3.13 → datachain-0.3.14}/src/datachain.egg-info/entry_points.txt +0 -0
  169. {datachain-0.3.13 → datachain-0.3.14}/src/datachain.egg-info/requires.txt +0 -0
  170. {datachain-0.3.13 → datachain-0.3.14}/src/datachain.egg-info/top_level.txt +0 -0
  171. {datachain-0.3.13 → datachain-0.3.14}/tests/__init__.py +0 -0
  172. {datachain-0.3.13 → datachain-0.3.14}/tests/benchmarks/__init__.py +0 -0
  173. {datachain-0.3.13 → datachain-0.3.14}/tests/benchmarks/conftest.py +0 -0
  174. {datachain-0.3.13 → datachain-0.3.14}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  175. {datachain-0.3.13 → datachain-0.3.14}/tests/benchmarks/datasets/.dvc/config +0 -0
  176. {datachain-0.3.13 → datachain-0.3.14}/tests/benchmarks/datasets/.gitignore +0 -0
  177. {datachain-0.3.13 → datachain-0.3.14}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  178. {datachain-0.3.13 → datachain-0.3.14}/tests/benchmarks/test_datachain.py +0 -0
  179. {datachain-0.3.13 → datachain-0.3.14}/tests/benchmarks/test_ls.py +0 -0
  180. {datachain-0.3.13 → datachain-0.3.14}/tests/benchmarks/test_version.py +0 -0
  181. {datachain-0.3.13 → datachain-0.3.14}/tests/examples/__init__.py +0 -0
  182. {datachain-0.3.13 → datachain-0.3.14}/tests/examples/test_examples.py +0 -0
  183. {datachain-0.3.13 → datachain-0.3.14}/tests/examples/wds_data.py +0 -0
  184. {datachain-0.3.13 → datachain-0.3.14}/tests/func/__init__.py +0 -0
  185. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_client.py +0 -0
  186. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_datasets.py +0 -0
  187. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_listing.py +0 -0
  188. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_ls.py +0 -0
  189. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_meta_formats.py +0 -0
  190. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_metrics.py +0 -0
  191. {datachain-0.3.13 → datachain-0.3.14}/tests/func/test_pytorch.py +0 -0
  192. {datachain-0.3.13 → datachain-0.3.14}/tests/scripts/feature_class.py +0 -0
  193. {datachain-0.3.13 → datachain-0.3.14}/tests/scripts/feature_class_parallel.py +0 -0
  194. {datachain-0.3.13 → datachain-0.3.14}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  195. {datachain-0.3.13 → datachain-0.3.14}/tests/scripts/name_len_slow.py +0 -0
  196. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/__init__.py +0 -0
  197. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/__init__.py +0 -0
  198. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/conftest.py +0 -0
  199. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_arrow.py +0 -0
  200. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_clip.py +0 -0
  201. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  202. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_datachain_merge.py +0 -0
  203. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_feature.py +0 -0
  204. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_feature_utils.py +0 -0
  205. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_file.py +0 -0
  206. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_hf.py +0 -0
  207. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_image.py +0 -0
  208. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_schema.py +0 -0
  209. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_sql_to_python.py +0 -0
  210. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_text.py +0 -0
  211. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_udf_signature.py +0 -0
  212. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_utils.py +0 -0
  213. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/lib/test_webdataset.py +0 -0
  214. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/sql/__init__.py +0 -0
  215. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/sql/sqlite/__init__.py +0 -0
  216. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/sql/sqlite/test_utils.py +0 -0
  217. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/sql/test_array.py +0 -0
  218. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/sql/test_conditional.py +0 -0
  219. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/sql/test_path.py +0 -0
  220. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/sql/test_random.py +0 -0
  221. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/sql/test_selectable.py +0 -0
  222. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/sql/test_string.py +0 -0
  223. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_asyn.py +0 -0
  224. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_cache.py +0 -0
  225. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_catalog.py +0 -0
  226. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_catalog_loader.py +0 -0
  227. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_cli_parsing.py +0 -0
  228. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_client.py +0 -0
  229. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_client_s3.py +0 -0
  230. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_data_storage.py +0 -0
  231. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_database_engine.py +0 -0
  232. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_dataset.py +0 -0
  233. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_dispatch.py +0 -0
  234. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_fileslice.py +0 -0
  235. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_metastore.py +0 -0
  236. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_module_exports.py +0 -0
  237. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_query_metrics.py +0 -0
  238. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_query_params.py +0 -0
  239. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_serializer.py +0 -0
  240. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_session.py +0 -0
  241. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_storage.py +0 -0
  242. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_udf.py +0 -0
  243. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_utils.py +0 -0
  244. {datachain-0.3.13 → datachain-0.3.14}/tests/unit/test_warehouse.py +0 -0
  245. {datachain-0.3.13 → datachain-0.3.14}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.13
3
+ Version: 0.3.14
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -1,4 +1,10 @@
1
- # pip install defusedxml
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
2
8
  import json
3
9
 
4
10
  from PIL import (
@@ -1,4 +1,10 @@
1
- # pip install accelerate torch
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
2
8
  import torch
3
9
  from transformers import (
4
10
  AutoProcessor,
@@ -1,5 +1,3 @@
1
- # pip install datamodel-code-generator jmespath
2
-
3
1
  from typing import Optional
4
2
 
5
3
  from pydantic import BaseModel
@@ -1,4 +1,9 @@
1
- # pip install Pillow torchvision
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[torch]
5
+
6
+ """
2
7
 
3
8
  import os
4
9
  from posixpath import basename
@@ -1,7 +1,7 @@
1
1
  """
2
- To install dependencies:
2
+ To install the required dependencies:
3
3
 
4
- pip install open_clip_torch
4
+ pip install datachain[examples]
5
5
 
6
6
  """
7
7
 
@@ -40,6 +40,7 @@ def tests(session: nox.Session) -> None:
40
40
  "--cov-report=xml",
41
41
  "--durations=10",
42
42
  "--numprocesses=logical",
43
+ "--dist=loadgroup",
43
44
  *session.posargs,
44
45
  env={"COVERAGE_FILE": f".coverage.{session.python}"},
45
46
  )
@@ -1,14 +1,8 @@
1
1
  import asyncio
2
- from collections.abc import Awaitable, Coroutine, Iterable
2
+ from collections.abc import AsyncIterable, Awaitable, Coroutine, Iterable, Iterator
3
3
  from concurrent.futures import ThreadPoolExecutor
4
4
  from heapq import heappop, heappush
5
- from typing import (
6
- Any,
7
- Callable,
8
- Generic,
9
- Optional,
10
- TypeVar,
11
- )
5
+ from typing import Any, Callable, Generic, Optional, TypeVar
12
6
 
13
7
  from fsspec.asyn import get_loop
14
8
 
@@ -16,6 +10,7 @@ ASYNC_WORKERS = 20
16
10
 
17
11
  InputT = TypeVar("InputT", contravariant=True) # noqa: PLC0105
18
12
  ResultT = TypeVar("ResultT", covariant=True) # noqa: PLC0105
13
+ T = TypeVar("T")
19
14
 
20
15
 
21
16
  class AsyncMapper(Generic[InputT, ResultT]):
@@ -226,7 +221,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
226
221
  self._push_result(self._next_yield, None)
227
222
 
228
223
 
229
- def iter_over_async(ait, loop):
224
+ def iter_over_async(ait: AsyncIterable[T], loop) -> Iterator[T]:
230
225
  """Wrap an asynchronous iterator into a synchronous one"""
231
226
  ait = ait.__aiter__()
232
227
 
@@ -1390,12 +1390,12 @@ class Catalog:
1390
1390
  dataset = self.get_dataset(name)
1391
1391
  return self.warehouse.dataset_table_export_file_names(dataset, version)
1392
1392
 
1393
- def dataset_stats(self, name: str, version: int) -> DatasetStats:
1393
+ def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
1394
1394
  """
1395
1395
  Returns tuple with dataset stats: total number of rows and total dataset size.
1396
1396
  """
1397
1397
  dataset = self.get_dataset(name)
1398
- dataset_version = dataset.get_version(version)
1398
+ dataset_version = dataset.get_version(version or dataset.latest_version)
1399
1399
  return DatasetStats(
1400
1400
  num_objects=dataset_version.num_objects,
1401
1401
  size=dataset_version.size,
@@ -4,7 +4,6 @@ from adlfs import AzureBlobFileSystem
4
4
  from tqdm import tqdm
5
5
 
6
6
  from datachain.lib.file import File
7
- from datachain.node import Entry
8
7
 
9
8
  from .fsspec import DELIMITER, Client, ResultQueue
10
9
 
@@ -14,17 +13,6 @@ class AzureClient(Client):
14
13
  PREFIX = "az://"
15
14
  protocol = "az"
16
15
 
17
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
18
- version_id = v.get("version_id")
19
- return Entry.from_file(
20
- path=path,
21
- etag=v.get("etag", "").strip('"'),
22
- version=version_id or "",
23
- is_latest=version_id is None or bool(v.get("is_current_version")),
24
- last_modified=v["last_modified"],
25
- size=v.get("size", ""),
26
- )
27
-
28
16
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
29
17
  version_id = v.get("version_id")
30
18
  return File(
@@ -57,7 +45,7 @@ class AzureClient(Client):
57
45
  continue
58
46
  info = (await self.fs._details([b]))[0]
59
47
  entries.append(
60
- self.convert_info(info, self.rel_path(info["name"]))
48
+ self.info_to_file(info, self.rel_path(info["name"]))
61
49
  )
62
50
  if entries:
63
51
  await result_queue.put(entries)
@@ -29,7 +29,7 @@ from tqdm import tqdm
29
29
  from datachain.cache import DataChainCache, UniqueId
30
30
  from datachain.client.fileslice import FileSlice, FileWrapper
31
31
  from datachain.error import ClientError as DataChainClientError
32
- from datachain.node import Entry
32
+ from datachain.lib.file import File
33
33
  from datachain.nodes_fetcher import NodesFetcher
34
34
  from datachain.nodes_thread_pool import NodeChunk
35
35
  from datachain.storage import StorageURI
@@ -45,7 +45,7 @@ DELIMITER = "/" # Path delimiter.
45
45
 
46
46
  DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
47
47
 
48
- ResultQueue = asyncio.Queue[Optional[Sequence[Entry]]]
48
+ ResultQueue = asyncio.Queue[Optional[Sequence[File]]]
49
49
 
50
50
 
51
51
  def _is_win_local_path(uri: str) -> bool:
@@ -188,7 +188,7 @@ class Client(ABC):
188
188
 
189
189
  async def get_current_etag(self, uid: UniqueId) -> str:
190
190
  info = await self.fs._info(self.get_full_path(uid.path))
191
- return self.convert_info(info, "").etag
191
+ return self.info_to_file(info, "").etag
192
192
 
193
193
  async def get_size(self, path: str) -> int:
194
194
  return await self.fs._size(path)
@@ -198,7 +198,7 @@ class Client(ABC):
198
198
 
199
199
  async def scandir(
200
200
  self, start_prefix: str, method: str = "default"
201
- ) -> AsyncIterator[Sequence[Entry]]:
201
+ ) -> AsyncIterator[Sequence[File]]:
202
202
  try:
203
203
  impl = getattr(self, f"_fetch_{method}")
204
204
  except AttributeError:
@@ -264,7 +264,7 @@ class Client(ABC):
264
264
  ) -> None:
265
265
  await self._fetch_nested(start_prefix, result_queue)
266
266
 
267
- async def _fetch_dir(self, prefix, pbar, result_queue) -> set[str]:
267
+ async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]:
268
268
  path = f"{self.name}/{prefix}"
269
269
  infos = await self.ls_dir(path)
270
270
  files = []
@@ -277,7 +277,7 @@ class Client(ABC):
277
277
  if info["type"] == "directory":
278
278
  subdirs.add(subprefix)
279
279
  else:
280
- files.append(self.convert_info(info, subprefix))
280
+ files.append(self.info_to_file(info, subprefix))
281
281
  if files:
282
282
  await result_queue.put(files)
283
283
  found_count = len(subdirs) + len(files)
@@ -303,7 +303,7 @@ class Client(ABC):
303
303
  return f"{self.PREFIX}{self.name}/{rel_path}"
304
304
 
305
305
  @abstractmethod
306
- def convert_info(self, v: dict[str, Any], parent: str) -> Entry: ...
306
+ def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...
307
307
 
308
308
  def fetch_nodes(
309
309
  self,
@@ -10,7 +10,6 @@ from gcsfs import GCSFileSystem
10
10
  from tqdm import tqdm
11
11
 
12
12
  from datachain.lib.file import File
13
- from datachain.node import Entry
14
13
 
15
14
  from .fsspec import DELIMITER, Client, ResultQueue
16
15
 
@@ -108,19 +107,9 @@ class GCSClient(Client):
108
107
  finally:
109
108
  await page_queue.put(None)
110
109
 
111
- def _entry_from_dict(self, d: dict[str, Any]) -> Entry:
110
+ def _entry_from_dict(self, d: dict[str, Any]) -> File:
112
111
  info = self.fs._process_object(self.name, d)
113
- return self.convert_info(info, self.rel_path(info["name"]))
114
-
115
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
116
- return Entry.from_file(
117
- path=path,
118
- etag=v.get("etag", ""),
119
- version=v.get("generation", ""),
120
- is_latest=not v.get("timeDeleted"),
121
- last_modified=self.parse_timestamp(v["updated"]),
122
- size=v.get("size", ""),
123
- )
112
+ return self.info_to_file(info, self.rel_path(info["name"]))
124
113
 
125
114
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
126
115
  return File(
@@ -5,7 +5,6 @@ from typing import Any, cast
5
5
  from huggingface_hub import HfFileSystem
6
6
 
7
7
  from datachain.lib.file import File
8
- from datachain.node import Entry
9
8
 
10
9
  from .fsspec import Client
11
10
 
@@ -22,15 +21,6 @@ class HfClient(Client):
22
21
 
23
22
  return cast(HfFileSystem, super().create_fs(**kwargs))
24
23
 
25
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
26
- return Entry.from_file(
27
- path=path,
28
- size=v["size"],
29
- version=v["last_commit"].oid,
30
- etag=v.get("blob_id", ""),
31
- last_modified=v["last_commit"].date,
32
- )
33
-
34
24
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
35
25
  return File(
36
26
  path=path,
@@ -7,8 +7,8 @@ from urllib.parse import urlparse
7
7
 
8
8
  from fsspec.implementations.local import LocalFileSystem
9
9
 
10
+ from datachain.cache import UniqueId
10
11
  from datachain.lib.file import File
11
- from datachain.node import Entry
12
12
  from datachain.storage import StorageURI
13
13
 
14
14
  from .fsspec import Client
@@ -114,9 +114,9 @@ class FileClient(Client):
114
114
  use_symlinks=use_symlinks,
115
115
  )
116
116
 
117
- async def get_current_etag(self, uid) -> str:
117
+ async def get_current_etag(self, uid: UniqueId) -> str:
118
118
  info = self.fs.info(self.get_full_path(uid.path))
119
- return self.convert_info(info, "").etag
119
+ return self.info_to_file(info, "").etag
120
120
 
121
121
  async def get_size(self, path: str) -> int:
122
122
  return self.fs.size(path)
@@ -136,15 +136,6 @@ class FileClient(Client):
136
136
  full_path += "/"
137
137
  return full_path
138
138
 
139
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
140
- return Entry.from_file(
141
- path=path,
142
- etag=v["mtime"].hex(),
143
- is_latest=True,
144
- last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
145
- size=v.get("size", ""),
146
- )
147
-
148
139
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
149
140
  return File(
150
141
  source=self.uri,
@@ -1,12 +1,11 @@
1
1
  import asyncio
2
- from typing import Any, cast
2
+ from typing import Any, Optional, cast
3
3
 
4
4
  from botocore.exceptions import NoCredentialsError
5
5
  from s3fs import S3FileSystem
6
6
  from tqdm import tqdm
7
7
 
8
8
  from datachain.lib.file import File
9
- from datachain.node import Entry
10
9
 
11
10
  from .fsspec import DELIMITER, Client, ResultQueue
12
11
 
@@ -111,8 +110,9 @@ class ClientS3(Client):
111
110
  ) -> None:
112
111
  await self._fetch_flat(start_prefix, result_queue)
113
112
 
114
- def _entry_from_boto(self, v, bucket, versions=False):
115
- return Entry.from_file(
113
+ def _entry_from_boto(self, v, bucket, versions=False) -> File:
114
+ return File(
115
+ source=self.uri,
116
116
  path=v["Key"],
117
117
  etag=v.get("ETag", "").strip('"'),
118
118
  version=ClientS3.clean_s3_version(v.get("VersionId", "")),
@@ -125,8 +125,8 @@ class ClientS3(Client):
125
125
  self,
126
126
  prefix,
127
127
  pbar,
128
- result_queue,
129
- ):
128
+ result_queue: ResultQueue,
129
+ ) -> set[str]:
130
130
  if prefix:
131
131
  prefix = prefix.lstrip(DELIMITER) + DELIMITER
132
132
  files = []
@@ -141,7 +141,7 @@ class ClientS3(Client):
141
141
  if info["type"] == "directory":
142
142
  subdirs.add(subprefix)
143
143
  else:
144
- files.append(self.convert_info(info, subprefix))
144
+ files.append(self.info_to_file(info, subprefix))
145
145
  pbar.update()
146
146
  found = True
147
147
  if not found:
@@ -152,18 +152,8 @@ class ClientS3(Client):
152
152
  return subdirs
153
153
 
154
154
  @staticmethod
155
- def clean_s3_version(ver):
156
- return ver if ver != "null" else ""
157
-
158
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
159
- return Entry.from_file(
160
- path=path,
161
- etag=v.get("ETag", "").strip('"'),
162
- version=ClientS3.clean_s3_version(v.get("VersionId", "")),
163
- is_latest=v.get("IsLatest", True),
164
- last_modified=v.get("LastModified", ""),
165
- size=v["size"],
166
- )
155
+ def clean_s3_version(ver: Optional[str]) -> str:
156
+ return ver if (ver is not None and ver != "null") else ""
167
157
 
168
158
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
169
159
  return File(
@@ -43,6 +43,8 @@ if TYPE_CHECKING:
43
43
  from sqlalchemy.sql.elements import ColumnElement
44
44
  from sqlalchemy.types import TypeEngine
45
45
 
46
+ from datachain.lib.file import File
47
+
46
48
 
47
49
  logger = logging.getLogger("datachain")
48
50
 
@@ -58,6 +60,10 @@ quote_schema = sqlite_dialect.identifier_preparer.quote_schema
58
60
  quote = sqlite_dialect.identifier_preparer.quote
59
61
 
60
62
 
63
+ def _get_in_memory_uri():
64
+ return "file::memory:?cache=shared"
65
+
66
+
61
67
  def get_retry_sleep_sec(retry_count: int) -> int:
62
68
  return RETRY_START_SEC * (RETRY_FACTOR**retry_count)
63
69
 
@@ -119,7 +125,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
119
125
  if db_file == ":memory:":
120
126
  # Enable multithreaded usage of the same in-memory db
121
127
  db = sqlite3.connect(
122
- "file::memory:?cache=shared", uri=True, detect_types=DETECT_TYPES
128
+ _get_in_memory_uri(), uri=True, detect_types=DETECT_TYPES
123
129
  )
124
130
  else:
125
131
  db = sqlite3.connect(
@@ -704,6 +710,9 @@ class SQLiteWarehouse(AbstractWarehouse):
704
710
 
705
711
  self.db.execute(insert_query)
706
712
 
713
+ def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
714
+ return (e.model_dump() for e in entries)
715
+
707
716
  def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
708
717
  rows = list(rows)
709
718
  if not rows:
@@ -20,7 +20,7 @@ from datachain.client import Client
20
20
  from datachain.data_storage.schema import convert_rows_custom_column_types
21
21
  from datachain.data_storage.serializer import Serializable
22
22
  from datachain.dataset import DatasetRecord
23
- from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
23
+ from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
24
24
  from datachain.sql.functions import path as pathfunc
25
25
  from datachain.sql.types import Int, SQLType
26
26
  from datachain.storage import StorageURI
@@ -34,6 +34,7 @@ if TYPE_CHECKING:
34
34
  from datachain.data_storage import AbstractIDGenerator, schema
35
35
  from datachain.data_storage.db_engine import DatabaseEngine
36
36
  from datachain.data_storage.schema import DataTable
37
+ from datachain.lib.file import File
37
38
 
38
39
  try:
39
40
  import numpy as np
@@ -401,25 +402,18 @@ class AbstractWarehouse(ABC, Serializable):
401
402
  expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
402
403
  sa.func.count(table.c.sys__id),
403
404
  )
404
- if "file__size" in table.columns:
405
- expressions = (*expressions, sa.func.sum(table.c.file__size))
406
- elif "size" in table.columns:
407
- expressions = (*expressions, sa.func.sum(table.c.size))
405
+ size_columns = [
406
+ c for c in table.columns if c.name == "size" or c.name.endswith("__size")
407
+ ]
408
+ if size_columns:
409
+ expressions = (*expressions, sa.func.sum(sum(size_columns)))
408
410
  query = select(*expressions)
409
411
  ((nrows, *rest),) = self.db.execute(query)
410
- return nrows, rest[0] if rest else None
411
-
412
- def prepare_entries(
413
- self, uri: str, entries: Iterable[Entry]
414
- ) -> list[dict[str, Any]]:
415
- """
416
- Prepares bucket listing entry (row) for inserting into database
417
- """
418
-
419
- def _prepare_entry(entry: Entry):
420
- return attrs.asdict(entry) | {"source": uri}
412
+ return nrows, rest[0] if rest else 0
421
413
 
422
- return [_prepare_entry(e) for e in entries]
414
+ @abstractmethod
415
+ def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
416
+ """Convert File entries so they can be passed on to `insert_rows()`"""
423
417
 
424
418
  @abstractmethod
425
419
  def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
@@ -30,8 +30,7 @@ def list_bucket(uri: str, client_config=None) -> Callable:
30
30
  config = client_config or {}
31
31
  client, path = Client.parse_url(uri, None, **config) # type: ignore[arg-type]
32
32
  for entries in iter_over_async(client.scandir(path.rstrip("/")), get_loop()):
33
- for entry in entries:
34
- yield entry.to_file(client.uri)
33
+ yield from entries
35
34
 
36
35
  return list_func
37
36
 
@@ -1,6 +1,6 @@
1
1
  import inspect
2
2
  import logging
3
- from typing import ClassVar, Optional
3
+ from typing import Any, ClassVar, Optional
4
4
 
5
5
  from pydantic import BaseModel
6
6
 
@@ -69,7 +69,7 @@ class ModelStore:
69
69
  del cls.store[fr.__name__][version]
70
70
 
71
71
  @staticmethod
72
- def is_pydantic(val):
72
+ def is_pydantic(val: Any) -> bool:
73
73
  return (
74
74
  not hasattr(val, "__origin__")
75
75
  and inspect.isclass(val)
@@ -7,6 +7,7 @@ from torch import float32
7
7
  from torch.distributed import get_rank, get_world_size
8
8
  from torch.utils.data import IterableDataset, get_worker_info
9
9
  from torchvision.transforms import v2
10
+ from tqdm import tqdm
10
11
 
11
12
  from datachain.catalog import Catalog, get_catalog
12
13
  from datachain.lib.dc import DataChain
@@ -93,33 +94,38 @@ class PytorchDataset(IterableDataset):
93
94
  if self.num_samples > 0:
94
95
  ds = ds.sample(self.num_samples)
95
96
  ds = ds.chunk(total_rank, total_workers)
96
- for row_features in ds.collect():
97
- row = []
98
- for fr in row_features:
99
- if hasattr(fr, "read"):
100
- row.append(fr.read()) # type: ignore[unreachable]
101
- else:
102
- row.append(fr)
103
- # Apply transforms
104
- if self.transform:
105
- try:
106
- if isinstance(self.transform, v2.Transform):
107
- row = self.transform(row)
97
+ desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
98
+ with tqdm(desc=desc, unit=" rows") as pbar:
99
+ for row_features in ds.collect():
100
+ row = []
101
+ for fr in row_features:
102
+ if hasattr(fr, "read"):
103
+ row.append(fr.read()) # type: ignore[unreachable]
104
+ else:
105
+ row.append(fr)
106
+ # Apply transforms
107
+ if self.transform:
108
+ try:
109
+ if isinstance(self.transform, v2.Transform):
110
+ row = self.transform(row)
111
+ for i, val in enumerate(row):
112
+ if isinstance(val, Image.Image):
113
+ row[i] = self.transform(val)
114
+ except ValueError:
115
+ logger.warning(
116
+ "Skipping transform due to unsupported data types."
117
+ )
118
+ self.transform = None
119
+ if self.tokenizer:
108
120
  for i, val in enumerate(row):
109
- if isinstance(val, Image.Image):
110
- row[i] = self.transform(val)
111
- except ValueError:
112
- logger.warning("Skipping transform due to unsupported data types.")
113
- self.transform = None
114
- if self.tokenizer:
115
- for i, val in enumerate(row):
116
- if isinstance(val, str) or (
117
- isinstance(val, list) and isinstance(val[0], str)
118
- ):
119
- row[i] = convert_text(
120
- val, self.tokenizer, self.tokenizer_kwargs
121
- ).squeeze(0) # type: ignore[union-attr]
122
- yield row
121
+ if isinstance(val, str) or (
122
+ isinstance(val, list) and isinstance(val[0], str)
123
+ ):
124
+ row[i] = convert_text(
125
+ val, self.tokenizer, self.tokenizer_kwargs
126
+ ).squeeze(0) # type: ignore[union-attr]
127
+ yield row
128
+ pbar.update(1)
123
129
 
124
130
  @staticmethod
125
131
  def get_rank_and_workers() -> tuple[int, int]: