datachain 0.3.7__tar.gz → 0.3.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

Files changed (244)
  1. {datachain-0.3.7 → datachain-0.3.9}/.github/workflows/tests.yml +1 -1
  2. {datachain-0.3.7 → datachain-0.3.9}/.pre-commit-config.yaml +1 -1
  3. {datachain-0.3.7/src/datachain.egg-info → datachain-0.3.9}/PKG-INFO +19 -15
  4. {datachain-0.3.7 → datachain-0.3.9}/README.rst +11 -12
  5. {datachain-0.3.7 → datachain-0.3.9}/examples/llm_and_nlp/unstructured-text.py +1 -1
  6. {datachain-0.3.7 → datachain-0.3.9}/examples/multimodal/wds_filtered.py +1 -3
  7. {datachain-0.3.7 → datachain-0.3.9}/pyproject.toml +10 -4
  8. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/catalog/catalog.py +2 -92
  9. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/cli.py +0 -37
  10. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/arrow.py +5 -5
  11. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/clip.py +14 -3
  12. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/python_to_sql.py +9 -0
  13. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/data_model.py +10 -1
  14. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/dc.py +135 -39
  15. datachain-0.3.9/src/datachain/lib/hf.py +166 -0
  16. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/image.py +9 -1
  17. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/pytorch.py +1 -2
  18. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/signal_schema.py +124 -20
  19. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/text.py +4 -0
  20. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/udf.py +14 -20
  21. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/webdataset.py +1 -1
  22. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/dataset.py +24 -9
  23. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/session.py +5 -3
  24. {datachain-0.3.7 → datachain-0.3.9/src/datachain.egg-info}/PKG-INFO +19 -15
  25. {datachain-0.3.7 → datachain-0.3.9}/src/datachain.egg-info/SOURCES.txt +3 -4
  26. {datachain-0.3.7 → datachain-0.3.9}/src/datachain.egg-info/requires.txt +8 -2
  27. {datachain-0.3.7 → datachain-0.3.9}/tests/examples/wds_data.py +11 -11
  28. {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_catalog.py +30 -0
  29. {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_datasets.py +0 -127
  30. {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_feature_pickling.py +70 -0
  31. {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_pytorch.py +17 -2
  32. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/conftest.py +5 -2
  33. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_arrow.py +3 -3
  34. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_datachain.py +54 -0
  35. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_feature.py +3 -2
  36. datachain-0.3.9/tests/unit/lib/test_hf.py +132 -0
  37. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_signal_schema.py +92 -3
  38. datachain-0.3.7/examples/computer_vision/blip2_image_desc_lib.py +0 -100
  39. datachain-0.3.7/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -64
  40. datachain-0.3.7/examples/llm_and_nlp/llm-claude.py +0 -46
  41. {datachain-0.3.7 → datachain-0.3.9}/.cruft.json +0 -0
  42. {datachain-0.3.7 → datachain-0.3.9}/.gitattributes +0 -0
  43. {datachain-0.3.7 → datachain-0.3.9}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  44. {datachain-0.3.7 → datachain-0.3.9}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  45. {datachain-0.3.7 → datachain-0.3.9}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  46. {datachain-0.3.7 → datachain-0.3.9}/.github/codecov.yaml +0 -0
  47. {datachain-0.3.7 → datachain-0.3.9}/.github/dependabot.yml +0 -0
  48. {datachain-0.3.7 → datachain-0.3.9}/.github/workflows/benchmarks.yml +0 -0
  49. {datachain-0.3.7 → datachain-0.3.9}/.github/workflows/release.yml +0 -0
  50. {datachain-0.3.7 → datachain-0.3.9}/.github/workflows/tests-studio.yml +0 -0
  51. {datachain-0.3.7 → datachain-0.3.9}/.github/workflows/update-template.yaml +0 -0
  52. {datachain-0.3.7 → datachain-0.3.9}/.gitignore +0 -0
  53. {datachain-0.3.7 → datachain-0.3.9}/CODE_OF_CONDUCT.rst +0 -0
  54. {datachain-0.3.7 → datachain-0.3.9}/CONTRIBUTING.rst +0 -0
  55. {datachain-0.3.7 → datachain-0.3.9}/LICENSE +0 -0
  56. {datachain-0.3.7 → datachain-0.3.9}/docs/assets/captioned_cartoons.png +0 -0
  57. {datachain-0.3.7 → datachain-0.3.9}/docs/assets/datachain.png +0 -0
  58. {datachain-0.3.7 → datachain-0.3.9}/docs/assets/flowchart.png +0 -0
  59. {datachain-0.3.7 → datachain-0.3.9}/docs/index.md +0 -0
  60. {datachain-0.3.7 → datachain-0.3.9}/docs/references/datachain.md +0 -0
  61. {datachain-0.3.7 → datachain-0.3.9}/docs/references/datatype.md +0 -0
  62. {datachain-0.3.7 → datachain-0.3.9}/docs/references/file.md +0 -0
  63. {datachain-0.3.7 → datachain-0.3.9}/docs/references/index.md +0 -0
  64. {datachain-0.3.7 → datachain-0.3.9}/docs/references/sql.md +0 -0
  65. {datachain-0.3.7 → datachain-0.3.9}/docs/references/torch.md +0 -0
  66. {datachain-0.3.7 → datachain-0.3.9}/docs/references/udf.md +0 -0
  67. {datachain-0.3.7 → datachain-0.3.9}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  68. {datachain-0.3.7 → datachain-0.3.9}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  69. {datachain-0.3.7 → datachain-0.3.9}/examples/computer_vision/openimage-detect.py +0 -0
  70. {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/common_sql_functions.py +0 -0
  71. {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/json-csv-reader.py +0 -0
  72. {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/torch-loader.py +0 -0
  73. {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/udfs/parallel.py +0 -0
  74. {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/udfs/simple.py +0 -0
  75. {datachain-0.3.7 → datachain-0.3.9}/examples/get_started/udfs/stateful.py +0 -0
  76. /datachain-0.3.7/examples/llm_and_nlp/llm-claude-simple-query.py → /datachain-0.3.9/examples/llm_and_nlp/claude-query.py +0 -0
  77. {datachain-0.3.7 → datachain-0.3.9}/examples/multimodal/clip_inference.py +0 -0
  78. {datachain-0.3.7 → datachain-0.3.9}/examples/multimodal/hf_pipeline.py +0 -0
  79. {datachain-0.3.7 → datachain-0.3.9}/examples/multimodal/openai_image_desc_lib.py +0 -0
  80. {datachain-0.3.7 → datachain-0.3.9}/examples/multimodal/wds.py +0 -0
  81. {datachain-0.3.7 → datachain-0.3.9}/mkdocs.yml +0 -0
  82. {datachain-0.3.7 → datachain-0.3.9}/noxfile.py +0 -0
  83. {datachain-0.3.7 → datachain-0.3.9}/setup.cfg +0 -0
  84. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/__init__.py +0 -0
  85. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/__main__.py +0 -0
  86. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/asyn.py +0 -0
  87. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/cache.py +0 -0
  88. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/catalog/__init__.py +0 -0
  89. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/catalog/datasource.py +0 -0
  90. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/catalog/loader.py +0 -0
  91. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/catalog/subclass.py +0 -0
  92. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/cli_utils.py +0 -0
  93. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/__init__.py +0 -0
  94. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/azure.py +0 -0
  95. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/fileslice.py +0 -0
  96. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/fsspec.py +0 -0
  97. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/gcs.py +0 -0
  98. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/local.py +0 -0
  99. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/client/s3.py +0 -0
  100. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/config.py +0 -0
  101. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/__init__.py +0 -0
  102. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/db_engine.py +0 -0
  103. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/id_generator.py +0 -0
  104. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/job.py +0 -0
  105. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/metastore.py +0 -0
  106. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/schema.py +0 -0
  107. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/serializer.py +0 -0
  108. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/sqlite.py +0 -0
  109. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/data_storage/warehouse.py +0 -0
  110. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/dataset.py +0 -0
  111. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/error.py +0 -0
  112. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/job.py +0 -0
  113. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/__init__.py +0 -0
  114. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/__init__.py +0 -0
  115. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/flatten.py +0 -0
  116. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/sql_to_python.py +0 -0
  117. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/unflatten.py +0 -0
  118. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  119. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/dataset_info.py +0 -0
  120. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/file.py +0 -0
  121. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/listing.py +0 -0
  122. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/meta_formats.py +0 -0
  123. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/model_store.py +0 -0
  124. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/settings.py +0 -0
  125. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/udf_signature.py +0 -0
  126. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/utils.py +0 -0
  127. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/vfile.py +0 -0
  128. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/lib/webdataset_laion.py +0 -0
  129. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/listing.py +0 -0
  130. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/node.py +0 -0
  131. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/nodes_fetcher.py +0 -0
  132. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/nodes_thread_pool.py +0 -0
  133. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/progress.py +0 -0
  134. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/py.typed +0 -0
  135. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/__init__.py +0 -0
  136. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/batch.py +0 -0
  137. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/builtins.py +0 -0
  138. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/dispatch.py +0 -0
  139. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/metrics.py +0 -0
  140. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/params.py +0 -0
  141. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/queue.py +0 -0
  142. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/schema.py +0 -0
  143. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/query/udf.py +0 -0
  144. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/remote/__init__.py +0 -0
  145. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/remote/studio.py +0 -0
  146. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/__init__.py +0 -0
  147. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/default/__init__.py +0 -0
  148. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/default/base.py +0 -0
  149. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/__init__.py +0 -0
  150. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/array.py +0 -0
  151. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/conditional.py +0 -0
  152. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/path.py +0 -0
  153. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/random.py +0 -0
  154. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/functions/string.py +0 -0
  155. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/selectable.py +0 -0
  156. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/sqlite/__init__.py +0 -0
  157. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/sqlite/base.py +0 -0
  158. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/sqlite/types.py +0 -0
  159. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/sqlite/vector.py +0 -0
  160. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/types.py +0 -0
  161. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/sql/utils.py +0 -0
  162. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/storage.py +0 -0
  163. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/torch/__init__.py +0 -0
  164. {datachain-0.3.7 → datachain-0.3.9}/src/datachain/utils.py +0 -0
  165. {datachain-0.3.7 → datachain-0.3.9}/src/datachain.egg-info/dependency_links.txt +0 -0
  166. {datachain-0.3.7 → datachain-0.3.9}/src/datachain.egg-info/entry_points.txt +0 -0
  167. {datachain-0.3.7 → datachain-0.3.9}/src/datachain.egg-info/top_level.txt +0 -0
  168. {datachain-0.3.7 → datachain-0.3.9}/tests/__init__.py +0 -0
  169. {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/__init__.py +0 -0
  170. {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/conftest.py +0 -0
  171. {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  172. {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/datasets/.dvc/config +0 -0
  173. {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/datasets/.gitignore +0 -0
  174. {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  175. {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/test_datachain.py +0 -0
  176. {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/test_ls.py +0 -0
  177. {datachain-0.3.7 → datachain-0.3.9}/tests/benchmarks/test_version.py +0 -0
  178. {datachain-0.3.7 → datachain-0.3.9}/tests/conftest.py +0 -0
  179. {datachain-0.3.7 → datachain-0.3.9}/tests/data.py +0 -0
  180. {datachain-0.3.7 → datachain-0.3.9}/tests/examples/__init__.py +0 -0
  181. {datachain-0.3.7 → datachain-0.3.9}/tests/examples/test_examples.py +0 -0
  182. {datachain-0.3.7 → datachain-0.3.9}/tests/examples/test_wds_e2e.py +0 -0
  183. {datachain-0.3.7 → datachain-0.3.9}/tests/func/__init__.py +0 -0
  184. {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_client.py +0 -0
  185. {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_datachain.py +0 -0
  186. {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_dataset_query.py +0 -0
  187. {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_listing.py +0 -0
  188. {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_ls.py +0 -0
  189. {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_pull.py +0 -0
  190. {datachain-0.3.7 → datachain-0.3.9}/tests/func/test_query.py +0 -0
  191. {datachain-0.3.7 → datachain-0.3.9}/tests/scripts/feature_class.py +0 -0
  192. {datachain-0.3.7 → datachain-0.3.9}/tests/scripts/feature_class_parallel.py +0 -0
  193. {datachain-0.3.7 → datachain-0.3.9}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  194. {datachain-0.3.7 → datachain-0.3.9}/tests/scripts/name_len_slow.py +0 -0
  195. {datachain-0.3.7 → datachain-0.3.9}/tests/test_cli_e2e.py +0 -0
  196. {datachain-0.3.7 → datachain-0.3.9}/tests/test_query_e2e.py +0 -0
  197. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/__init__.py +0 -0
  198. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/__init__.py +0 -0
  199. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_clip.py +0 -0
  200. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  201. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_datachain_merge.py +0 -0
  202. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_feature_utils.py +0 -0
  203. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_file.py +0 -0
  204. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_image.py +0 -0
  205. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_schema.py +0 -0
  206. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_sql_to_python.py +0 -0
  207. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_text.py +0 -0
  208. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_udf_signature.py +0 -0
  209. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_utils.py +0 -0
  210. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/lib/test_webdataset.py +0 -0
  211. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/__init__.py +0 -0
  212. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/sqlite/__init__.py +0 -0
  213. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/sqlite/test_utils.py +0 -0
  214. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_array.py +0 -0
  215. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_conditional.py +0 -0
  216. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_path.py +0 -0
  217. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_random.py +0 -0
  218. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_selectable.py +0 -0
  219. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/sql/test_string.py +0 -0
  220. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_asyn.py +0 -0
  221. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_cache.py +0 -0
  222. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_catalog.py +0 -0
  223. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_catalog_loader.py +0 -0
  224. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_cli_parsing.py +0 -0
  225. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_client.py +0 -0
  226. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_client_s3.py +0 -0
  227. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_data_storage.py +0 -0
  228. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_database_engine.py +0 -0
  229. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_dataset.py +0 -0
  230. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_dispatch.py +0 -0
  231. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_fileslice.py +0 -0
  232. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_id_generator.py +0 -0
  233. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_listing.py +0 -0
  234. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_metastore.py +0 -0
  235. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_module_exports.py +0 -0
  236. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_query_metrics.py +0 -0
  237. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_query_params.py +0 -0
  238. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_serializer.py +0 -0
  239. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_session.py +0 -0
  240. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_storage.py +0 -0
  241. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_udf.py +0 -0
  242. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_utils.py +0 -0
  243. {datachain-0.3.7 → datachain-0.3.9}/tests/unit/test_warehouse.py +0 -0
  244. {datachain-0.3.7 → datachain-0.3.9}/tests/utils.py +0 -0

--- datachain-0.3.7/.github/workflows/tests.yml
+++ datachain-0.3.9/.github/workflows/tests.yml
@@ -50,7 +50,7 @@ jobs:
         run: nox -s lint

   datachain:
-    timeout-minutes: 25
+    timeout-minutes: 40
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false

--- datachain-0.3.7/.pre-commit-config.yaml
+++ datachain-0.3.9/.pre-commit-config.yaml
@@ -24,7 +24,7 @@ repos:
       - id: trailing-whitespace
         exclude: '^LICENSES/'
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.6.1'
+    rev: 'v0.6.2'
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]

--- datachain-0.3.7/src/datachain.egg-info/PKG-INFO
+++ datachain-0.3.9/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.7
+Version: 0.3.9
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -42,6 +42,7 @@ Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<11,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
+Requires-Dist: psutil
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -58,8 +59,11 @@ Requires-Dist: lz4; extra == "remote"
 Requires-Dist: requests>=2.22.0; extra == "remote"
 Provides-Extra: vector
 Requires-Dist: usearch; extra == "vector"
+Provides-Extra: hf
+Requires-Dist: numba>=0.60.0; extra == "hf"
+Requires-Dist: datasets[audio,vision]; extra == "hf"
 Provides-Extra: tests
-Requires-Dist: datachain[remote,torch,vector]; extra == "tests"
+Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
 Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
@@ -74,9 +78,10 @@ Requires-Dist: hypothesis; extra == "tests"
 Requires-Dist: open_clip_torch; extra == "tests"
 Requires-Dist: aiotools>=1.7.0; extra == "tests"
 Requires-Dist: requests-mock; extra == "tests"
+Requires-Dist: scipy; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.11.1; extra == "dev"
+Requires-Dist: mypy==1.11.2; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
@@ -110,31 +115,30 @@ AI 🔗 DataChain

 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.

 Key Features
 ============

 📂 **Storage as a Source of Truth.**
-  - Process unstructured data without redundant copies: S3, GCP, Azure, and local
+  - Process unstructured data without redundant copies from S3, GCP, Azure, and local
     file systems.
-  - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-  - Join files and metadata together into persistent, versioned, columnar datasets.
+  - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+  - Unite files and metadata together into persistent, versioned, columnar datasets.

 🐍 **Python-friendly data pipelines.**
   - Operate on Python objects and object fields.
-  - Built-in parallelization and out-of-memory compute without a need in SQL or
-    Spark jobs.
+  - Built-in parallelization and out-of-memory compute without SQL or Spark.

 🧠 **Data Enrichment and Processing.**
-  - Generate metadata columns using local AI models and LLM APIs.
-  - Filter, join, and group by AI metadata. Vector similarity search.
-  - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+  - Generate metadata using local AI models and LLM APIs.
+  - Filter, join, and group by metadata. Search by vector embeddings.
+  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.

 🚀 **Efficiency.**
   - Parallelization, out-of-memory workloads and data caching.
   - Vectorized operations on Python object fields: sum, count, avg, etc.
-  - Vector search on embeddings.
+  - Optimized vector search.


 Quick Start
@@ -159,7 +163,7 @@ where each image has a matching JSON file like `cat.1009.json`:
    "inference": {"class": "dog", "confidence": 0.68}
 }

-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:


 .. code:: py
@@ -229,7 +233,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================

-LLMs can work as efficient universal classifiers. In the example below,
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai

--- datachain-0.3.7/README.rst
+++ datachain-0.3.9/README.rst
@@ -18,31 +18,30 @@ AI 🔗 DataChain

 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.

 Key Features
 ============

 📂 **Storage as a Source of Truth.**
-  - Process unstructured data without redundant copies: S3, GCP, Azure, and local
+  - Process unstructured data without redundant copies from S3, GCP, Azure, and local
     file systems.
-  - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-  - Join files and metadata together into persistent, versioned, columnar datasets.
+  - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+  - Unite files and metadata together into persistent, versioned, columnar datasets.

 🐍 **Python-friendly data pipelines.**
   - Operate on Python objects and object fields.
-  - Built-in parallelization and out-of-memory compute without a need in SQL or
-    Spark jobs.
+  - Built-in parallelization and out-of-memory compute without SQL or Spark.

 🧠 **Data Enrichment and Processing.**
-  - Generate metadata columns using local AI models and LLM APIs.
-  - Filter, join, and group by AI metadata. Vector similarity search.
-  - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+  - Generate metadata using local AI models and LLM APIs.
+  - Filter, join, and group by metadata. Search by vector embeddings.
+  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.

 🚀 **Efficiency.**
   - Parallelization, out-of-memory workloads and data caching.
   - Vectorized operations on Python object fields: sum, count, avg, etc.
-  - Vector search on embeddings.
+  - Optimized vector search.


 Quick Start
@@ -67,7 +66,7 @@ where each image has a matching JSON file like `cat.1009.json`:
    "inference": {"class": "dog", "confidence": 0.68}
 }

-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:


 .. code:: py
@@ -137,7 +136,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================

-LLMs can work as efficient universal classifiers. In the example below,
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai

--- datachain-0.3.7/examples/llm_and_nlp/unstructured-text.py
+++ datachain-0.3.9/examples/llm_and_nlp/unstructured-text.py
@@ -1,5 +1,5 @@
 #
-# pip install unstructured[pdf] nltk==3.8.1 huggingface_hub[hf_transfer]
+# pip install unstructured[pdf] huggingface_hub[hf_transfer]
 #
 import os


--- datachain-0.3.7/examples/multimodal/wds_filtered.py
+++ datachain-0.3.9/examples/multimodal/wds_filtered.py
@@ -1,13 +1,11 @@
 import datachain.error
 from datachain import C, DataChain
-from datachain.lib.model_store import ModelStore
 from datachain.lib.webdataset import process_webdataset
-from datachain.lib.webdataset_laion import LaionMeta, WDSLaion
+from datachain.lib.webdataset_laion import WDSLaion
 from datachain.sql import literal
 from datachain.sql.functions import array, greatest, least, string

 name = "wds"
-ModelStore.register(LaionMeta)
 try:
     wds = DataChain.from_dataset(name=name)
 except datachain.error.DatasetNotFoundError:

--- datachain-0.3.7/pyproject.toml
+++ datachain-0.3.9/pyproject.toml
@@ -44,7 +44,8 @@ dependencies = [
   "jmespath>=1.0",
   "datamodel-code-generator>=0.25",
   "Pillow>=10.0.0,<11",
-  "msgpack>=1.0.4,<2"
+  "msgpack>=1.0.4,<2",
+  "psutil"
 ]

 [project.optional-dependencies]
@@ -68,8 +69,12 @@ remote = [
 vector = [
   "usearch"
 ]
+hf = [
+  "numba>=0.60.0",
+  "datasets[audio,vision]"
+]
 tests = [
-  "datachain[torch,remote,vector]",
+  "datachain[torch,remote,vector,hf]",
   "pytest>=8,<9",
   "pytest-sugar>=0.9.6",
   "pytest-cov>=4.1.0",
@@ -83,11 +88,12 @@ tests = [
   "hypothesis",
   "open_clip_torch",
   "aiotools>=1.7.0",
-  "requests-mock"
+  "requests-mock",
+  "scipy"
 ]
 dev = [
   "datachain[docs,tests]",
-  "mypy==1.11.1",
+  "mypy==1.11.2",
   "types-python-dateutil",
   "types-pytz",
   "types-PyYAML",

--- datachain-0.3.7/src/datachain/catalog/catalog.py
+++ datachain-0.3.9/src/datachain/catalog/catalog.py
@@ -1540,87 +1540,6 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.update_dataset(dataset, **update_data)

-    def merge_datasets(
-        self,
-        src: DatasetRecord,
-        dst: DatasetRecord,
-        src_version: int,
-        dst_version: Optional[int] = None,
-    ) -> DatasetRecord:
-        """
-        Merges records from source to destination dataset.
-        It will create new version
-        of a dataset with records merged from old version and the source, unless
-        existing version is specified for destination in which case it must
-        be in non final status as datasets are immutable
-        """
-        if (
-            dst_version
-            and not dst.is_valid_next_version(dst_version)
-            and dst.get_version(dst_version).is_final_status()
-        ):
-            raise DatasetInvalidVersionError(
-                f"Version {dst_version} must be higher than the current latest one"
-            )
-
-        src_dep = self.get_dataset_dependencies(src.name, src_version)
-        dst_dep = self.get_dataset_dependencies(
-            dst.name,
-            dst.latest_version,  # type: ignore[arg-type]
-        )
-
-        if dst.has_version(dst_version):  # type: ignore[arg-type]
-            # case where we don't create new version, but append to the existing one
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version=dst_version,  # type: ignore[arg-type]
-            )
-            merged_schema = src.serialized_schema | dst.serialized_schema
-            self.update_dataset(dst, schema=merged_schema)
-            self.update_dataset_version_with_warehouse_info(
-                dst,
-                dst_version,  # type: ignore[arg-type]
-                schema=merged_schema,
-            )
-            for dep in src_dep:
-                if dep and dep not in dst_dep:
-                    self.metastore.add_dependency(
-                        dep,
-                        dst.name,
-                        dst_version,  # type: ignore[arg-type]
-                    )
-        else:
-            # case where we create new version of merged results
-            src_dr = self.warehouse.dataset_rows(src, src_version)
-            dst_dr = self.warehouse.dataset_rows(dst)
-
-            merge_result_columns = list(
-                {
-                    c.name: c for c in list(src_dr.table.c) + list(dst_dr.table.c)
-                }.values()
-            )
-
-            dst_version = dst_version or dst.next_version
-            dst = self.create_new_dataset_version(
-                dst,
-                dst_version,
-                columns=merge_result_columns,
-            )
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version,
-            )
-            self.update_dataset_version_with_warehouse_info(dst, dst_version)
-            for dep in set(src_dep + dst_dep):
-                if dep:
-                    self.metastore.add_dependency(dep, dst.name, dst_version)
-
-        return dst
-
     def get_file_signals(
         self, dataset_name: str, dataset_version: int, row: RowDict
     ) -> Optional[dict]:
@@ -1641,17 +1560,8 @@ class Catalog:
         version = self.get_dataset(dataset_name).get_version(dataset_version)

         file_signals_values = {}
-        file_schemas = {}
-        # TODO: To remove after we properly fix deserialization
-        for signal, type_name in version.feature_schema.items():
-            from datachain.lib.model_store import ModelStore
-
-            type_name_parsed, v = ModelStore.parse_name_version(type_name)
-            fr = ModelStore.get(type_name_parsed, v)
-            if fr and issubclass(fr, File):
-                file_schemas[signal] = type_name

-        schema = SignalSchema.deserialize(file_schemas)
+        schema = SignalSchema.deserialize(version.feature_schema)
         for file_signals in schema.get_signals(File):
             prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
             file_signals_values[file_signals] = {
@@ -1997,7 +1907,7 @@ class Catalog:
         """
         from datachain.query.dataset import ExecutionResult

-        feature_file = tempfile.NamedTemporaryFile(
+        feature_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
            dir=os.getcwd(), suffix=".py", delete=False
        )
        _, feature_module = os.path.split(feature_file.name)

--- datachain-0.3.7/src/datachain/cli.py
+++ datachain-0.3.9/src/datachain/cli.py
@@ -336,36 +336,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
         help="Display size using powers of 1000 not 1024",
     )

-    parse_merge_datasets = subp.add_parser(
-        "merge-datasets", parents=[parent_parser], description="Merges datasets"
-    )
-    parse_merge_datasets.add_argument(
-        "--src",
-        action="store",
-        default=None,
-        help="Source dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst",
-        action="store",
-        default=None,
-        help="Destination dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--src-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Source dataset version",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Destination dataset version",
-    )
-
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents"
     )
@@ -996,13 +966,6 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
             new_name=args.new_name,
             labels=args.labels,
         )
-    elif args.command == "merge-datasets":
-        catalog.merge_datasets(
-            catalog.get_dataset(args.src),
-            catalog.get_dataset(args.dst),
-            args.src_version,
-            dst_version=args.dst_version,
-        )
     elif args.command == "ls":
         ls(
             args.sources,
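
Both the Catalog.merge_datasets API and its merge-datasets CLI entry point are removed in this release, with no direct replacement shown in this diff. The closest remaining path appears to be chain-level composition; a hedged sketch against the public DataChain API (dataset names are illustrative, and whether union matches the old append semantics exactly is an assumption):

    from datachain import DataChain

    src = DataChain.from_dataset(name="source_ds")
    dst = DataChain.from_dataset(name="dest_ds")
    dst.union(src).save("dest_ds")  # saves the combined rows as a new dataset version
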

--- datachain-0.3.7/src/datachain/lib/arrow.py
+++ datachain-0.3.9/src/datachain/lib/arrow.py
@@ -95,7 +95,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
         if not column:
             column = f"c{default_column}"
             default_column += 1
-        dtype = _arrow_type_mapper(field.type)  # type: ignore[assignment]
+        dtype = arrow_type_mapper(field.type)  # type: ignore[assignment]
         if field.nullable:
             dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
@@ -103,7 +103,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
     return output


-def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
+def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime

@@ -122,16 +122,16 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type: # noqa: PLR0911
     if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
         return str
     if pa.types.is_list(col_type):
-        return list[_arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
+        return list[arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
     if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
-        return _arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
+        return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
     raise TypeError(f"{col_type!r} datatypes not supported")


 def _nrows_file(file: File, nrows: int) -> str:
-    tf = NamedTemporaryFile(delete=False)
+    tf = NamedTemporaryFile(delete=False)  # noqa: SIM115
     with file.open(mode="r") as reader:
         with open(tf.name, "a") as writer:
             for row, line in enumerate(reader):
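
With the leading underscore dropped, the pyarrow type mapper becomes part of the module's public surface. A small sketch grounded in the mappings visible in this hunk:

    import pyarrow as pa

    from datachain.lib.arrow import arrow_type_mapper

    assert arrow_type_mapper(pa.string()) is str
    assert arrow_type_mapper(pa.list_(pa.string())) == list[str]  # recurses into value_type
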

--- datachain-0.3.7/src/datachain/lib/clip.py
+++ datachain-0.3.9/src/datachain/lib/clip.py
@@ -1,5 +1,5 @@
 import inspect
-from typing import TYPE_CHECKING, Any, Callable, Literal, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union

 import torch
 from transformers.modeling_utils import PreTrainedModel
@@ -39,6 +39,7 @@ def clip_similarity_scores(
     tokenizer: Callable,
     prob: bool = False,
     image_to_text: bool = True,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> list[list[float]]:
     """
     Calculate CLIP similarity scores between one or more images and/or text.
@@ -52,6 +53,7 @@ def clip_similarity_scores(
         prob : Compute softmax probabilities.
         image_to_text : Whether to compute for image-to-text or text-to-image. Ignored
             if only one of images or text provided.
+        device : Device to use. Defaults is None - use model's device.


     Example:
@@ -130,17 +132,26 @@ def clip_similarity_scores(
     ```
     """

+    if device is None:
+        if hasattr(model, "device"):
+            device = model.device
+        else:
+            device = next(model.parameters()).device
+    else:
+        model = model.to(device)
     with torch.no_grad():
         if images is not None:
             encoder = _get_encoder(model, "image")
             image_features = convert_images(
-                images, transform=preprocess, encoder=encoder
+                images, transform=preprocess, encoder=encoder, device=device
             )
             image_features /= image_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]

         if text is not None:
             encoder = _get_encoder(model, "text")
-            text_features = convert_text(text, tokenizer, encoder=encoder)
+            text_features = convert_text(
+                text, tokenizer, encoder=encoder, device=device
+            )
             text_features /= text_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]

         if images is not None and text is not None:
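
The new device argument lets callers pin CLIP encoding to a specific device; when left as None, the model's own placement is used. A hedged usage sketch (the open_clip checkpoint, the image path, and the keyword names of the earlier parameters are assumptions inferred from the docstring):

    import open_clip
    from PIL import Image

    from datachain.lib.clip import clip_similarity_scores

    model, _, preprocess = open_clip.create_model_and_transforms(
        "ViT-B-32", pretrained="laion2b_s34b_b79k"
    )
    tokenizer = open_clip.get_tokenizer("ViT-B-32")

    scores = clip_similarity_scores(
        images=Image.open("cat.jpg"),
        text="a photo of a cat",
        model=model,
        preprocess=preprocess,
        tokenizer=tokenizer,
        prob=True,
        device="cuda",  # new in 0.3.9; None keeps the model's current device
    )
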

--- datachain-0.3.7/src/datachain/lib/convert/python_to_sql.py
+++ datachain-0.3.9/src/datachain/lib/convert/python_to_sql.py
@@ -73,6 +73,9 @@ def python_to_sql(typ): # noqa: PLR0911
     if len(args) == 2 and (type(None) in args):
         return python_to_sql(args[0])

+    if _is_union_str_literal(orig, args):
+        return String
+
     if _is_json_inside_union(orig, args):
         return JSON

@@ -94,3 +97,9 @@ def _is_json_inside_union(orig, args) -> bool:
     if any(inspect.isclass(arg) and issubclass(arg, BaseModel) for arg in args):
         return True
     return False
+
+
+def _is_union_str_literal(orig, args) -> bool:
+    if orig != Union:
+        return False
+    return all(arg is str or get_origin(arg) in (Literal, LiteralEx) for arg in args)
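
In effect, a Union whose members are all str or Literal values is now stored as a String column instead of falling through to the JSON branch. An illustrative signal class (the model itself is hypothetical):

    from typing import Literal, Union

    from datachain.lib.data_model import DataModel

    class Review(DataModel):  # hypothetical example model
        # an all-str/Literal union now maps to String rather than JSON
        sentiment: Union[Literal["positive", "negative"], str]
        text: str
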

--- datachain-0.3.7/src/datachain/lib/data_model.py
+++ datachain-0.3.9/src/datachain/lib/data_model.py
@@ -2,7 +2,7 @@ from collections.abc import Sequence
 from datetime import datetime
 from typing import ClassVar, Union, get_args, get_origin

-from pydantic import BaseModel
+from pydantic import BaseModel, create_model

 from datachain.lib.model_store import ModelStore

@@ -57,3 +57,12 @@ def is_chain_type(t: type) -> bool:
         return is_chain_type(args[0])

     return False
+
+
+def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
+    fields = {name: (anno, ...) for name, anno in data_dict.items()}
+    return create_model(
+        name,
+        __base__=(DataModel,),  # type: ignore[call-overload]
+        **fields,
+    )  # type: ignore[call-overload]
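
A quick sketch of the new helper: it builds a DataModel subclass with required fields from a plain name-to-type mapping (the model name and fields are illustrative):

    from datachain.lib.data_model import dict_to_data_model

    Prediction = dict_to_data_model("Prediction", {"label": str, "confidence": float})
    p = Prediction(label="cat", confidence=0.95)  # both fields are required, per (anno, ...)
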