datachain 0.3.6__tar.gz → 0.3.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (244)
  1. {datachain-0.3.6 → datachain-0.3.8}/.github/workflows/tests.yml +1 -1
  2. {datachain-0.3.6/src/datachain.egg-info → datachain-0.3.8}/PKG-INFO +8 -3
  3. {datachain-0.3.6 → datachain-0.3.8}/pyproject.toml +10 -4
  4. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/catalog/catalog.py +0 -81
  5. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/cli.py +0 -37
  6. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/schema.py +1 -1
  7. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/sqlite.py +1 -10
  8. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/warehouse.py +12 -5
  9. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/arrow.py +4 -4
  10. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/clip.py +14 -3
  11. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/convert/python_to_sql.py +9 -0
  12. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/data_model.py +10 -1
  13. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/dc.py +95 -30
  14. datachain-0.3.8/src/datachain/lib/hf.py +166 -0
  15. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/image.py +9 -1
  16. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/pytorch.py +1 -2
  17. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/signal_schema.py +124 -20
  18. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/text.py +4 -0
  19. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/udf.py +14 -20
  20. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/query/dataset.py +10 -3
  21. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/query/session.py +5 -3
  22. {datachain-0.3.6 → datachain-0.3.8/src/datachain.egg-info}/PKG-INFO +8 -3
  23. {datachain-0.3.6 → datachain-0.3.8}/src/datachain.egg-info/SOURCES.txt +3 -4
  24. {datachain-0.3.6 → datachain-0.3.8}/src/datachain.egg-info/requires.txt +8 -2
  25. {datachain-0.3.6 → datachain-0.3.8}/tests/examples/wds_data.py +11 -11
  26. {datachain-0.3.6 → datachain-0.3.8}/tests/func/test_datasets.py +0 -127
  27. {datachain-0.3.6 → datachain-0.3.8}/tests/func/test_feature_pickling.py +70 -0
  28. {datachain-0.3.6 → datachain-0.3.8}/tests/func/test_pytorch.py +17 -2
  29. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/conftest.py +5 -2
  30. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_arrow.py +3 -3
  31. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_datachain.py +19 -0
  32. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_feature.py +3 -2
  33. datachain-0.3.8/tests/unit/lib/test_hf.py +132 -0
  34. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_signal_schema.py +92 -3
  35. datachain-0.3.6/examples/computer_vision/blip2_image_desc_lib.py +0 -100
  36. datachain-0.3.6/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -64
  37. datachain-0.3.6/examples/llm_and_nlp/llm-claude.py +0 -46
  38. {datachain-0.3.6 → datachain-0.3.8}/.cruft.json +0 -0
  39. {datachain-0.3.6 → datachain-0.3.8}/.gitattributes +0 -0
  40. {datachain-0.3.6 → datachain-0.3.8}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  41. {datachain-0.3.6 → datachain-0.3.8}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  42. {datachain-0.3.6 → datachain-0.3.8}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  43. {datachain-0.3.6 → datachain-0.3.8}/.github/codecov.yaml +0 -0
  44. {datachain-0.3.6 → datachain-0.3.8}/.github/dependabot.yml +0 -0
  45. {datachain-0.3.6 → datachain-0.3.8}/.github/workflows/benchmarks.yml +0 -0
  46. {datachain-0.3.6 → datachain-0.3.8}/.github/workflows/release.yml +0 -0
  47. {datachain-0.3.6 → datachain-0.3.8}/.github/workflows/tests-studio.yml +0 -0
  48. {datachain-0.3.6 → datachain-0.3.8}/.github/workflows/update-template.yaml +0 -0
  49. {datachain-0.3.6 → datachain-0.3.8}/.gitignore +0 -0
  50. {datachain-0.3.6 → datachain-0.3.8}/.pre-commit-config.yaml +0 -0
  51. {datachain-0.3.6 → datachain-0.3.8}/CODE_OF_CONDUCT.rst +0 -0
  52. {datachain-0.3.6 → datachain-0.3.8}/CONTRIBUTING.rst +0 -0
  53. {datachain-0.3.6 → datachain-0.3.8}/LICENSE +0 -0
  54. {datachain-0.3.6 → datachain-0.3.8}/README.rst +0 -0
  55. {datachain-0.3.6 → datachain-0.3.8}/docs/assets/captioned_cartoons.png +0 -0
  56. {datachain-0.3.6 → datachain-0.3.8}/docs/assets/datachain.png +0 -0
  57. {datachain-0.3.6 → datachain-0.3.8}/docs/assets/flowchart.png +0 -0
  58. {datachain-0.3.6 → datachain-0.3.8}/docs/index.md +0 -0
  59. {datachain-0.3.6 → datachain-0.3.8}/docs/references/datachain.md +0 -0
  60. {datachain-0.3.6 → datachain-0.3.8}/docs/references/datatype.md +0 -0
  61. {datachain-0.3.6 → datachain-0.3.8}/docs/references/file.md +0 -0
  62. {datachain-0.3.6 → datachain-0.3.8}/docs/references/index.md +0 -0
  63. {datachain-0.3.6 → datachain-0.3.8}/docs/references/sql.md +0 -0
  64. {datachain-0.3.6 → datachain-0.3.8}/docs/references/torch.md +0 -0
  65. {datachain-0.3.6 → datachain-0.3.8}/docs/references/udf.md +0 -0
  66. {datachain-0.3.6 → datachain-0.3.8}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  67. {datachain-0.3.6 → datachain-0.3.8}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  68. {datachain-0.3.6 → datachain-0.3.8}/examples/computer_vision/openimage-detect.py +0 -0
  69. {datachain-0.3.6 → datachain-0.3.8}/examples/get_started/common_sql_functions.py +0 -0
  70. {datachain-0.3.6 → datachain-0.3.8}/examples/get_started/json-csv-reader.py +0 -0
  71. {datachain-0.3.6 → datachain-0.3.8}/examples/get_started/torch-loader.py +0 -0
  72. {datachain-0.3.6 → datachain-0.3.8}/examples/get_started/udfs/parallel.py +0 -0
  73. {datachain-0.3.6 → datachain-0.3.8}/examples/get_started/udfs/simple.py +0 -0
  74. {datachain-0.3.6 → datachain-0.3.8}/examples/get_started/udfs/stateful.py +0 -0
  75. /datachain-0.3.6/examples/llm_and_nlp/llm-claude-simple-query.py → /datachain-0.3.8/examples/llm_and_nlp/claude-query.py +0 -0
  76. {datachain-0.3.6 → datachain-0.3.8}/examples/llm_and_nlp/unstructured-text.py +0 -0
  77. {datachain-0.3.6 → datachain-0.3.8}/examples/multimodal/clip_inference.py +0 -0
  78. {datachain-0.3.6 → datachain-0.3.8}/examples/multimodal/hf_pipeline.py +0 -0
  79. {datachain-0.3.6 → datachain-0.3.8}/examples/multimodal/openai_image_desc_lib.py +0 -0
  80. {datachain-0.3.6 → datachain-0.3.8}/examples/multimodal/wds.py +0 -0
  81. {datachain-0.3.6 → datachain-0.3.8}/examples/multimodal/wds_filtered.py +0 -0
  82. {datachain-0.3.6 → datachain-0.3.8}/mkdocs.yml +0 -0
  83. {datachain-0.3.6 → datachain-0.3.8}/noxfile.py +0 -0
  84. {datachain-0.3.6 → datachain-0.3.8}/setup.cfg +0 -0
  85. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/__init__.py +0 -0
  86. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/__main__.py +0 -0
  87. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/asyn.py +0 -0
  88. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/cache.py +0 -0
  89. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/catalog/__init__.py +0 -0
  90. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/catalog/datasource.py +0 -0
  91. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/catalog/loader.py +0 -0
  92. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/catalog/subclass.py +0 -0
  93. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/cli_utils.py +0 -0
  94. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/client/__init__.py +0 -0
  95. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/client/azure.py +0 -0
  96. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/client/fileslice.py +0 -0
  97. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/client/fsspec.py +0 -0
  98. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/client/gcs.py +0 -0
  99. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/client/local.py +0 -0
  100. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/client/s3.py +0 -0
  101. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/config.py +0 -0
  102. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/__init__.py +0 -0
  103. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/db_engine.py +0 -0
  104. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/id_generator.py +0 -0
  105. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/job.py +0 -0
  106. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/metastore.py +0 -0
  107. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/data_storage/serializer.py +0 -0
  108. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/dataset.py +0 -0
  109. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/error.py +0 -0
  110. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/job.py +0 -0
  111. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/__init__.py +0 -0
  112. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/convert/__init__.py +0 -0
  113. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/convert/flatten.py +0 -0
  114. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/convert/sql_to_python.py +0 -0
  115. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/convert/unflatten.py +0 -0
  116. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  117. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/dataset_info.py +0 -0
  118. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/file.py +0 -0
  119. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/listing.py +0 -0
  120. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/meta_formats.py +0 -0
  121. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/model_store.py +0 -0
  122. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/settings.py +0 -0
  123. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/udf_signature.py +0 -0
  124. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/utils.py +0 -0
  125. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/vfile.py +0 -0
  126. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/webdataset.py +0 -0
  127. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/lib/webdataset_laion.py +0 -0
  128. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/listing.py +0 -0
  129. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/node.py +0 -0
  130. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/nodes_fetcher.py +0 -0
  131. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/nodes_thread_pool.py +0 -0
  132. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/progress.py +0 -0
  133. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/py.typed +0 -0
  134. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/query/__init__.py +0 -0
  135. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/query/batch.py +0 -0
  136. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/query/builtins.py +0 -0
  137. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/query/dispatch.py +0 -0
  138. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/query/metrics.py +0 -0
  139. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/query/params.py +0 -0
  140. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/query/queue.py +0 -0
  141. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/query/schema.py +0 -0
  142. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/query/udf.py +0 -0
  143. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/remote/__init__.py +0 -0
  144. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/remote/studio.py +0 -0
  145. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/__init__.py +0 -0
  146. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/default/__init__.py +0 -0
  147. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/default/base.py +0 -0
  148. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/functions/__init__.py +0 -0
  149. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/functions/array.py +0 -0
  150. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/functions/conditional.py +0 -0
  151. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/functions/path.py +0 -0
  152. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/functions/random.py +0 -0
  153. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/functions/string.py +0 -0
  154. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/selectable.py +0 -0
  155. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/sqlite/__init__.py +0 -0
  156. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/sqlite/base.py +0 -0
  157. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/sqlite/types.py +0 -0
  158. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/sqlite/vector.py +0 -0
  159. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/types.py +0 -0
  160. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/sql/utils.py +0 -0
  161. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/storage.py +0 -0
  162. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/torch/__init__.py +0 -0
  163. {datachain-0.3.6 → datachain-0.3.8}/src/datachain/utils.py +0 -0
  164. {datachain-0.3.6 → datachain-0.3.8}/src/datachain.egg-info/dependency_links.txt +0 -0
  165. {datachain-0.3.6 → datachain-0.3.8}/src/datachain.egg-info/entry_points.txt +0 -0
  166. {datachain-0.3.6 → datachain-0.3.8}/src/datachain.egg-info/top_level.txt +0 -0
  167. {datachain-0.3.6 → datachain-0.3.8}/tests/__init__.py +0 -0
  168. {datachain-0.3.6 → datachain-0.3.8}/tests/benchmarks/__init__.py +0 -0
  169. {datachain-0.3.6 → datachain-0.3.8}/tests/benchmarks/conftest.py +0 -0
  170. {datachain-0.3.6 → datachain-0.3.8}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  171. {datachain-0.3.6 → datachain-0.3.8}/tests/benchmarks/datasets/.dvc/config +0 -0
  172. {datachain-0.3.6 → datachain-0.3.8}/tests/benchmarks/datasets/.gitignore +0 -0
  173. {datachain-0.3.6 → datachain-0.3.8}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  174. {datachain-0.3.6 → datachain-0.3.8}/tests/benchmarks/test_datachain.py +0 -0
  175. {datachain-0.3.6 → datachain-0.3.8}/tests/benchmarks/test_ls.py +0 -0
  176. {datachain-0.3.6 → datachain-0.3.8}/tests/benchmarks/test_version.py +0 -0
  177. {datachain-0.3.6 → datachain-0.3.8}/tests/conftest.py +0 -0
  178. {datachain-0.3.6 → datachain-0.3.8}/tests/data.py +0 -0
  179. {datachain-0.3.6 → datachain-0.3.8}/tests/examples/__init__.py +0 -0
  180. {datachain-0.3.6 → datachain-0.3.8}/tests/examples/test_examples.py +0 -0
  181. {datachain-0.3.6 → datachain-0.3.8}/tests/examples/test_wds_e2e.py +0 -0
  182. {datachain-0.3.6 → datachain-0.3.8}/tests/func/__init__.py +0 -0
  183. {datachain-0.3.6 → datachain-0.3.8}/tests/func/test_catalog.py +0 -0
  184. {datachain-0.3.6 → datachain-0.3.8}/tests/func/test_client.py +0 -0
  185. {datachain-0.3.6 → datachain-0.3.8}/tests/func/test_datachain.py +0 -0
  186. {datachain-0.3.6 → datachain-0.3.8}/tests/func/test_dataset_query.py +0 -0
  187. {datachain-0.3.6 → datachain-0.3.8}/tests/func/test_listing.py +0 -0
  188. {datachain-0.3.6 → datachain-0.3.8}/tests/func/test_ls.py +0 -0
  189. {datachain-0.3.6 → datachain-0.3.8}/tests/func/test_pull.py +0 -0
  190. {datachain-0.3.6 → datachain-0.3.8}/tests/func/test_query.py +0 -0
  191. {datachain-0.3.6 → datachain-0.3.8}/tests/scripts/feature_class.py +0 -0
  192. {datachain-0.3.6 → datachain-0.3.8}/tests/scripts/feature_class_parallel.py +0 -0
  193. {datachain-0.3.6 → datachain-0.3.8}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  194. {datachain-0.3.6 → datachain-0.3.8}/tests/scripts/name_len_slow.py +0 -0
  195. {datachain-0.3.6 → datachain-0.3.8}/tests/test_cli_e2e.py +0 -0
  196. {datachain-0.3.6 → datachain-0.3.8}/tests/test_query_e2e.py +0 -0
  197. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/__init__.py +0 -0
  198. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/__init__.py +0 -0
  199. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_clip.py +0 -0
  200. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  201. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_datachain_merge.py +0 -0
  202. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_feature_utils.py +0 -0
  203. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_file.py +0 -0
  204. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_image.py +0 -0
  205. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_schema.py +0 -0
  206. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_sql_to_python.py +0 -0
  207. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_text.py +0 -0
  208. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_udf_signature.py +0 -0
  209. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_utils.py +0 -0
  210. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/lib/test_webdataset.py +0 -0
  211. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/sql/__init__.py +0 -0
  212. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/sql/sqlite/__init__.py +0 -0
  213. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/sql/sqlite/test_utils.py +0 -0
  214. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/sql/test_array.py +0 -0
  215. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/sql/test_conditional.py +0 -0
  216. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/sql/test_path.py +0 -0
  217. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/sql/test_random.py +0 -0
  218. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/sql/test_selectable.py +0 -0
  219. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/sql/test_string.py +0 -0
  220. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_asyn.py +0 -0
  221. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_cache.py +0 -0
  222. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_catalog.py +0 -0
  223. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_catalog_loader.py +0 -0
  224. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_cli_parsing.py +0 -0
  225. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_client.py +0 -0
  226. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_client_s3.py +0 -0
  227. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_data_storage.py +0 -0
  228. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_database_engine.py +0 -0
  229. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_dataset.py +0 -0
  230. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_dispatch.py +0 -0
  231. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_fileslice.py +0 -0
  232. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_id_generator.py +0 -0
  233. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_listing.py +0 -0
  234. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_metastore.py +0 -0
  235. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_module_exports.py +0 -0
  236. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_query_metrics.py +0 -0
  237. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_query_params.py +0 -0
  238. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_serializer.py +0 -0
  239. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_session.py +0 -0
  240. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_storage.py +0 -0
  241. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_udf.py +0 -0
  242. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_utils.py +0 -0
  243. {datachain-0.3.6 → datachain-0.3.8}/tests/unit/test_warehouse.py +0 -0
  244. {datachain-0.3.6 → datachain-0.3.8}/tests/utils.py +0 -0
@@ -50,7 +50,7 @@ jobs:
50
50
  run: nox -s lint
51
51
 
52
52
  datachain:
53
- timeout-minutes: 25
53
+ timeout-minutes: 30
54
54
  runs-on: ${{ matrix.os }}
55
55
  strategy:
56
56
  fail-fast: false
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.6
3
+ Version: 0.3.8
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -42,6 +42,7 @@ Requires-Dist: jmespath>=1.0
42
42
  Requires-Dist: datamodel-code-generator>=0.25
43
43
  Requires-Dist: Pillow<11,>=10.0.0
44
44
  Requires-Dist: msgpack<2,>=1.0.4
45
+ Requires-Dist: psutil
45
46
  Provides-Extra: docs
46
47
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
47
48
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -58,8 +59,11 @@ Requires-Dist: lz4; extra == "remote"
58
59
  Requires-Dist: requests>=2.22.0; extra == "remote"
59
60
  Provides-Extra: vector
60
61
  Requires-Dist: usearch; extra == "vector"
62
+ Provides-Extra: hf
63
+ Requires-Dist: numba>=0.60.0; extra == "hf"
64
+ Requires-Dist: datasets[audio,vision]; extra == "hf"
61
65
  Provides-Extra: tests
62
- Requires-Dist: datachain[remote,torch,vector]; extra == "tests"
66
+ Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
63
67
  Requires-Dist: pytest<9,>=8; extra == "tests"
64
68
  Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
65
69
  Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
@@ -74,9 +78,10 @@ Requires-Dist: hypothesis; extra == "tests"
74
78
  Requires-Dist: open_clip_torch; extra == "tests"
75
79
  Requires-Dist: aiotools>=1.7.0; extra == "tests"
76
80
  Requires-Dist: requests-mock; extra == "tests"
81
+ Requires-Dist: scipy; extra == "tests"
77
82
  Provides-Extra: dev
78
83
  Requires-Dist: datachain[docs,tests]; extra == "dev"
79
- Requires-Dist: mypy==1.11.1; extra == "dev"
84
+ Requires-Dist: mypy==1.11.2; extra == "dev"
80
85
  Requires-Dist: types-python-dateutil; extra == "dev"
81
86
  Requires-Dist: types-pytz; extra == "dev"
82
87
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -44,7 +44,8 @@ dependencies = [
44
44
  "jmespath>=1.0",
45
45
  "datamodel-code-generator>=0.25",
46
46
  "Pillow>=10.0.0,<11",
47
- "msgpack>=1.0.4,<2"
47
+ "msgpack>=1.0.4,<2",
48
+ "psutil"
48
49
  ]
49
50
 
50
51
  [project.optional-dependencies]
@@ -68,8 +69,12 @@ remote = [
68
69
  vector = [
69
70
  "usearch"
70
71
  ]
72
+ hf = [
73
+ "numba>=0.60.0",
74
+ "datasets[audio,vision]"
75
+ ]
71
76
  tests = [
72
- "datachain[torch,remote,vector]",
77
+ "datachain[torch,remote,vector,hf]",
73
78
  "pytest>=8,<9",
74
79
  "pytest-sugar>=0.9.6",
75
80
  "pytest-cov>=4.1.0",
@@ -83,11 +88,12 @@ tests = [
83
88
  "hypothesis",
84
89
  "open_clip_torch",
85
90
  "aiotools>=1.7.0",
86
- "requests-mock"
91
+ "requests-mock",
92
+ "scipy"
87
93
  ]
88
94
  dev = [
89
95
  "datachain[docs,tests]",
90
- "mypy==1.11.1",
96
+ "mypy==1.11.2",
91
97
  "types-python-dateutil",
92
98
  "types-pytz",
93
99
  "types-PyYAML",
@@ -1540,87 +1540,6 @@ class Catalog:
1540
1540
  dataset = self.get_dataset(name)
1541
1541
  return self.update_dataset(dataset, **update_data)
1542
1542
 
1543
- def merge_datasets(
1544
- self,
1545
- src: DatasetRecord,
1546
- dst: DatasetRecord,
1547
- src_version: int,
1548
- dst_version: Optional[int] = None,
1549
- ) -> DatasetRecord:
1550
- """
1551
- Merges records from source to destination dataset.
1552
- It will create new version
1553
- of a dataset with records merged from old version and the source, unless
1554
- existing version is specified for destination in which case it must
1555
- be in non final status as datasets are immutable
1556
- """
1557
- if (
1558
- dst_version
1559
- and not dst.is_valid_next_version(dst_version)
1560
- and dst.get_version(dst_version).is_final_status()
1561
- ):
1562
- raise DatasetInvalidVersionError(
1563
- f"Version {dst_version} must be higher than the current latest one"
1564
- )
1565
-
1566
- src_dep = self.get_dataset_dependencies(src.name, src_version)
1567
- dst_dep = self.get_dataset_dependencies(
1568
- dst.name,
1569
- dst.latest_version, # type: ignore[arg-type]
1570
- )
1571
-
1572
- if dst.has_version(dst_version): # type: ignore[arg-type]
1573
- # case where we don't create new version, but append to the existing one
1574
- self.warehouse.merge_dataset_rows(
1575
- src,
1576
- dst,
1577
- src_version,
1578
- dst_version=dst_version, # type: ignore[arg-type]
1579
- )
1580
- merged_schema = src.serialized_schema | dst.serialized_schema
1581
- self.update_dataset(dst, schema=merged_schema)
1582
- self.update_dataset_version_with_warehouse_info(
1583
- dst,
1584
- dst_version, # type: ignore[arg-type]
1585
- schema=merged_schema,
1586
- )
1587
- for dep in src_dep:
1588
- if dep and dep not in dst_dep:
1589
- self.metastore.add_dependency(
1590
- dep,
1591
- dst.name,
1592
- dst_version, # type: ignore[arg-type]
1593
- )
1594
- else:
1595
- # case where we create new version of merged results
1596
- src_dr = self.warehouse.dataset_rows(src, src_version)
1597
- dst_dr = self.warehouse.dataset_rows(dst)
1598
-
1599
- merge_result_columns = list(
1600
- {
1601
- c.name: c for c in list(src_dr.table.c) + list(dst_dr.table.c)
1602
- }.values()
1603
- )
1604
-
1605
- dst_version = dst_version or dst.next_version
1606
- dst = self.create_new_dataset_version(
1607
- dst,
1608
- dst_version,
1609
- columns=merge_result_columns,
1610
- )
1611
- self.warehouse.merge_dataset_rows(
1612
- src,
1613
- dst,
1614
- src_version,
1615
- dst_version,
1616
- )
1617
- self.update_dataset_version_with_warehouse_info(dst, dst_version)
1618
- for dep in set(src_dep + dst_dep):
1619
- if dep:
1620
- self.metastore.add_dependency(dep, dst.name, dst_version)
1621
-
1622
- return dst
1623
-
1624
1543
  def get_file_signals(
1625
1544
  self, dataset_name: str, dataset_version: int, row: RowDict
1626
1545
  ) -> Optional[dict]:
@@ -336,36 +336,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
336
336
  help="Display size using powers of 1000 not 1024",
337
337
  )
338
338
 
339
- parse_merge_datasets = subp.add_parser(
340
- "merge-datasets", parents=[parent_parser], description="Merges datasets"
341
- )
342
- parse_merge_datasets.add_argument(
343
- "--src",
344
- action="store",
345
- default=None,
346
- help="Source dataset name",
347
- )
348
- parse_merge_datasets.add_argument(
349
- "--dst",
350
- action="store",
351
- default=None,
352
- help="Destination dataset name",
353
- )
354
- parse_merge_datasets.add_argument(
355
- "--src-version",
356
- action="store",
357
- default=None,
358
- type=int,
359
- help="Source dataset version",
360
- )
361
- parse_merge_datasets.add_argument(
362
- "--dst-version",
363
- action="store",
364
- default=None,
365
- type=int,
366
- help="Destination dataset version",
367
- )
368
-
369
339
  parse_ls = subp.add_parser(
370
340
  "ls", parents=[parent_parser], description="List storage contents"
371
341
  )
@@ -996,13 +966,6 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
996
966
  new_name=args.new_name,
997
967
  labels=args.labels,
998
968
  )
999
- elif args.command == "merge-datasets":
1000
- catalog.merge_datasets(
1001
- catalog.get_dataset(args.src),
1002
- catalog.get_dataset(args.dst),
1003
- args.src_version,
1004
- dst_version=args.dst_version,
1005
- )
1006
969
  elif args.command == "ls":
1007
970
  ls(
1008
971
  args.sources,
@@ -50,7 +50,7 @@ def convert_rows_custom_column_types(
50
50
  columns: "ColumnCollection[str, ColumnElement[Any]]",
51
51
  rows: Iterator[tuple[Any, ...]],
52
52
  dialect: "Dialect",
53
- ):
53
+ ) -> Iterator[tuple[Any, ...]]:
54
54
  """
55
55
  This function converts values of rows columns based on their types which are
56
56
  defined in columns. We are only converting column values for which types are
@@ -27,10 +27,7 @@ import datachain.sql.sqlite
27
27
  from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
28
28
  from datachain.data_storage.db_engine import DatabaseEngine
29
29
  from datachain.data_storage.id_generator import AbstractDBIDGenerator
30
- from datachain.data_storage.schema import (
31
- DefaultSchema,
32
- convert_rows_custom_column_types,
33
- )
30
+ from datachain.data_storage.schema import DefaultSchema
34
31
  from datachain.dataset import DatasetRecord
35
32
  from datachain.error import DataChainError
36
33
  from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
@@ -651,12 +648,6 @@ class SQLiteWarehouse(AbstractWarehouse):
651
648
  self.db.create_table(table, if_not_exists=if_not_exists)
652
649
  return table
653
650
 
654
- def dataset_rows_select(self, select_query: Select, **kwargs):
655
- rows = self.db.execute(select_query, **kwargs)
656
- yield from convert_rows_custom_column_types(
657
- select_query.selected_columns, rows, sqlite_dialect
658
- )
659
-
660
651
  def get_dataset_sources(
661
652
  self, dataset: DatasetRecord, version: int
662
653
  ) -> list[StorageURI]:
@@ -17,6 +17,7 @@ from sqlalchemy.sql.expression import true
17
17
  from tqdm import tqdm
18
18
 
19
19
  from datachain.client import Client
20
+ from datachain.data_storage.schema import convert_rows_custom_column_types
20
21
  from datachain.data_storage.serializer import Serializable
21
22
  from datachain.dataset import DatasetRecord
22
23
  from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
@@ -226,7 +227,7 @@ class AbstractWarehouse(ABC, Serializable):
226
227
  if limit < page_size:
227
228
  paginated_query = paginated_query.limit(None).limit(limit)
228
229
 
229
- results = self.db.execute(paginated_query.offset(offset))
230
+ results = self.dataset_rows_select(paginated_query.offset(offset))
230
231
 
231
232
  processed = False
232
233
  for row in results:
@@ -309,12 +310,18 @@ class AbstractWarehouse(ABC, Serializable):
309
310
  Merge results should not contain duplicates.
310
311
  """
311
312
 
312
- @abstractmethod
313
- def dataset_rows_select(self, select_query: sa.sql.selectable.Select, **kwargs):
313
+ def dataset_rows_select(
314
+ self,
315
+ query: sa.sql.selectable.Select,
316
+ **kwargs,
317
+ ) -> Iterator[tuple[Any, ...]]:
314
318
  """
315
- Method for fetching dataset rows from database. This is abstract since
316
- in some DBs we need to use special settings
319
+ Fetch dataset rows from database.
317
320
  """
321
+ rows = self.db.execute(query, **kwargs)
322
+ yield from convert_rows_custom_column_types(
323
+ query.selected_columns, rows, self.db.dialect
324
+ )
318
325
 
319
326
  @abstractmethod
320
327
  def get_dataset_sources(
@@ -95,7 +95,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
95
95
  if not column:
96
96
  column = f"c{default_column}"
97
97
  default_column += 1
98
- dtype = _arrow_type_mapper(field.type) # type: ignore[assignment]
98
+ dtype = arrow_type_mapper(field.type) # type: ignore[assignment]
99
99
  if field.nullable:
100
100
  dtype = Optional[dtype] # type: ignore[assignment]
101
101
  output[column] = dtype
@@ -103,7 +103,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
103
103
  return output
104
104
 
105
105
 
106
- def _arrow_type_mapper(col_type: pa.DataType) -> type: # noqa: PLR0911
106
+ def arrow_type_mapper(col_type: pa.DataType) -> type: # noqa: PLR0911
107
107
  """Convert pyarrow types to basic types."""
108
108
  from datetime import datetime
109
109
 
@@ -122,11 +122,11 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type: # noqa: PLR0911
122
122
  if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
123
123
  return str
124
124
  if pa.types.is_list(col_type):
125
- return list[_arrow_type_mapper(col_type.value_type)] # type: ignore[return-value, misc]
125
+ return list[arrow_type_mapper(col_type.value_type)] # type: ignore[return-value, misc]
126
126
  if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
127
127
  return dict
128
128
  if isinstance(col_type, pa.lib.DictionaryType):
129
- return _arrow_type_mapper(col_type.value_type) # type: ignore[return-value]
129
+ return arrow_type_mapper(col_type.value_type) # type: ignore[return-value]
130
130
  raise TypeError(f"{col_type!r} datatypes not supported")
131
131
 
132
132
 
@@ -1,5 +1,5 @@
1
1
  import inspect
2
- from typing import TYPE_CHECKING, Any, Callable, Literal, Union
2
+ from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
3
3
 
4
4
  import torch
5
5
  from transformers.modeling_utils import PreTrainedModel
@@ -39,6 +39,7 @@ def clip_similarity_scores(
39
39
  tokenizer: Callable,
40
40
  prob: bool = False,
41
41
  image_to_text: bool = True,
42
+ device: Optional[Union[str, torch.device]] = None,
42
43
  ) -> list[list[float]]:
43
44
  """
44
45
  Calculate CLIP similarity scores between one or more images and/or text.
@@ -52,6 +53,7 @@ def clip_similarity_scores(
52
53
  prob : Compute softmax probabilities.
53
54
  image_to_text : Whether to compute for image-to-text or text-to-image. Ignored
54
55
  if only one of images or text provided.
56
+ device : Device to use. Defaults to None, in which case the model's device is used.
55
57
 
56
58
 
57
59
  Example:
@@ -130,17 +132,26 @@ def clip_similarity_scores(
130
132
  ```
131
133
  """
132
134
 
135
+ if device is None:
136
+ if hasattr(model, "device"):
137
+ device = model.device
138
+ else:
139
+ device = next(model.parameters()).device
140
+ else:
141
+ model = model.to(device)
133
142
  with torch.no_grad():
134
143
  if images is not None:
135
144
  encoder = _get_encoder(model, "image")
136
145
  image_features = convert_images(
137
- images, transform=preprocess, encoder=encoder
146
+ images, transform=preprocess, encoder=encoder, device=device
138
147
  )
139
148
  image_features /= image_features.norm(dim=-1, keepdim=True) # type: ignore[union-attr]
140
149
 
141
150
  if text is not None:
142
151
  encoder = _get_encoder(model, "text")
143
- text_features = convert_text(text, tokenizer, encoder=encoder)
152
+ text_features = convert_text(
153
+ text, tokenizer, encoder=encoder, device=device
154
+ )
144
155
  text_features /= text_features.norm(dim=-1, keepdim=True) # type: ignore[union-attr]
145
156
 
146
157
  if images is not None and text is not None:
@@ -73,6 +73,9 @@ def python_to_sql(typ): # noqa: PLR0911
73
73
  if len(args) == 2 and (type(None) in args):
74
74
  return python_to_sql(args[0])
75
75
 
76
+ if _is_union_str_literal(orig, args):
77
+ return String
78
+
76
79
  if _is_json_inside_union(orig, args):
77
80
  return JSON
78
81
 
@@ -94,3 +97,9 @@ def _is_json_inside_union(orig, args) -> bool:
94
97
  if any(inspect.isclass(arg) and issubclass(arg, BaseModel) for arg in args):
95
98
  return True
96
99
  return False
100
+
101
+
102
+ def _is_union_str_literal(orig, args) -> bool:
103
+ if orig != Union:
104
+ return False
105
+ return all(arg is str or get_origin(arg) in (Literal, LiteralEx) for arg in args)
@@ -2,7 +2,7 @@ from collections.abc import Sequence
2
2
  from datetime import datetime
3
3
  from typing import ClassVar, Union, get_args, get_origin
4
4
 
5
- from pydantic import BaseModel
5
+ from pydantic import BaseModel, create_model
6
6
 
7
7
  from datachain.lib.model_store import ModelStore
8
8
 
@@ -57,3 +57,12 @@ def is_chain_type(t: type) -> bool:
57
57
  return is_chain_type(args[0])
58
58
 
59
59
  return False
60
+
61
+
62
+ def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
63
+ fields = {name: (anno, ...) for name, anno in data_dict.items()}
64
+ return create_model(
65
+ name,
66
+ __base__=(DataModel,), # type: ignore[call-overload]
67
+ **fields,
68
+ ) # type: ignore[call-overload]