datachain 0.2.10__tar.gz → 0.2.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (304) hide show
  1. {datachain-0.2.10 → datachain-0.2.12}/.github/workflows/tests.yml +1 -1
  2. {datachain-0.2.10 → datachain-0.2.12}/CONTRIBUTING.rst +3 -3
  3. datachain-0.2.12/PKG-INFO +412 -0
  4. datachain-0.2.12/README.rst +328 -0
  5. datachain-0.2.12/docs/assets/captioned_cartoons.png +0 -0
  6. datachain-0.2.12/docs/assets/flowchart.png +0 -0
  7. datachain-0.2.12/docs/index.md +304 -0
  8. datachain-0.2.12/docs/references/datachain.md +18 -0
  9. datachain-0.2.12/docs/references/datatype.md +19 -0
  10. datachain-0.2.12/docs/references/file.md +22 -0
  11. datachain-0.2.12/docs/references/index.md +8 -0
  12. datachain-0.2.12/docs/references/sql.md +18 -0
  13. datachain-0.2.12/docs/references/torch.md +17 -0
  14. datachain-0.2.12/docs/references/udf.md +18 -0
  15. datachain-0.2.12/examples/computer_vision/blip2_image_desc_lib.py +102 -0
  16. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/.gitignore +1 -0
  17. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +775 -1217
  18. datachain-0.2.12/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +4083 -0
  19. datachain-0.2.12/examples/computer_vision/fashion_product_images/3-train-model.ipynb +1080 -0
  20. datachain-0.2.12/examples/computer_vision/fashion_product_images/4-inference.ipynb +754 -0
  21. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/README.md +1 -1
  22. datachain-0.2.12/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +44 -0
  23. datachain-0.2.12/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +49 -0
  24. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +10 -18
  25. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +5 -7
  26. datachain-0.2.12/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +52 -0
  27. datachain-0.2.12/examples/computer_vision/fashion_product_images/src/train.py +143 -0
  28. datachain-0.2.12/examples/computer_vision/iptc_exif_xmp_lib.py +75 -0
  29. datachain-0.2.12/examples/computer_vision/llava2_image_desc_lib.py +82 -0
  30. datachain-0.2.12/examples/computer_vision/openimage-detect.py +63 -0
  31. datachain-0.2.12/examples/get_started/common_sql_functions.py +93 -0
  32. {datachain-0.2.10/examples → datachain-0.2.12/examples/get_started}/json-csv-reader.py +14 -31
  33. {datachain-0.2.10/examples → datachain-0.2.12/examples/get_started}/torch-loader.py +9 -5
  34. datachain-0.2.12/examples/get_started/udfs/parallel.py +39 -0
  35. datachain-0.2.12/examples/get_started/udfs/simple.py +19 -0
  36. datachain-0.2.12/examples/get_started/udfs/stateful.py +43 -0
  37. {datachain-0.2.10/examples → datachain-0.2.12/examples/llm_and_nlp}/llm-claude-aggregate-query.py +3 -5
  38. {datachain-0.2.10/examples → datachain-0.2.12/examples/llm_and_nlp}/llm-claude-simple-query.py +10 -5
  39. {datachain-0.2.10/examples → datachain-0.2.12/examples/llm_and_nlp}/llm-claude.py +2 -5
  40. datachain-0.2.12/examples/llm_and_nlp/unstructured-text.py +63 -0
  41. {datachain-0.2.10/examples → datachain-0.2.12/examples/multimodal}/clip.py +6 -6
  42. {datachain-0.2.10 → datachain-0.2.12}/examples/multimodal/clip_fine_tuning.ipynb +532 -277
  43. datachain-0.2.12/examples/multimodal/hf_pipeline.py +124 -0
  44. datachain-0.2.12/examples/multimodal/openai_image_desc_lib.py +95 -0
  45. {datachain-0.2.10/examples → datachain-0.2.12/examples/multimodal}/wds.py +6 -6
  46. datachain-0.2.12/examples/multimodal/wds_filtered.py +38 -0
  47. {datachain-0.2.10 → datachain-0.2.12}/mkdocs.yml +10 -5
  48. {datachain-0.2.10 → datachain-0.2.12}/pyproject.toml +14 -5
  49. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/__init__.py +3 -4
  50. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/cache.py +10 -4
  51. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/catalog/catalog.py +35 -15
  52. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/cli.py +37 -32
  53. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/metastore.py +24 -0
  54. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/warehouse.py +3 -1
  55. datachain-0.2.12/src/datachain/job.py +56 -0
  56. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/arrow.py +19 -7
  57. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/clip.py +89 -66
  58. datachain-0.2.10/src/datachain/lib/convert/type_converter.py → datachain-0.2.12/src/datachain/lib/convert/python_to_sql.py +6 -6
  59. datachain-0.2.12/src/datachain/lib/convert/sql_to_python.py +23 -0
  60. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/convert/values_to_tuples.py +51 -33
  61. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/data_model.py +6 -27
  62. datachain-0.2.12/src/datachain/lib/dataset_info.py +70 -0
  63. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/dc.py +646 -152
  64. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/file.py +117 -15
  65. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/image.py +1 -1
  66. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/meta_formats.py +14 -2
  67. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/model_store.py +3 -2
  68. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/pytorch.py +10 -7
  69. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/signal_schema.py +39 -14
  70. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/text.py +2 -1
  71. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/udf.py +56 -5
  72. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/udf_signature.py +1 -1
  73. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/webdataset.py +4 -3
  74. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/node.py +11 -8
  75. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/dataset.py +66 -147
  76. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/dispatch.py +15 -13
  77. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/schema.py +2 -0
  78. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/session.py +4 -4
  79. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/array.py +12 -0
  80. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/string.py +8 -0
  81. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/torch/__init__.py +1 -1
  82. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/utils.py +45 -0
  83. datachain-0.2.12/src/datachain.egg-info/PKG-INFO +412 -0
  84. {datachain-0.2.10 → datachain-0.2.12}/src/datachain.egg-info/SOURCES.txt +37 -48
  85. {datachain-0.2.10 → datachain-0.2.12}/src/datachain.egg-info/requires.txt +2 -1
  86. {datachain-0.2.10 → datachain-0.2.12}/tests/examples/test_wds_e2e.py +5 -5
  87. {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_catalog.py +1 -1
  88. datachain-0.2.12/tests/func/test_datachain.py +217 -0
  89. {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_dataset_query.py +156 -123
  90. datachain-0.2.12/tests/func/test_feature_pickling.py +209 -0
  91. {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_query.py +2 -2
  92. {datachain-0.2.10 → datachain-0.2.12}/tests/scripts/feature_class_parallel.py +0 -1
  93. datachain-0.2.12/tests/scripts/feature_class_parallel_data_model.py +28 -0
  94. {datachain-0.2.10 → datachain-0.2.12}/tests/scripts/name_len_normal.py +1 -1
  95. {datachain-0.2.10 → datachain-0.2.12}/tests/test_query_e2e.py +57 -16
  96. datachain-0.2.12/tests/unit/lib/conftest.py +72 -0
  97. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_arrow.py +17 -0
  98. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_clip.py +2 -4
  99. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_datachain.py +208 -77
  100. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_datachain_bootstrap.py +5 -5
  101. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_datachain_merge.py +14 -8
  102. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_feature.py +1 -1
  103. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_feature_utils.py +2 -2
  104. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_file.py +115 -2
  105. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_image.py +4 -5
  106. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_signal_schema.py +39 -13
  107. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_text.py +6 -8
  108. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_utils.py +4 -4
  109. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_catalog.py +13 -13
  110. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_module_exports.py +0 -4
  111. datachain-0.2.10/.reuse/dep5 +0 -8
  112. datachain-0.2.10/LICENSES/Apache-2.0.txt +0 -73
  113. datachain-0.2.10/LICENSES/BSD-3-Clause.txt +0 -11
  114. datachain-0.2.10/LICENSES/Python-2.0.txt +0 -72
  115. datachain-0.2.10/PKG-INFO +0 -430
  116. datachain-0.2.10/README.rst +0 -347
  117. datachain-0.2.10/docs/index.md +0 -3
  118. datachain-0.2.10/docs/references/catalog.md +0 -3
  119. datachain-0.2.10/docs/references/datachain.md +0 -3
  120. datachain-0.2.10/examples/blip2_image_desc_lib.py +0 -35
  121. datachain-0.2.10/examples/common_sql_functions.py +0 -78
  122. datachain-0.2.10/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -3589
  123. datachain-0.2.10/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -91
  124. datachain-0.2.10/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -51
  125. datachain-0.2.10/examples/hf_pipeline.py +0 -98
  126. datachain-0.2.10/examples/iptc_exif_xmp_lib.py +0 -15
  127. datachain-0.2.10/examples/llava2_image_desc_lib.py +0 -43
  128. datachain-0.2.10/examples/loader.py +0 -31
  129. datachain-0.2.10/examples/neurips/README +0 -18
  130. datachain-0.2.10/examples/neurips/distance_to_query.py +0 -29
  131. datachain-0.2.10/examples/neurips/llm_chat.py +0 -46
  132. datachain-0.2.10/examples/neurips/requirements.txt +0 -9
  133. datachain-0.2.10/examples/neurips/single_query.py +0 -119
  134. datachain-0.2.10/examples/neurips/text_loaders.py +0 -80
  135. datachain-0.2.10/examples/openai_image_desc_lib.py +0 -29
  136. datachain-0.2.10/examples/openimage-detect.py +0 -72
  137. datachain-0.2.10/examples/pose_detection.py +0 -219
  138. datachain-0.2.10/examples/udfs/batching.py +0 -34
  139. datachain-0.2.10/examples/udfs/image_transformation.py +0 -45
  140. datachain-0.2.10/examples/udfs/parallel.py +0 -55
  141. datachain-0.2.10/examples/udfs/simple.py +0 -42
  142. datachain-0.2.10/examples/udfs/stateful.py +0 -44
  143. datachain-0.2.10/examples/udfs/stateful_similarity.py +0 -79
  144. datachain-0.2.10/examples/unstructured-text.py +0 -54
  145. datachain-0.2.10/examples/wds_filtered.py +0 -55
  146. datachain-0.2.10/examples/zalando/zalando_clip.py +0 -44
  147. datachain-0.2.10/examples/zalando/zalando_dir_as_class.py +0 -31
  148. datachain-0.2.10/examples/zalando/zalando_splits_and_classes_ds.py +0 -9
  149. datachain-0.2.10/examples/zalando/zalando_splits_and_classes_output.py +0 -17
  150. datachain-0.2.10/src/datachain/lib/feature_registry.py +0 -77
  151. datachain-0.2.10/src/datachain/lib/gpt4_vision.py +0 -97
  152. datachain-0.2.10/src/datachain/lib/hf_image_to_text.py +0 -97
  153. datachain-0.2.10/src/datachain/lib/hf_pipeline.py +0 -90
  154. datachain-0.2.10/src/datachain/lib/image_transform.py +0 -103
  155. datachain-0.2.10/src/datachain/lib/iptc_exif_xmp.py +0 -76
  156. datachain-0.2.10/src/datachain/lib/unstructured.py +0 -41
  157. datachain-0.2.10/src/datachain/text/__init__.py +0 -3
  158. datachain-0.2.10/src/datachain.egg-info/PKG-INFO +0 -430
  159. datachain-0.2.10/tests/func/test_datachain.py +0 -58
  160. datachain-0.2.10/tests/unit/lib/conftest.py +0 -21
  161. {datachain-0.2.10 → datachain-0.2.12}/.cruft.json +0 -0
  162. {datachain-0.2.10 → datachain-0.2.12}/.gitattributes +0 -0
  163. {datachain-0.2.10 → datachain-0.2.12}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  164. {datachain-0.2.10 → datachain-0.2.12}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  165. {datachain-0.2.10 → datachain-0.2.12}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  166. {datachain-0.2.10 → datachain-0.2.12}/.github/codecov.yaml +0 -0
  167. {datachain-0.2.10 → datachain-0.2.12}/.github/dependabot.yml +0 -0
  168. {datachain-0.2.10 → datachain-0.2.12}/.github/workflows/benchmarks.yml +0 -0
  169. {datachain-0.2.10 → datachain-0.2.12}/.github/workflows/release.yml +0 -0
  170. {datachain-0.2.10 → datachain-0.2.12}/.github/workflows/update-template.yaml +0 -0
  171. {datachain-0.2.10 → datachain-0.2.12}/.gitignore +0 -0
  172. {datachain-0.2.10 → datachain-0.2.12}/.pre-commit-config.yaml +0 -0
  173. {datachain-0.2.10 → datachain-0.2.12}/CODE_OF_CONDUCT.rst +0 -0
  174. {datachain-0.2.10 → datachain-0.2.12}/LICENSE +0 -0
  175. {datachain-0.2.10 → datachain-0.2.12}/docs/assets/datachain.png +0 -0
  176. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
  177. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
  178. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  179. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  180. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  181. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  182. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  183. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  184. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  185. {datachain-0.2.10 → datachain-0.2.12}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  186. {datachain-0.2.10 → datachain-0.2.12}/noxfile.py +0 -0
  187. {datachain-0.2.10 → datachain-0.2.12}/setup.cfg +0 -0
  188. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/__main__.py +0 -0
  189. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/asyn.py +0 -0
  190. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/catalog/__init__.py +0 -0
  191. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/catalog/datasource.py +0 -0
  192. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/catalog/loader.py +0 -0
  193. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/catalog/subclass.py +0 -0
  194. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/cli_utils.py +0 -0
  195. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/__init__.py +0 -0
  196. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/azure.py +0 -0
  197. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/fileslice.py +0 -0
  198. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/fsspec.py +0 -0
  199. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/gcs.py +0 -0
  200. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/local.py +0 -0
  201. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/client/s3.py +0 -0
  202. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/config.py +0 -0
  203. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/__init__.py +0 -0
  204. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/db_engine.py +0 -0
  205. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/id_generator.py +0 -0
  206. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/job.py +0 -0
  207. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/schema.py +0 -0
  208. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/serializer.py +0 -0
  209. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/data_storage/sqlite.py +0 -0
  210. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/dataset.py +0 -0
  211. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/error.py +0 -0
  212. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/__init__.py +0 -0
  213. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/convert/__init__.py +0 -0
  214. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/convert/flatten.py +0 -0
  215. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/convert/unflatten.py +0 -0
  216. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/settings.py +0 -0
  217. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/utils.py +0 -0
  218. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/vfile.py +0 -0
  219. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/lib/webdataset_laion.py +0 -0
  220. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/listing.py +0 -0
  221. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/nodes_fetcher.py +0 -0
  222. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/nodes_thread_pool.py +0 -0
  223. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/progress.py +0 -0
  224. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/py.typed +0 -0
  225. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/__init__.py +0 -0
  226. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/batch.py +0 -0
  227. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/builtins.py +0 -0
  228. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/metrics.py +0 -0
  229. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/params.py +0 -0
  230. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/query/udf.py +0 -0
  231. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/remote/__init__.py +0 -0
  232. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/remote/studio.py +0 -0
  233. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/__init__.py +0 -0
  234. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/default/__init__.py +0 -0
  235. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/default/base.py +0 -0
  236. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/__init__.py +0 -0
  237. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/conditional.py +0 -0
  238. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/path.py +0 -0
  239. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/functions/random.py +0 -0
  240. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/selectable.py +0 -0
  241. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/sqlite/__init__.py +0 -0
  242. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/sqlite/base.py +0 -0
  243. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/sqlite/types.py +0 -0
  244. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/sqlite/vector.py +0 -0
  245. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/types.py +0 -0
  246. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/sql/utils.py +0 -0
  247. {datachain-0.2.10 → datachain-0.2.12}/src/datachain/storage.py +0 -0
  248. {datachain-0.2.10 → datachain-0.2.12}/src/datachain.egg-info/dependency_links.txt +0 -0
  249. {datachain-0.2.10 → datachain-0.2.12}/src/datachain.egg-info/entry_points.txt +0 -0
  250. {datachain-0.2.10 → datachain-0.2.12}/src/datachain.egg-info/top_level.txt +0 -0
  251. {datachain-0.2.10 → datachain-0.2.12}/tests/__init__.py +0 -0
  252. {datachain-0.2.10 → datachain-0.2.12}/tests/benchmarks/__init__.py +0 -0
  253. {datachain-0.2.10 → datachain-0.2.12}/tests/benchmarks/conftest.py +0 -0
  254. {datachain-0.2.10 → datachain-0.2.12}/tests/benchmarks/test_ls.py +0 -0
  255. {datachain-0.2.10 → datachain-0.2.12}/tests/benchmarks/test_version.py +0 -0
  256. {datachain-0.2.10 → datachain-0.2.12}/tests/conftest.py +0 -0
  257. {datachain-0.2.10 → datachain-0.2.12}/tests/data.py +0 -0
  258. {datachain-0.2.10 → datachain-0.2.12}/tests/examples/__init__.py +0 -0
  259. {datachain-0.2.10 → datachain-0.2.12}/tests/examples/wds_data.py +0 -0
  260. {datachain-0.2.10 → datachain-0.2.12}/tests/func/__init__.py +0 -0
  261. {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_client.py +0 -0
  262. {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_datasets.py +0 -0
  263. {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_ls.py +0 -0
  264. {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_pull.py +0 -0
  265. {datachain-0.2.10 → datachain-0.2.12}/tests/func/test_pytorch.py +0 -0
  266. {datachain-0.2.10 → datachain-0.2.12}/tests/scripts/feature_class.py +0 -0
  267. {datachain-0.2.10 → datachain-0.2.12}/tests/scripts/name_len_slow.py +0 -0
  268. {datachain-0.2.10 → datachain-0.2.12}/tests/test_cli_e2e.py +0 -0
  269. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/__init__.py +0 -0
  270. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/__init__.py +0 -0
  271. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_udf_signature.py +0 -0
  272. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/lib/test_webdataset.py +0 -0
  273. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/__init__.py +0 -0
  274. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/sqlite/__init__.py +0 -0
  275. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/sqlite/test_utils.py +0 -0
  276. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_array.py +0 -0
  277. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_conditional.py +0 -0
  278. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_path.py +0 -0
  279. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_random.py +0 -0
  280. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_selectable.py +0 -0
  281. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/sql/test_string.py +0 -0
  282. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_asyn.py +0 -0
  283. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_cache.py +0 -0
  284. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_catalog_loader.py +0 -0
  285. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_cli_parsing.py +0 -0
  286. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_client.py +0 -0
  287. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_client_s3.py +0 -0
  288. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_data_storage.py +0 -0
  289. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_database_engine.py +0 -0
  290. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_dataset.py +0 -0
  291. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_dispatch.py +0 -0
  292. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_fileslice.py +0 -0
  293. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_id_generator.py +0 -0
  294. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_listing.py +0 -0
  295. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_metastore.py +0 -0
  296. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_query_metrics.py +0 -0
  297. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_query_params.py +0 -0
  298. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_serializer.py +0 -0
  299. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_session.py +0 -0
  300. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_storage.py +0 -0
  301. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_udf.py +0 -0
  302. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_utils.py +0 -0
  303. {datachain-0.2.10 → datachain-0.2.12}/tests/unit/test_warehouse.py +0 -0
  304. {datachain-0.2.10 → datachain-0.2.12}/tests/utils.py +0 -0
@@ -191,7 +191,7 @@ jobs:
191
191
  # Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
192
192
  run: >
193
193
  pytest
194
- --config-file=pyproject.toml -rsx
194
+ --config-file=pyproject.toml -rs
195
195
  --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
196
196
  tests ../datachain/tests
197
197
  working-directory: backend/datachain_server
@@ -13,9 +13,9 @@ Here is a list of important resources for contributors:
13
13
  - `Code of Conduct`_
14
14
 
15
15
  .. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0
16
- .. _Source Code: https://github.com/iterative/dvcx
16
+ .. _Source Code: https://github.com/iterative/datachain
17
17
  .. _Documentation: https://docs.dvc.ai/datachain
18
- .. _Issue Tracker: https://github.com/iterative/dvcx/issues
18
+ .. _Issue Tracker: https://github.com/iterative/datachain/issues
19
19
 
20
20
  How to report a bug
21
21
  -------------------
@@ -124,6 +124,6 @@ To run linting and code formatting checks, you can invoke a `lint` session in no
124
124
  It is recommended to open an issue before starting work on anything.
125
125
  This will allow a chance to talk it over with the owners and validate your approach.
126
126
 
127
- .. _pull request: https://github.com/iterative/dvcx/pulls
127
+ .. _pull request: https://github.com/iterative/datachain/pulls
128
128
  .. github-only
129
129
  .. _Code of Conduct: CODE_OF_CONDUCT.rst
@@ -0,0 +1,412 @@
1
+ Metadata-Version: 2.1
2
+ Name: datachain
3
+ Version: 0.2.12
4
+ Summary: Wrangle unstructured AI data at scale
5
+ Author-email: Dmitry Petrov <support@dvc.org>
6
+ License: Apache-2.0
7
+ Project-URL: Documentation, https://datachain.dvc.ai
8
+ Project-URL: Issues, https://github.com/iterative/datachain/issues
9
+ Project-URL: Source, https://github.com/iterative/datachain
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Development Status :: 2 - Pre-Alpha
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/x-rst
18
+ License-File: LICENSE
19
+ Requires-Dist: pyyaml
20
+ Requires-Dist: tomlkit
21
+ Requires-Dist: tqdm
22
+ Requires-Dist: numpy
23
+ Requires-Dist: numpy<2,>=1; sys_platform == "win32"
24
+ Requires-Dist: pandas>=2.0.0
25
+ Requires-Dist: pyarrow
26
+ Requires-Dist: typing-extensions
27
+ Requires-Dist: python-dateutil>=2
28
+ Requires-Dist: attrs>=21.3.0
29
+ Requires-Dist: s3fs>=2024.2.0
30
+ Requires-Dist: gcsfs>=2024.2.0
31
+ Requires-Dist: adlfs>=2024.2.0
32
+ Requires-Dist: dvc-data<4,>=3.10
33
+ Requires-Dist: dvc-objects<6,>=4
34
+ Requires-Dist: shtab<2,>=1.3.4
35
+ Requires-Dist: sqlalchemy>=2
36
+ Requires-Dist: multiprocess==0.70.16
37
+ Requires-Dist: dill==0.3.8
38
+ Requires-Dist: cloudpickle
39
+ Requires-Dist: ujson>=5.9.0
40
+ Requires-Dist: pydantic<3,>=2
41
+ Requires-Dist: jmespath>=1.0
42
+ Requires-Dist: datamodel-code-generator>=0.25
43
+ Requires-Dist: Pillow<11,>=10.0.0
44
+ Provides-Extra: docs
45
+ Requires-Dist: mkdocs>=1.5.2; extra == "docs"
46
+ Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
47
+ Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
48
+ Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
49
+ Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
50
+ Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
51
+ Provides-Extra: torch
52
+ Requires-Dist: torch>=2.1.0; extra == "torch"
53
+ Requires-Dist: torchvision; extra == "torch"
54
+ Requires-Dist: transformers>=4.36.0; extra == "torch"
55
+ Provides-Extra: remote
56
+ Requires-Dist: lz4; extra == "remote"
57
+ Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
58
+ Requires-Dist: requests>=2.22.0; extra == "remote"
59
+ Provides-Extra: vector
60
+ Requires-Dist: usearch; extra == "vector"
61
+ Provides-Extra: tests
62
+ Requires-Dist: datachain[remote,torch,vector]; extra == "tests"
63
+ Requires-Dist: pytest<9,>=8; extra == "tests"
64
+ Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
65
+ Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
66
+ Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
67
+ Requires-Dist: pytest-servers[all]>=0.5.5; extra == "tests"
68
+ Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
69
+ Requires-Dist: pytest-asyncio>=0.23.2; extra == "tests"
70
+ Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
71
+ Requires-Dist: virtualenv; extra == "tests"
72
+ Requires-Dist: dulwich; extra == "tests"
73
+ Requires-Dist: hypothesis; extra == "tests"
74
+ Requires-Dist: open_clip_torch; extra == "tests"
75
+ Requires-Dist: aiotools>=1.7.0; extra == "tests"
76
+ Requires-Dist: requests-mock; extra == "tests"
77
+ Provides-Extra: dev
78
+ Requires-Dist: datachain[docs,tests]; extra == "dev"
79
+ Requires-Dist: mypy==1.10.1; extra == "dev"
80
+ Requires-Dist: types-python-dateutil; extra == "dev"
81
+ Requires-Dist: types-PyYAML; extra == "dev"
82
+ Requires-Dist: types-requests; extra == "dev"
83
+ Requires-Dist: types-ujson; extra == "dev"
84
+
85
+ |PyPI| |Python Version| |Codecov| |Tests|
86
+
87
+ .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
88
+ :target: https://pypi.org/project/datachain/
89
+ :alt: PyPI
90
+ .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
91
+ :target: https://pypi.org/project/datachain
92
+ :alt: Python Version
93
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
94
+ :target: https://codecov.io/gh/iterative/datachain
95
+ :alt: Codecov
96
+ .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
97
+ :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
98
+ :alt: Tests
99
+
100
+ AI 🔗 DataChain
101
+ ----------------
102
+
103
+ DataChain is an open-source Python library for processing and curating unstructured
104
+ data at scale.
105
+
106
+ 🤖 AI-Driven Data Curation: Use local ML models and LLM API calls to enrich your data.
107
+
108
+ 🚀 GenAI Dataset scale: Handle 10s of millions of files or file snippets.
109
+
110
+ 🐍 Python-friendly: Use strictly typed `Pydantic`_ objects instead of JSON.
111
+
112
+
113
+ To ensure efficiency, DataChain supports parallel processing, parallel data
114
+ downloads, and out-of-memory computing. It excels at optimizing batch operations.
115
+ While most GenAI tools focus on online and real-time applications, DataChain is designed
116
+ for offline data processing, data curation and ETL.
117
+
118
+ The typical use cases are Computer Vision data curation, LLM analytics
119
+ and validation.
120
+
121
+
122
+ .. code:: console
123
+
124
+ $ pip install datachain
125
+
126
+ |Flowchart|
127
+
128
+ Quick Start
129
+ -----------
130
+
131
+ Basic evaluation
132
+ ================
133
+
134
+ We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
135
+ - 50 files total in the example.
136
+ These dialogs involve users looking for better wireless plans chatting with a bot.
137
+ Our goal is to identify successful dialogs.
138
+
139
+ The data used in the examples is publicly available. Please feel free to run this code.
140
+
141
+ First, we'll use a simple sentiment analysis model. Please install transformers.
142
+
143
+ .. code:: shell
144
+
145
+ pip install transformers
146
+
147
+ The code below downloads files from the cloud and applies the function
148
+ `is_positive_dialogue_ending()` to each. All files with a positive sentiment
149
+ are copied to the local directory `output/`.
150
+
151
+ .. code:: py
152
+
153
+ from transformers import pipeline
154
+ from datachain import DataChain, Column
155
+
156
+ classifier = pipeline("sentiment-analysis", device="cpu",
157
+ model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
158
+
159
+ def is_positive_dialogue_ending(file) -> bool:
160
+ dialogue_ending = file.read()[-512:]
161
+ return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
162
+
163
+ chain = (
164
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
165
+ object_name="file", type="text")
166
+ .settings(parallel=8, cache=True)
167
+ .map(is_positive=is_positive_dialogue_ending)
168
+ .save("file_response")
169
+ )
170
+
171
+ positive_chain = chain.filter(Column("is_positive") == True)
172
+ positive_chain.export_files("./output1")
173
+
174
+ print(f"{positive_chain.count()} files were exported")
175
+
176
+
177
+
178
+ 13 files were exported
179
+
180
+ .. code:: shell
181
+
182
+ $ ls output/datachain-demo/chatbot-KiT/
183
+ 15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ...
184
+ $ ls output/datachain-demo/chatbot-KiT/ | wc -l
185
+ 13
186
+
187
+
188
+ LLM judging LLMs dialogs
189
+ ==========================
190
+
191
+ Finding good dialogs using an LLM can be more efficient. In this example,
192
+ we use Mistral with a free API. Please install the package and get a free
193
+ Mistral API key at https://console.mistral.ai
194
+
195
+ .. code:: shell
196
+
197
+ $ pip install mistralai
198
+ $ export MISTRAL_API_KEY=_your_key_
199
+
200
+ Below is a similar code example, but this time using an LLM to evaluate the dialogs.
201
+ Note that only 4 threads are used in this example (`parallel=4`) due to a limitation of
202
+ the free LLM service.
203
+
204
+ .. code:: py
205
+
206
+ from mistralai.client import MistralClient
207
+ from mistralai.models.chat_completion import ChatMessage
208
+ from datachain import File, DataChain, Column
209
+
210
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
211
+
212
+ def eval_dialogue(file: File) -> bool:
213
+ client = MistralClient()
214
+ response = client.chat(
215
+ model="open-mixtral-8x22b",
216
+ messages=[ChatMessage(role="system", content=PROMPT),
217
+ ChatMessage(role="user", content=file.read())])
218
+ result = response.choices[0].message.content
219
+ return result.lower().startswith("success")
220
+
221
+ chain = (
222
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
223
+ .settings(parallel=4, cache=True)
224
+ .map(is_success=eval_dialogue)
225
+ .save("mistral_files")
226
+ )
227
+
228
+ successful_chain = chain.filter(Column("is_success") == True)
229
+ successful_chain.export_files("./output_mistral")
230
+
231
+ print(f"{successful_chain.count()} files were exported")
232
+
233
+
234
+ With the current prompt, we found 31 files considered successful dialogs:
235
+
236
+ .. code:: shell
237
+
238
+ $ ls output_mistral/datachain-demo/chatbot-KiT/
239
+ 1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
240
+ $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
241
+ 31
242
+
243
+
244
+
245
+ Serializing Python objects
246
+ ==========================
247
+
248
+ LLM responses contain valuable information for analytics, such as tokens used and the
249
+ model. Preserving this information can be beneficial.
250
+
251
+ Instead of extracting this information from the Mistral data structure (class
252
+ `ChatCompletionResponse`), we serialize the entire Python object to the internal DB.
253
+
254
+
255
+ .. code:: py
256
+
257
+ from mistralai.client import MistralClient
258
+ from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
259
+ from datachain import File, DataChain, Column
260
+
261
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
262
+
263
+ def eval_dialog(file: File) -> ChatCompletionResponse:
264
+ client = MistralClient()
265
+ return client.chat(
266
+ model="open-mixtral-8x22b",
267
+ messages=[ChatMessage(role="system", content=PROMPT),
268
+ ChatMessage(role="user", content=file.read())])
269
+
270
+ chain = (
271
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
272
+ .settings(parallel=4, cache=True)
273
+ .map(response=eval_dialog)
274
+ .map(status=lambda response: response.choices[0].message.content.lower()[:7])
275
+ .save("response")
276
+ )
277
+
278
+ chain.select("file.name", "status", "response.usage").show(5)
279
+
280
+ success_rate = chain.filter(Column("status") == "success").count() / chain.count()
281
+ print(f"{100*success_rate:.1f}% dialogs were successful")
282
+
283
+ Output:
284
+
285
+ .. code:: shell
286
+
287
+ file status response response response
288
+ name usage usage usage
289
+ prompt_tokens total_tokens completion_tokens
290
+ 0 1.txt success 547 548 1
291
+ 1 10.txt failure 3576 3578 2
292
+ 2 11.txt failure 626 628 2
293
+ 3 12.txt failure 1144 1182 38
294
+ 4 13.txt success 1100 1101 1
295
+
296
+ [Limited by 5 rows]
297
+ 64.0% dialogs were successful
298
+
299
+
300
+ Complex Python data structures
301
+ =============================================
302
+
303
+ In the previous examples, a few datasets were saved in the embedded database
304
+ (`SQLite`_ in directory `.datachain`).
305
+ These datasets are versioned, and can be accessed using
306
+ `DataChain.from_dataset("dataset_name")`.
307
+
308
+ .. code:: py
309
+
310
+ chain = DataChain.from_dataset("response")
311
+
312
+ # Iterating one-by-one: out of memory
313
+ for file, response in chain.limit(5).collect("file", "response"):
314
+ # You work with Python objects
315
+ assert isinstance(response, ChatCompletionResponse)
316
+
317
+ status = response.choices[0].message.content[:7]
318
+ tokens = response.usage.total_tokens
319
+ print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}")
320
+
321
+ Output:
322
+
323
+ .. code:: shell
324
+
325
+ gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548
326
+ gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578
327
+ gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628
328
+ gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207
329
+ gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101
330
+
331
+
332
+ Vectorized analytics over Python objects
333
+ ========================================
334
+
335
+ Some operations can be efficiently run inside the DB without deserializing Python objects.
336
+ Let's calculate the cost of using LLM APIs in a vectorized way.
337
+ Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
338
+
339
+ .. code:: py
340
+
341
+ chain = DataChain.from_dataset("mistral_dataset")
342
+
343
+ cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
344
+ + chain.sum("response.usage.completion_tokens")*0.000006
345
+ print(f"Spent ${cost:.2f} on {chain.count()} calls")
346
+
347
+ Output:
348
+
349
+ .. code:: shell
350
+
351
+ Spent $0.08 on 50 calls
352
+
353
+
354
+ PyTorch data loader
355
+ ===================
356
+
357
+ Chain results can be exported or passed directly to a PyTorch dataloader.
358
+ For example, if we are interested in passing an image and a label based on file
359
+ name suffix, the following code will do it:
360
+
361
+ .. code:: py
362
+
363
+ from torch.utils.data import DataLoader
364
+ from transformers import CLIPProcessor
365
+
366
+ from datachain import C, DataChain
367
+
368
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
369
+
370
+ chain = (
371
+ DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
372
+ .map(label=lambda name: name.split(".")[0], params=["file.name"])
373
+ .select("file", "label").to_pytorch(
374
+ transform=processor.image_processor,
375
+ tokenizer=processor.tokenizer,
376
+ )
377
+ )
378
+ loader = DataLoader(chain, batch_size=1)
379
+
380
+
381
+ Tutorials
382
+ ---------
383
+
384
+ * `Getting Started`_
385
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
386
+
387
+ Contributions
388
+ -------------
389
+
390
+ Contributions are very welcome.
391
+ To learn more, see the `Contributor Guide`_.
392
+
393
+
394
+ Community and Support
395
+ ---------------------
396
+
397
+ * `Docs <https://datachain.dvc.ai/>`_
398
+ * `File an issue`_ if you encounter any problems
399
+ * `Discord Chat <https://dvc.org/chat>`_
400
+ * `Email <mailto:support@dvc.org>`_
401
+ * `Twitter <https://twitter.com/DVCorg>`_
402
+
403
+
404
+ .. _PyPI: https://pypi.org/
405
+ .. _file an issue: https://github.com/iterative/datachain/issues
406
+ .. github-only
407
+ .. _Contributor Guide: CONTRIBUTING.rst
408
+ .. _Pydantic: https://github.com/pydantic/pydantic
409
+ .. _SQLite: https://www.sqlite.org/
410
+ .. _Getting Started: https://datachain.dvc.ai/
411
+ .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
412
+ :alt: DataChain FlowChart