datachain 0.2.11__tar.gz → 0.2.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (305) hide show
  1. {datachain-0.2.11 → datachain-0.2.13}/.github/workflows/tests.yml +9 -24
  2. {datachain-0.2.11 → datachain-0.2.13}/.pre-commit-config.yaml +2 -0
  3. {datachain-0.2.11 → datachain-0.2.13}/CONTRIBUTING.rst +3 -3
  4. datachain-0.2.13/PKG-INFO +411 -0
  5. datachain-0.2.13/README.rst +326 -0
  6. datachain-0.2.13/docs/assets/captioned_cartoons.png +0 -0
  7. datachain-0.2.13/docs/assets/flowchart.png +0 -0
  8. datachain-0.2.13/docs/index.md +304 -0
  9. datachain-0.2.13/docs/references/datachain.md +18 -0
  10. datachain-0.2.13/docs/references/datatype.md +19 -0
  11. datachain-0.2.13/docs/references/file.md +22 -0
  12. datachain-0.2.13/docs/references/index.md +8 -0
  13. datachain-0.2.13/docs/references/sql.md +18 -0
  14. datachain-0.2.13/docs/references/torch.md +17 -0
  15. datachain-0.2.13/docs/references/udf.md +18 -0
  16. datachain-0.2.13/examples/computer_vision/blip2_image_desc_lib.py +102 -0
  17. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/.gitignore +1 -0
  18. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +775 -1217
  19. datachain-0.2.13/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +4083 -0
  20. datachain-0.2.13/examples/computer_vision/fashion_product_images/3-train-model.ipynb +1080 -0
  21. datachain-0.2.13/examples/computer_vision/fashion_product_images/4-inference.ipynb +754 -0
  22. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/README.md +1 -1
  23. datachain-0.2.13/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +44 -0
  24. datachain-0.2.13/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +49 -0
  25. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +10 -18
  26. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +5 -7
  27. datachain-0.2.13/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +52 -0
  28. datachain-0.2.13/examples/computer_vision/fashion_product_images/src/train.py +143 -0
  29. datachain-0.2.13/examples/computer_vision/iptc_exif_xmp_lib.py +75 -0
  30. datachain-0.2.13/examples/computer_vision/llava2_image_desc_lib.py +82 -0
  31. datachain-0.2.13/examples/computer_vision/openimage-detect.py +63 -0
  32. datachain-0.2.13/examples/get_started/common_sql_functions.py +93 -0
  33. {datachain-0.2.11/examples → datachain-0.2.13/examples/get_started}/json-csv-reader.py +14 -31
  34. {datachain-0.2.11/examples → datachain-0.2.13/examples/get_started}/torch-loader.py +9 -5
  35. datachain-0.2.13/examples/get_started/udfs/parallel.py +39 -0
  36. datachain-0.2.13/examples/get_started/udfs/simple.py +19 -0
  37. datachain-0.2.13/examples/get_started/udfs/stateful.py +43 -0
  38. datachain-0.2.13/examples/llm/llm_chatbot_evaluation.ipynb +772 -0
  39. {datachain-0.2.11/examples → datachain-0.2.13/examples/llm_and_nlp}/llm-claude-aggregate-query.py +2 -1
  40. {datachain-0.2.11/examples → datachain-0.2.13/examples/llm_and_nlp}/llm-claude-simple-query.py +9 -1
  41. {datachain-0.2.11/examples → datachain-0.2.13/examples/llm_and_nlp}/llm-claude.py +1 -1
  42. datachain-0.2.13/examples/llm_and_nlp/unstructured-text.py +63 -0
  43. {datachain-0.2.11/examples → datachain-0.2.13/examples/multimodal}/clip.py +6 -6
  44. {datachain-0.2.11 → datachain-0.2.13}/examples/multimodal/clip_fine_tuning.ipynb +532 -277
  45. datachain-0.2.13/examples/multimodal/hf_pipeline.py +124 -0
  46. datachain-0.2.13/examples/multimodal/openai_image_desc_lib.py +95 -0
  47. {datachain-0.2.11/examples → datachain-0.2.13/examples/multimodal}/wds.py +6 -6
  48. datachain-0.2.13/examples/multimodal/wds_filtered.py +38 -0
  49. {datachain-0.2.11 → datachain-0.2.13}/mkdocs.yml +10 -5
  50. {datachain-0.2.11 → datachain-0.2.13}/pyproject.toml +14 -5
  51. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/__init__.py +3 -4
  52. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/cache.py +10 -4
  53. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/catalog/catalog.py +42 -16
  54. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/cli.py +48 -32
  55. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/data_storage/metastore.py +24 -0
  56. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/data_storage/warehouse.py +3 -1
  57. datachain-0.2.13/src/datachain/job.py +56 -0
  58. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/arrow.py +19 -7
  59. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/clip.py +89 -66
  60. datachain-0.2.11/src/datachain/lib/convert/type_converter.py → datachain-0.2.13/src/datachain/lib/convert/python_to_sql.py +6 -6
  61. datachain-0.2.13/src/datachain/lib/convert/sql_to_python.py +23 -0
  62. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/convert/values_to_tuples.py +51 -33
  63. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/data_model.py +6 -27
  64. datachain-0.2.13/src/datachain/lib/dataset_info.py +70 -0
  65. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/dc.py +618 -156
  66. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/file.py +130 -22
  67. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/image.py +1 -1
  68. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/meta_formats.py +14 -2
  69. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/model_store.py +3 -2
  70. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/pytorch.py +10 -7
  71. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/signal_schema.py +19 -11
  72. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/text.py +2 -1
  73. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/udf.py +56 -5
  74. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/udf_signature.py +1 -1
  75. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/node.py +11 -8
  76. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/query/dataset.py +62 -28
  77. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/query/schema.py +2 -0
  78. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/query/session.py +4 -4
  79. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/functions/array.py +12 -0
  80. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/functions/string.py +8 -0
  81. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/torch/__init__.py +1 -1
  82. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/utils.py +6 -0
  83. datachain-0.2.13/src/datachain.egg-info/PKG-INFO +411 -0
  84. {datachain-0.2.11 → datachain-0.2.13}/src/datachain.egg-info/SOURCES.txt +36 -47
  85. {datachain-0.2.11 → datachain-0.2.13}/src/datachain.egg-info/requires.txt +2 -1
  86. {datachain-0.2.11 → datachain-0.2.13}/tests/conftest.py +42 -26
  87. {datachain-0.2.11 → datachain-0.2.13}/tests/examples/test_wds_e2e.py +5 -5
  88. {datachain-0.2.11 → datachain-0.2.13}/tests/func/test_catalog.py +40 -1
  89. datachain-0.2.13/tests/func/test_datachain.py +275 -0
  90. {datachain-0.2.11 → datachain-0.2.13}/tests/func/test_dataset_query.py +185 -110
  91. {datachain-0.2.11 → datachain-0.2.13}/tests/func/test_query.py +2 -2
  92. {datachain-0.2.11 → datachain-0.2.13}/tests/scripts/name_len_normal.py +1 -1
  93. {datachain-0.2.11 → datachain-0.2.13}/tests/test_query_e2e.py +2 -2
  94. datachain-0.2.13/tests/unit/lib/conftest.py +72 -0
  95. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_arrow.py +17 -0
  96. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_clip.py +2 -4
  97. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_datachain.py +158 -40
  98. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_datachain_bootstrap.py +5 -5
  99. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_datachain_merge.py +14 -8
  100. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_feature.py +1 -1
  101. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_feature_utils.py +2 -2
  102. datachain-0.2.13/tests/unit/lib/test_file.py +331 -0
  103. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_image.py +4 -5
  104. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_signal_schema.py +23 -4
  105. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_text.py +6 -8
  106. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_utils.py +4 -4
  107. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_catalog.py +13 -13
  108. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_module_exports.py +0 -4
  109. {datachain-0.2.11 → datachain-0.2.13}/tests/utils.py +6 -0
  110. datachain-0.2.11/.reuse/dep5 +0 -8
  111. datachain-0.2.11/LICENSES/Apache-2.0.txt +0 -73
  112. datachain-0.2.11/LICENSES/BSD-3-Clause.txt +0 -11
  113. datachain-0.2.11/LICENSES/Python-2.0.txt +0 -72
  114. datachain-0.2.11/PKG-INFO +0 -431
  115. datachain-0.2.11/README.rst +0 -347
  116. datachain-0.2.11/docs/index.md +0 -3
  117. datachain-0.2.11/docs/references/catalog.md +0 -3
  118. datachain-0.2.11/docs/references/datachain.md +0 -3
  119. datachain-0.2.11/examples/blip2_image_desc_lib.py +0 -35
  120. datachain-0.2.11/examples/common_sql_functions.py +0 -78
  121. datachain-0.2.11/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -3589
  122. datachain-0.2.11/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -91
  123. datachain-0.2.11/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -51
  124. datachain-0.2.11/examples/hf_pipeline.py +0 -98
  125. datachain-0.2.11/examples/iptc_exif_xmp_lib.py +0 -15
  126. datachain-0.2.11/examples/llava2_image_desc_lib.py +0 -43
  127. datachain-0.2.11/examples/loader.py +0 -31
  128. datachain-0.2.11/examples/neurips/README +0 -18
  129. datachain-0.2.11/examples/neurips/distance_to_query.py +0 -29
  130. datachain-0.2.11/examples/neurips/llm_chat.py +0 -46
  131. datachain-0.2.11/examples/neurips/requirements.txt +0 -9
  132. datachain-0.2.11/examples/neurips/single_query.py +0 -119
  133. datachain-0.2.11/examples/neurips/text_loaders.py +0 -80
  134. datachain-0.2.11/examples/openai_image_desc_lib.py +0 -29
  135. datachain-0.2.11/examples/openimage-detect.py +0 -72
  136. datachain-0.2.11/examples/pose_detection.py +0 -219
  137. datachain-0.2.11/examples/udfs/batching.py +0 -34
  138. datachain-0.2.11/examples/udfs/image_transformation.py +0 -45
  139. datachain-0.2.11/examples/udfs/parallel.py +0 -55
  140. datachain-0.2.11/examples/udfs/simple.py +0 -42
  141. datachain-0.2.11/examples/udfs/stateful.py +0 -44
  142. datachain-0.2.11/examples/udfs/stateful_similarity.py +0 -79
  143. datachain-0.2.11/examples/unstructured-text.py +0 -54
  144. datachain-0.2.11/examples/wds_filtered.py +0 -55
  145. datachain-0.2.11/examples/zalando/zalando_clip.py +0 -44
  146. datachain-0.2.11/examples/zalando/zalando_dir_as_class.py +0 -31
  147. datachain-0.2.11/examples/zalando/zalando_splits_and_classes_ds.py +0 -9
  148. datachain-0.2.11/examples/zalando/zalando_splits_and_classes_output.py +0 -17
  149. datachain-0.2.11/src/datachain/lib/gpt4_vision.py +0 -97
  150. datachain-0.2.11/src/datachain/lib/hf_image_to_text.py +0 -97
  151. datachain-0.2.11/src/datachain/lib/hf_pipeline.py +0 -90
  152. datachain-0.2.11/src/datachain/lib/image_transform.py +0 -103
  153. datachain-0.2.11/src/datachain/lib/iptc_exif_xmp.py +0 -76
  154. datachain-0.2.11/src/datachain/lib/unstructured.py +0 -41
  155. datachain-0.2.11/src/datachain/text/__init__.py +0 -3
  156. datachain-0.2.11/src/datachain.egg-info/PKG-INFO +0 -431
  157. datachain-0.2.11/tests/func/test_datachain.py +0 -58
  158. datachain-0.2.11/tests/unit/lib/conftest.py +0 -21
  159. datachain-0.2.11/tests/unit/lib/test_file.py +0 -162
  160. {datachain-0.2.11 → datachain-0.2.13}/.cruft.json +0 -0
  161. {datachain-0.2.11 → datachain-0.2.13}/.gitattributes +0 -0
  162. {datachain-0.2.11 → datachain-0.2.13}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  163. {datachain-0.2.11 → datachain-0.2.13}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  164. {datachain-0.2.11 → datachain-0.2.13}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  165. {datachain-0.2.11 → datachain-0.2.13}/.github/codecov.yaml +0 -0
  166. {datachain-0.2.11 → datachain-0.2.13}/.github/dependabot.yml +0 -0
  167. {datachain-0.2.11 → datachain-0.2.13}/.github/workflows/benchmarks.yml +0 -0
  168. {datachain-0.2.11 → datachain-0.2.13}/.github/workflows/release.yml +0 -0
  169. {datachain-0.2.11 → datachain-0.2.13}/.github/workflows/update-template.yaml +0 -0
  170. {datachain-0.2.11 → datachain-0.2.13}/.gitignore +0 -0
  171. {datachain-0.2.11 → datachain-0.2.13}/CODE_OF_CONDUCT.rst +0 -0
  172. {datachain-0.2.11 → datachain-0.2.13}/LICENSE +0 -0
  173. {datachain-0.2.11 → datachain-0.2.13}/docs/assets/datachain.png +0 -0
  174. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
  175. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
  176. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  177. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  178. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  179. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  180. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  181. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  182. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  183. {datachain-0.2.11 → datachain-0.2.13}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  184. {datachain-0.2.11 → datachain-0.2.13}/noxfile.py +0 -0
  185. {datachain-0.2.11 → datachain-0.2.13}/setup.cfg +0 -0
  186. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/__main__.py +0 -0
  187. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/asyn.py +0 -0
  188. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/catalog/__init__.py +0 -0
  189. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/catalog/datasource.py +0 -0
  190. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/catalog/loader.py +0 -0
  191. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/catalog/subclass.py +0 -0
  192. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/cli_utils.py +0 -0
  193. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/client/__init__.py +0 -0
  194. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/client/azure.py +0 -0
  195. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/client/fileslice.py +0 -0
  196. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/client/fsspec.py +0 -0
  197. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/client/gcs.py +0 -0
  198. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/client/local.py +0 -0
  199. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/client/s3.py +0 -0
  200. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/config.py +0 -0
  201. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/data_storage/__init__.py +0 -0
  202. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/data_storage/db_engine.py +0 -0
  203. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/data_storage/id_generator.py +0 -0
  204. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/data_storage/job.py +0 -0
  205. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/data_storage/schema.py +0 -0
  206. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/data_storage/serializer.py +0 -0
  207. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/data_storage/sqlite.py +0 -0
  208. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/dataset.py +0 -0
  209. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/error.py +0 -0
  210. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/__init__.py +0 -0
  211. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/convert/__init__.py +0 -0
  212. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/convert/flatten.py +0 -0
  213. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/convert/unflatten.py +0 -0
  214. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/settings.py +0 -0
  215. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/utils.py +0 -0
  216. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/vfile.py +0 -0
  217. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/webdataset.py +0 -0
  218. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/lib/webdataset_laion.py +0 -0
  219. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/listing.py +0 -0
  220. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/nodes_fetcher.py +0 -0
  221. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/nodes_thread_pool.py +0 -0
  222. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/progress.py +0 -0
  223. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/py.typed +0 -0
  224. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/query/__init__.py +0 -0
  225. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/query/batch.py +0 -0
  226. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/query/builtins.py +0 -0
  227. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/query/dispatch.py +0 -0
  228. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/query/metrics.py +0 -0
  229. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/query/params.py +0 -0
  230. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/query/udf.py +0 -0
  231. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/remote/__init__.py +0 -0
  232. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/remote/studio.py +0 -0
  233. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/__init__.py +0 -0
  234. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/default/__init__.py +0 -0
  235. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/default/base.py +0 -0
  236. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/functions/__init__.py +0 -0
  237. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/functions/conditional.py +0 -0
  238. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/functions/path.py +0 -0
  239. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/functions/random.py +0 -0
  240. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/selectable.py +0 -0
  241. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/sqlite/__init__.py +0 -0
  242. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/sqlite/base.py +0 -0
  243. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/sqlite/types.py +0 -0
  244. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/sqlite/vector.py +0 -0
  245. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/types.py +0 -0
  246. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/sql/utils.py +0 -0
  247. {datachain-0.2.11 → datachain-0.2.13}/src/datachain/storage.py +0 -0
  248. {datachain-0.2.11 → datachain-0.2.13}/src/datachain.egg-info/dependency_links.txt +0 -0
  249. {datachain-0.2.11 → datachain-0.2.13}/src/datachain.egg-info/entry_points.txt +0 -0
  250. {datachain-0.2.11 → datachain-0.2.13}/src/datachain.egg-info/top_level.txt +0 -0
  251. {datachain-0.2.11 → datachain-0.2.13}/tests/__init__.py +0 -0
  252. {datachain-0.2.11 → datachain-0.2.13}/tests/benchmarks/__init__.py +0 -0
  253. {datachain-0.2.11 → datachain-0.2.13}/tests/benchmarks/conftest.py +0 -0
  254. {datachain-0.2.11 → datachain-0.2.13}/tests/benchmarks/test_ls.py +0 -0
  255. {datachain-0.2.11 → datachain-0.2.13}/tests/benchmarks/test_version.py +0 -0
  256. {datachain-0.2.11 → datachain-0.2.13}/tests/data.py +0 -0
  257. {datachain-0.2.11 → datachain-0.2.13}/tests/examples/__init__.py +0 -0
  258. {datachain-0.2.11 → datachain-0.2.13}/tests/examples/wds_data.py +0 -0
  259. {datachain-0.2.11 → datachain-0.2.13}/tests/func/__init__.py +0 -0
  260. {datachain-0.2.11 → datachain-0.2.13}/tests/func/test_client.py +0 -0
  261. {datachain-0.2.11 → datachain-0.2.13}/tests/func/test_datasets.py +0 -0
  262. {datachain-0.2.11 → datachain-0.2.13}/tests/func/test_feature_pickling.py +0 -0
  263. {datachain-0.2.11 → datachain-0.2.13}/tests/func/test_ls.py +0 -0
  264. {datachain-0.2.11 → datachain-0.2.13}/tests/func/test_pull.py +0 -0
  265. {datachain-0.2.11 → datachain-0.2.13}/tests/func/test_pytorch.py +0 -0
  266. {datachain-0.2.11 → datachain-0.2.13}/tests/scripts/feature_class.py +0 -0
  267. {datachain-0.2.11 → datachain-0.2.13}/tests/scripts/feature_class_parallel.py +0 -0
  268. {datachain-0.2.11 → datachain-0.2.13}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  269. {datachain-0.2.11 → datachain-0.2.13}/tests/scripts/name_len_slow.py +0 -0
  270. {datachain-0.2.11 → datachain-0.2.13}/tests/test_cli_e2e.py +0 -0
  271. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/__init__.py +0 -0
  272. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/__init__.py +0 -0
  273. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_udf_signature.py +0 -0
  274. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/lib/test_webdataset.py +0 -0
  275. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/sql/__init__.py +0 -0
  276. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/sql/sqlite/__init__.py +0 -0
  277. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/sql/sqlite/test_utils.py +0 -0
  278. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/sql/test_array.py +0 -0
  279. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/sql/test_conditional.py +0 -0
  280. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/sql/test_path.py +0 -0
  281. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/sql/test_random.py +0 -0
  282. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/sql/test_selectable.py +0 -0
  283. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/sql/test_string.py +0 -0
  284. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_asyn.py +0 -0
  285. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_cache.py +0 -0
  286. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_catalog_loader.py +0 -0
  287. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_cli_parsing.py +0 -0
  288. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_client.py +0 -0
  289. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_client_s3.py +0 -0
  290. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_data_storage.py +0 -0
  291. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_database_engine.py +0 -0
  292. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_dataset.py +0 -0
  293. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_dispatch.py +0 -0
  294. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_fileslice.py +0 -0
  295. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_id_generator.py +0 -0
  296. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_listing.py +0 -0
  297. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_metastore.py +0 -0
  298. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_query_metrics.py +0 -0
  299. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_query_params.py +0 -0
  300. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_serializer.py +0 -0
  301. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_session.py +0 -0
  302. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_storage.py +0 -0
  303. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_udf.py +0 -0
  304. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_utils.py +0 -0
  305. {datachain-0.2.11 → datachain-0.2.13}/tests/unit/test_warehouse.py +0 -0
@@ -69,26 +69,6 @@ jobs:
69
69
  pyv: '3.12'
70
70
 
71
71
  steps:
72
-
73
- # https://github.com/iterative/pytest-servers/pull/122
74
- # https://github.com/abiosoft/colima/issues/468
75
- # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
76
- # colima v0.5.6 seems to run more stable than the latest - that has occasional network failures (ports are not open)
77
- # see: https://github.com/abiosoft/colima/issues/962
78
- - name: Use colima as default docker host on MacOS
79
- if: runner.os == 'macOS'
80
- run: |
81
- brew install docker lima || true # avoid non-zero exit code if brew link fails
82
- sudo curl -L -o /usr/local/bin/colima https://github.com/abiosoft/colima/releases/download/v0.5.6/colima-Darwin-x86_64
83
- sudo chmod +x /usr/local/bin/colima
84
- colima start
85
- sudo ln -vsf "${HOME}"/.colima/default/docker.sock /var/run/docker.sock
86
- env:
87
- HOMEBREW_NO_AUTO_UPDATE: true
88
- HOMEBREW_NO_INSTALL_CLEANUP: true
89
- HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK: true
90
- HOMEBREW_NO_INSTALL_UPGRADE: true
91
-
92
72
  - name: Check out the repository
93
73
  uses: actions/checkout@v4
94
74
  with:
@@ -106,12 +86,17 @@ jobs:
106
86
  nox --version
107
87
  uv --version
108
88
 
109
- - name: Skip flaky azure, gs remotes if unavailable on macos
89
+ - name: Skip flaky azure, gs remotes on macOS
110
90
  if: runner.os == 'macOS'
111
- run: echo 'DATACHAIN_TEST_SKIP_MISSING_REMOTES=azure,gs' >> "$GITHUB_ENV"
91
+ run: echo 'DISABLE_REMOTES_ARG=--disable-remotes=azure,gs' >> "$GITHUB_ENV"
92
+
93
+ - name: Skip all remotes on Windows
94
+ if: runner.os == 'Windows'
95
+ run: echo 'DISABLE_REMOTES_ARG=--disable-remotes=azure,gs' >> $env:GITHUB_ENV
112
96
 
113
97
  - name: Run tests
114
- run: nox -s tests-${{ matrix.pyv }}
98
+ run: nox -s tests-${{ matrix.pyv }} -- $DISABLE_REMOTES_ARG
99
+ shell: bash
115
100
 
116
101
  - name: Upload coverage report
117
102
  uses: codecov/codecov-action@v4
@@ -191,7 +176,7 @@ jobs:
191
176
  # Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
192
177
  run: >
193
178
  pytest
194
- --config-file=pyproject.toml -rsx
179
+ --config-file=pyproject.toml -rs
195
180
  --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
196
181
  tests ../datachain/tests
197
182
  working-directory: backend/datachain_server
@@ -1,5 +1,7 @@
1
1
  default_language_version:
2
2
  python: python3
3
+ ci:
4
+ skip: [mypy]
3
5
  repos:
4
6
  - repo: https://github.com/pre-commit/pre-commit-hooks
5
7
  rev: v4.6.0
@@ -13,9 +13,9 @@ Here is a list of important resources for contributors:
13
13
  - `Code of Conduct`_
14
14
 
15
15
  .. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0
16
- .. _Source Code: https://github.com/iterative/dvcx
16
+ .. _Source Code: https://github.com/iterative/datachain
17
17
  .. _Documentation: https://docs.dvc.ai/datachain
18
- .. _Issue Tracker: https://github.com/iterative/dvcx/issues
18
+ .. _Issue Tracker: https://github.com/iterative/datachain/issues
19
19
 
20
20
  How to report a bug
21
21
  -------------------
@@ -124,6 +124,6 @@ To run linting and code formatting checks, you can invoke a `lint` session in no
124
124
  It is recommended to open an issue before starting work on anything.
125
125
  This will allow a chance to talk it over with the owners and validate your approach.
126
126
 
127
- .. _pull request: https://github.com/iterative/dvcx/pulls
127
+ .. _pull request: https://github.com/iterative/datachain/pulls
128
128
  .. github-only
129
129
  .. _Code of Conduct: CODE_OF_CONDUCT.rst
@@ -0,0 +1,411 @@
1
+ Metadata-Version: 2.1
2
+ Name: datachain
3
+ Version: 0.2.13
4
+ Summary: Wrangle unstructured AI data at scale
5
+ Author-email: Dmitry Petrov <support@dvc.org>
6
+ License: Apache-2.0
7
+ Project-URL: Documentation, https://datachain.dvc.ai
8
+ Project-URL: Issues, https://github.com/iterative/datachain/issues
9
+ Project-URL: Source, https://github.com/iterative/datachain
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Development Status :: 2 - Pre-Alpha
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/x-rst
18
+ License-File: LICENSE
19
+ Requires-Dist: pyyaml
20
+ Requires-Dist: tomlkit
21
+ Requires-Dist: tqdm
22
+ Requires-Dist: numpy
23
+ Requires-Dist: numpy<2,>=1; sys_platform == "win32"
24
+ Requires-Dist: pandas>=2.0.0
25
+ Requires-Dist: pyarrow
26
+ Requires-Dist: typing-extensions
27
+ Requires-Dist: python-dateutil>=2
28
+ Requires-Dist: attrs>=21.3.0
29
+ Requires-Dist: s3fs>=2024.2.0
30
+ Requires-Dist: gcsfs>=2024.2.0
31
+ Requires-Dist: adlfs>=2024.2.0
32
+ Requires-Dist: dvc-data<4,>=3.10
33
+ Requires-Dist: dvc-objects<6,>=4
34
+ Requires-Dist: shtab<2,>=1.3.4
35
+ Requires-Dist: sqlalchemy>=2
36
+ Requires-Dist: multiprocess==0.70.16
37
+ Requires-Dist: dill==0.3.8
38
+ Requires-Dist: cloudpickle
39
+ Requires-Dist: ujson>=5.9.0
40
+ Requires-Dist: pydantic<3,>=2
41
+ Requires-Dist: jmespath>=1.0
42
+ Requires-Dist: datamodel-code-generator>=0.25
43
+ Requires-Dist: Pillow<11,>=10.0.0
44
+ Provides-Extra: docs
45
+ Requires-Dist: mkdocs>=1.5.2; extra == "docs"
46
+ Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
47
+ Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
48
+ Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
49
+ Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
50
+ Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
51
+ Provides-Extra: torch
52
+ Requires-Dist: torch>=2.1.0; extra == "torch"
53
+ Requires-Dist: torchvision; extra == "torch"
54
+ Requires-Dist: transformers>=4.36.0; extra == "torch"
55
+ Provides-Extra: remote
56
+ Requires-Dist: lz4; extra == "remote"
57
+ Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
58
+ Requires-Dist: requests>=2.22.0; extra == "remote"
59
+ Provides-Extra: vector
60
+ Requires-Dist: usearch; extra == "vector"
61
+ Provides-Extra: tests
62
+ Requires-Dist: datachain[remote,torch,vector]; extra == "tests"
63
+ Requires-Dist: pytest<9,>=8; extra == "tests"
64
+ Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
65
+ Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
66
+ Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
67
+ Requires-Dist: pytest-servers[all]>=0.5.5; extra == "tests"
68
+ Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
69
+ Requires-Dist: pytest-asyncio>=0.23.2; extra == "tests"
70
+ Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
71
+ Requires-Dist: virtualenv; extra == "tests"
72
+ Requires-Dist: dulwich; extra == "tests"
73
+ Requires-Dist: hypothesis; extra == "tests"
74
+ Requires-Dist: open_clip_torch; extra == "tests"
75
+ Requires-Dist: aiotools>=1.7.0; extra == "tests"
76
+ Requires-Dist: requests-mock; extra == "tests"
77
+ Provides-Extra: dev
78
+ Requires-Dist: datachain[docs,tests]; extra == "dev"
79
+ Requires-Dist: mypy==1.10.1; extra == "dev"
80
+ Requires-Dist: types-python-dateutil; extra == "dev"
81
+ Requires-Dist: types-pytz; extra == "dev"
82
+ Requires-Dist: types-PyYAML; extra == "dev"
83
+ Requires-Dist: types-requests; extra == "dev"
84
+ Requires-Dist: types-ujson; extra == "dev"
85
+
86
+ |PyPI| |Python Version| |Codecov| |Tests|
87
+
88
+ .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
89
+ :target: https://pypi.org/project/datachain/
90
+ :alt: PyPI
91
+ .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
92
+ :target: https://pypi.org/project/datachain
93
+ :alt: Python Version
94
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
95
+ :target: https://codecov.io/gh/iterative/datachain
96
+ :alt: Codecov
97
+ .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
98
+ :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
99
+ :alt: Tests
100
+
101
+ AI 🔗 DataChain
102
+ ----------------
103
+
104
+ DataChain is an open-source Python library for processing and curating unstructured
105
+ data at scale.
106
+
107
+ 🤖 AI-Driven Data Curation: Use local ML models or LLM API calls to enrich your data.
108
+
109
+ 🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
110
+
111
+ 🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
112
+
113
+
114
+ DataChain supports parallel processing, parallel data
115
+ downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
116
+
117
+ The typical use cases include Computer Vision data curation, LLM analytics,
118
+ and validation of multimodal AI applications.
119
+
120
+
121
+ .. code:: console
122
+
123
+ $ pip install datachain
124
+
125
+ |Flowchart|
126
+
127
+ Quick Start
128
+ -----------
129
+
130
+ Data curation with a local model
131
+ =================================
132
+
133
+ We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
134
+ - 50 files total in this example.
135
+ These dialogs involve users chatting with a bot while looking for better wireless plans.
136
+ Our goal is to identify the successful dialogs.
137
+
138
+ The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
139
+
140
+ First, we'll show batch inference with a simple sentiment model using the `transformers` library:
141
+
142
+ .. code:: shell
143
+
144
+ pip install transformers
145
+
146
+ The code below downloads files from the cloud, and applies a user-defined function
147
+ to each one of them. All files with a positive sentiment
148
+ detected are then copied to the local directory.
149
+
150
+ .. code:: py
151
+
152
+ from transformers import pipeline
153
+ from datachain import DataChain, Column
154
+
155
+ classifier = pipeline("sentiment-analysis", device="cpu",
156
+ model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
157
+
158
+ def is_positive_dialogue_ending(file) -> bool:
159
+ dialogue_ending = file.read()[-512:]
160
+ return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
161
+
162
+ chain = (
163
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
164
+ object_name="file", type="text")
165
+ .settings(parallel=8, cache=True)
166
+ .map(is_positive=is_positive_dialogue_ending)
167
+ .save("file_response")
168
+ )
169
+
170
+ positive_chain = chain.filter(Column("is_positive") == True)
171
+ positive_chain.export_files("./output")
172
+
173
+ print(f"{positive_chain.count()} files were exported")
174
+
175
+
176
+
177
+ 13 files were exported
178
+
179
+ .. code:: shell
180
+
181
+ $ ls output/datachain-demo/chatbot-KiT/
182
+ 15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ...
183
+ $ ls output/datachain-demo/chatbot-KiT/ | wc -l
184
+ 13
185
+
186
+
187
+ LLM judging chatbots
188
+ =============================
189
+
190
+ LLMs can work as efficient universal classifiers. In the example below,
191
+ we employ a free API from Mistral to judge the chatbot performance. Please get a free
192
+ Mistral API key at https://console.mistral.ai
193
+
194
+ .. code:: shell
195
+
196
+ $ pip install mistralai
197
+ $ export MISTRAL_API_KEY=_your_key_
198
+
199
+ DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
200
+
201
+ .. code:: py
202
+
203
+ from mistralai.client import MistralClient
204
+ from mistralai.models.chat_completion import ChatMessage
205
+ from datachain import File, DataChain, Column
206
+
207
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
208
+
209
+ def eval_dialogue(file: File) -> bool:
210
+ client = MistralClient()
211
+ response = client.chat(
212
+ model="open-mixtral-8x22b",
213
+ messages=[ChatMessage(role="system", content=PROMPT),
214
+ ChatMessage(role="user", content=file.read())])
215
+ result = response.choices[0].message.content
216
+ return result.lower().startswith("success")
217
+
218
+ chain = (
219
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
220
+ .settings(parallel=4, cache=True)
221
+ .map(is_success=eval_dialogue)
222
+ .save("mistral_files")
223
+ )
224
+
225
+ successful_chain = chain.filter(Column("is_success") == True)
226
+ successful_chain.export_files("./output_mistral")
227
+
228
+ print(f"{successful_chain.count()} files were exported")
229
+
230
+
231
+ With the instruction above, the Mistral model considers 31 of the 50 files to contain successful dialogues:
232
+
233
+ .. code:: shell
234
+
235
+ $ ls output_mistral/datachain-demo/chatbot-KiT/
236
+ 1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
237
+ $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
238
+ 31
239
+
240
+
241
+
242
+ Serializing Python-objects
243
+ ==========================
244
+
245
+ LLM responses may contain valuable information for analytics – such as the number of tokens used, or the
246
+ model performance parameters.
247
+
248
+ Instead of extracting this information from the Mistral response data structure (class
249
+ `ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
250
+
251
+
252
+ .. code:: py
253
+
254
+ from mistralai.client import MistralClient
255
+ from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
256
+ from datachain import File, DataChain, Column
257
+
258
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
259
+
260
+ def eval_dialog(file: File) -> ChatCompletionResponse:
261
+ client = MistralClient()
262
+ return client.chat(
263
+ model="open-mixtral-8x22b",
264
+ messages=[ChatMessage(role="system", content=PROMPT),
265
+ ChatMessage(role="user", content=file.read())])
266
+
267
+ chain = (
268
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
269
+ .settings(parallel=4, cache=True)
270
+ .map(response=eval_dialog)
271
+ .map(status=lambda response: response.choices[0].message.content.lower()[:7])
272
+ .save("response")
273
+ )
274
+
275
+ chain.select("file.name", "status", "response.usage").show(5)
276
+
277
+ success_rate = chain.filter(Column("status") == "success").count() / chain.count()
278
+ print(f"{100*success_rate:.1f}% dialogs were successful")
279
+
280
+ Output:
281
+
282
+ .. code:: shell
283
+
284
+ file status response response response
285
+ name usage usage usage
286
+ prompt_tokens total_tokens completion_tokens
287
+ 0 1.txt success 547 548 1
288
+ 1 10.txt failure 3576 3578 2
289
+ 2 11.txt failure 626 628 2
290
+ 3 12.txt failure 1144 1182 38
291
+ 4 13.txt success 1100 1101 1
292
+
293
+ [Limited by 5 rows]
294
+ 64.0% dialogs were successful
295
+
296
+
297
+ Iterating over Python data structures
298
+ =============================================
299
+
300
+ In the previous examples, datasets were saved in the embedded database
301
+ (`SQLite`_ in folder `.datachain` of the working directory).
302
+ These datasets were automatically versioned, and can be accessed using
303
+ `DataChain.from_dataset("dataset_name")`.
304
+
305
+ Here is how to retrieve a saved dataset and iterate over the objects:
306
+
307
+ .. code:: py
308
+
309
+ chain = DataChain.from_dataset("response")
310
+
311
+ # Iterating one-by-one: support out-of-memory workflow
312
+ for file, response in chain.limit(5).collect("file", "response"):
313
+ # verify the collected Python objects
314
+ assert isinstance(response, ChatCompletionResponse)
315
+
316
+ status = response.choices[0].message.content[:7]
317
+ tokens = response.usage.total_tokens
318
+ print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}")
319
+
320
+ Output:
321
+
322
+ .. code:: shell
323
+
324
+ gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548
325
+ gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578
326
+ gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628
327
+ gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207
328
+ gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101
329
+
330
+
331
+ Vectorized analytics over Python objects
332
+ ========================================
333
+
334
+ Some operations can run inside the DB without deserialization.
335
+ For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
336
+
337
+ .. code:: py
338
+
339
+ chain = DataChain.from_dataset("response")
340
+
341
+ cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
342
+ + chain.sum("response.usage.completion_tokens")*0.000006
343
+ print(f"Spent ${cost:.2f} on {chain.count()} calls")
344
+
345
+ Output:
346
+
347
+ .. code:: shell
348
+
349
+ Spent $0.08 on 50 calls
350
+
351
+
352
+ PyTorch data loader
353
+ ===================
354
+
355
+ Chain results can be exported or passed directly to a PyTorch dataloader.
356
+ For example, if we are interested in passing an image and a label based on the file
357
+ name prefix, the following code will do it:
358
+
359
+ .. code:: py
360
+
361
+ from torch.utils.data import DataLoader
362
+ from transformers import CLIPProcessor
363
+
364
+ from datachain import C, DataChain
365
+
366
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
367
+
368
+ chain = (
369
+ DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
370
+ .map(label=lambda name: name.split(".")[0], params=["file.name"])
371
+ .select("file", "label").to_pytorch(
372
+ transform=processor.image_processor,
373
+ tokenizer=processor.tokenizer,
374
+ )
375
+ )
376
+ loader = DataLoader(chain, batch_size=1)
377
+
378
+
379
+ Tutorials
380
+ ---------
381
+
382
+ * `Getting Started`_
383
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
384
+
385
+ Contributions
386
+ -------------
387
+
388
+ Contributions are very welcome.
389
+ To learn more, see the `Contributor Guide`_.
390
+
391
+
392
+ Community and Support
393
+ ---------------------
394
+
395
+ * `Docs <https://datachain.dvc.ai/>`_
396
+ * `File an issue`_ if you encounter any problems
397
+ * `Discord Chat <https://dvc.org/chat>`_
398
+ * `Email <mailto:support@dvc.org>`_
399
+ * `Twitter <https://twitter.com/DVCorg>`_
400
+
401
+
402
+ .. _PyPI: https://pypi.org/
403
+ .. _file an issue: https://github.com/iterative/datachain/issues
404
+ .. github-only
405
+ .. _Contributor Guide: CONTRIBUTING.rst
406
+ .. _Pydantic: https://github.com/pydantic/pydantic
407
+ .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
408
+ .. _SQLite: https://www.sqlite.org/
409
+ .. _Getting Started: https://datachain.dvc.ai/
410
+ .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
411
+ :alt: DataChain FlowChart