datachain 0.3.1__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (267) hide show
  1. {datachain-0.3.1 → datachain-0.3.3}/.github/workflows/benchmarks.yml +10 -7
  2. datachain-0.3.3/.github/workflows/tests-studio.yml +103 -0
  3. {datachain-0.3.1 → datachain-0.3.3}/.github/workflows/tests.yml +13 -76
  4. {datachain-0.3.1 → datachain-0.3.3}/.pre-commit-config.yaml +1 -1
  5. {datachain-0.3.1/src/datachain.egg-info → datachain-0.3.3}/PKG-INFO +75 -87
  6. {datachain-0.3.1 → datachain-0.3.3}/README.rst +64 -85
  7. {datachain-0.3.1 → datachain-0.3.3}/docs/index.md +5 -6
  8. {datachain-0.3.1 → datachain-0.3.3}/examples/computer_vision/openimage-detect.py +1 -1
  9. {datachain-0.3.1 → datachain-0.3.3}/examples/get_started/common_sql_functions.py +15 -13
  10. {datachain-0.3.1 → datachain-0.3.3}/examples/get_started/torch-loader.py +3 -2
  11. {datachain-0.3.1 → datachain-0.3.3}/examples/llm_and_nlp/unstructured-text.py +15 -15
  12. {datachain-0.3.1 → datachain-0.3.3}/examples/multimodal/hf_pipeline.py +28 -19
  13. {datachain-0.3.1 → datachain-0.3.3}/examples/multimodal/wds.py +17 -6
  14. {datachain-0.3.1 → datachain-0.3.3}/examples/multimodal/wds_filtered.py +4 -2
  15. {datachain-0.3.1 → datachain-0.3.3}/noxfile.py +11 -0
  16. {datachain-0.3.1 → datachain-0.3.3}/pyproject.toml +18 -3
  17. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/asyn.py +20 -0
  18. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/catalog/catalog.py +12 -1
  19. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/catalog/loader.py +75 -50
  20. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/client/azure.py +13 -0
  21. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/client/gcs.py +12 -0
  22. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/client/local.py +11 -0
  23. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/client/s3.py +12 -0
  24. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/data_storage/schema.py +22 -8
  25. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/data_storage/sqlite.py +60 -14
  26. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/data_storage/warehouse.py +17 -3
  27. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/arrow.py +1 -1
  28. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/convert/values_to_tuples.py +14 -8
  29. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/data_model.py +1 -0
  30. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/dc.py +52 -19
  31. datachain-0.3.3/src/datachain/lib/listing.py +111 -0
  32. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/meta_formats.py +8 -2
  33. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/node.py +1 -1
  34. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/query/dataset.py +22 -12
  35. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/query/schema.py +4 -0
  36. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/query/session.py +9 -2
  37. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/default/base.py +3 -0
  38. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/sqlite/base.py +33 -4
  39. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/types.py +120 -11
  40. {datachain-0.3.1 → datachain-0.3.3/src/datachain.egg-info}/PKG-INFO +75 -87
  41. {datachain-0.3.1 → datachain-0.3.3}/src/datachain.egg-info/SOURCES.txt +10 -25
  42. {datachain-0.3.1 → datachain-0.3.3}/src/datachain.egg-info/requires.txt +11 -1
  43. {datachain-0.3.1 → datachain-0.3.3}/tests/benchmarks/conftest.py +6 -0
  44. datachain-0.3.3/tests/benchmarks/datasets/.dvc/.gitignore +3 -0
  45. datachain-0.3.3/tests/benchmarks/datasets/.dvc/config +4 -0
  46. datachain-0.3.3/tests/benchmarks/datasets/.gitignore +1 -0
  47. datachain-0.3.3/tests/benchmarks/datasets/laion-tiny.npz.dvc +5 -0
  48. datachain-0.3.3/tests/benchmarks/test_datachain.py +22 -0
  49. datachain-0.3.3/tests/examples/test_examples.py +96 -0
  50. {datachain-0.3.1 → datachain-0.3.3}/tests/func/test_datachain.py +20 -0
  51. {datachain-0.3.1 → datachain-0.3.3}/tests/func/test_dataset_query.py +17 -38
  52. datachain-0.3.3/tests/func/test_listing.py +34 -0
  53. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_datachain.py +252 -35
  54. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_datachain_merge.py +41 -7
  55. datachain-0.3.3/tests/unit/lib/test_schema.py +22 -0
  56. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/sql/test_path.py +2 -1
  57. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_asyn.py +29 -1
  58. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_catalog_loader.py +41 -0
  59. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_data_storage.py +50 -1
  60. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_database_engine.py +21 -1
  61. datachain-0.3.1/examples/computer_vision/fashion_product_images/.gitignore +0 -5
  62. datachain-0.3.1/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -2211
  63. datachain-0.3.1/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -4103
  64. datachain-0.3.1/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -1081
  65. datachain-0.3.1/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -754
  66. datachain-0.3.1/examples/computer_vision/fashion_product_images/README.md +0 -60
  67. datachain-0.3.1/examples/computer_vision/fashion_product_images/requirements.txt +0 -6
  68. datachain-0.3.1/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -47
  69. datachain-0.3.1/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -47
  70. datachain-0.3.1/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -36
  71. datachain-0.3.1/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -44
  72. datachain-0.3.1/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -52
  73. datachain-0.3.1/examples/computer_vision/fashion_product_images/src/clustering.py +0 -41
  74. datachain-0.3.1/examples/computer_vision/fashion_product_images/src/train.py +0 -143
  75. datachain-0.3.1/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  76. datachain-0.3.1/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  77. datachain-0.3.1/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  78. datachain-0.3.1/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  79. datachain-0.3.1/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  80. datachain-0.3.1/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  81. datachain-0.3.1/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  82. datachain-0.3.1/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  83. datachain-0.3.1/examples/get_started/json-metadata-tutorial.ipynb +0 -2020
  84. datachain-0.3.1/examples/llm/llm_chatbot_evaluation.ipynb +0 -683
  85. datachain-0.3.1/examples/multimodal/clip_fine_tuning.ipynb +0 -1948
  86. {datachain-0.3.1 → datachain-0.3.3}/.cruft.json +0 -0
  87. {datachain-0.3.1 → datachain-0.3.3}/.gitattributes +0 -0
  88. {datachain-0.3.1 → datachain-0.3.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  89. {datachain-0.3.1 → datachain-0.3.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  90. {datachain-0.3.1 → datachain-0.3.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  91. {datachain-0.3.1 → datachain-0.3.3}/.github/codecov.yaml +0 -0
  92. {datachain-0.3.1 → datachain-0.3.3}/.github/dependabot.yml +0 -0
  93. {datachain-0.3.1 → datachain-0.3.3}/.github/workflows/release.yml +0 -0
  94. {datachain-0.3.1 → datachain-0.3.3}/.github/workflows/update-template.yaml +0 -0
  95. {datachain-0.3.1 → datachain-0.3.3}/.gitignore +0 -0
  96. {datachain-0.3.1 → datachain-0.3.3}/CODE_OF_CONDUCT.rst +0 -0
  97. {datachain-0.3.1 → datachain-0.3.3}/CONTRIBUTING.rst +0 -0
  98. {datachain-0.3.1 → datachain-0.3.3}/LICENSE +0 -0
  99. {datachain-0.3.1 → datachain-0.3.3}/docs/assets/captioned_cartoons.png +0 -0
  100. {datachain-0.3.1 → datachain-0.3.3}/docs/assets/datachain.png +0 -0
  101. {datachain-0.3.1 → datachain-0.3.3}/docs/assets/flowchart.png +0 -0
  102. {datachain-0.3.1 → datachain-0.3.3}/docs/references/datachain.md +0 -0
  103. {datachain-0.3.1 → datachain-0.3.3}/docs/references/datatype.md +0 -0
  104. {datachain-0.3.1 → datachain-0.3.3}/docs/references/file.md +0 -0
  105. {datachain-0.3.1 → datachain-0.3.3}/docs/references/index.md +0 -0
  106. {datachain-0.3.1 → datachain-0.3.3}/docs/references/sql.md +0 -0
  107. {datachain-0.3.1 → datachain-0.3.3}/docs/references/torch.md +0 -0
  108. {datachain-0.3.1 → datachain-0.3.3}/docs/references/udf.md +0 -0
  109. {datachain-0.3.1 → datachain-0.3.3}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
  110. {datachain-0.3.1 → datachain-0.3.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  111. {datachain-0.3.1 → datachain-0.3.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  112. {datachain-0.3.1 → datachain-0.3.3}/examples/get_started/json-csv-reader.py +0 -0
  113. {datachain-0.3.1 → datachain-0.3.3}/examples/get_started/udfs/parallel.py +0 -0
  114. {datachain-0.3.1 → datachain-0.3.3}/examples/get_started/udfs/simple.py +0 -0
  115. {datachain-0.3.1 → datachain-0.3.3}/examples/get_started/udfs/stateful.py +0 -0
  116. {datachain-0.3.1 → datachain-0.3.3}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
  117. {datachain-0.3.1 → datachain-0.3.3}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
  118. {datachain-0.3.1 → datachain-0.3.3}/examples/llm_and_nlp/llm-claude.py +0 -0
  119. {datachain-0.3.1 → datachain-0.3.3}/examples/multimodal/clip_inference.py +0 -0
  120. {datachain-0.3.1 → datachain-0.3.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
  121. {datachain-0.3.1 → datachain-0.3.3}/mkdocs.yml +0 -0
  122. {datachain-0.3.1 → datachain-0.3.3}/setup.cfg +0 -0
  123. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/__init__.py +0 -0
  124. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/__main__.py +0 -0
  125. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/cache.py +0 -0
  126. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/catalog/__init__.py +0 -0
  127. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/catalog/datasource.py +0 -0
  128. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/catalog/subclass.py +0 -0
  129. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/cli.py +0 -0
  130. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/cli_utils.py +0 -0
  131. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/client/__init__.py +0 -0
  132. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/client/fileslice.py +0 -0
  133. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/client/fsspec.py +0 -0
  134. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/config.py +0 -0
  135. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/data_storage/__init__.py +0 -0
  136. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/data_storage/db_engine.py +0 -0
  137. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/data_storage/id_generator.py +0 -0
  138. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/data_storage/job.py +0 -0
  139. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/data_storage/metastore.py +0 -0
  140. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/data_storage/serializer.py +0 -0
  141. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/dataset.py +0 -0
  142. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/error.py +0 -0
  143. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/job.py +0 -0
  144. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/__init__.py +0 -0
  145. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/clip.py +0 -0
  146. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/convert/__init__.py +0 -0
  147. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/convert/flatten.py +0 -0
  148. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
  149. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
  150. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/convert/unflatten.py +0 -0
  151. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/dataset_info.py +0 -0
  152. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/file.py +0 -0
  153. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/image.py +0 -0
  154. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/model_store.py +0 -0
  155. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/pytorch.py +0 -0
  156. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/settings.py +0 -0
  157. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/signal_schema.py +0 -0
  158. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/text.py +0 -0
  159. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/udf.py +0 -0
  160. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/udf_signature.py +0 -0
  161. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/utils.py +0 -0
  162. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/vfile.py +0 -0
  163. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/webdataset.py +0 -0
  164. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/lib/webdataset_laion.py +0 -0
  165. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/listing.py +0 -0
  166. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/nodes_fetcher.py +0 -0
  167. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/nodes_thread_pool.py +0 -0
  168. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/progress.py +0 -0
  169. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/py.typed +0 -0
  170. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/query/__init__.py +0 -0
  171. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/query/batch.py +0 -0
  172. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/query/builtins.py +0 -0
  173. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/query/dispatch.py +0 -0
  174. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/query/metrics.py +0 -0
  175. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/query/params.py +0 -0
  176. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/query/queue.py +0 -0
  177. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/query/udf.py +0 -0
  178. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/remote/__init__.py +0 -0
  179. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/remote/studio.py +0 -0
  180. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/__init__.py +0 -0
  181. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/default/__init__.py +0 -0
  182. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/functions/__init__.py +0 -0
  183. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/functions/array.py +0 -0
  184. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/functions/conditional.py +0 -0
  185. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/functions/path.py +0 -0
  186. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/functions/random.py +0 -0
  187. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/functions/string.py +0 -0
  188. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/selectable.py +0 -0
  189. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/sqlite/__init__.py +0 -0
  190. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/sqlite/types.py +0 -0
  191. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/sqlite/vector.py +0 -0
  192. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/sql/utils.py +0 -0
  193. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/storage.py +0 -0
  194. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/torch/__init__.py +0 -0
  195. {datachain-0.3.1 → datachain-0.3.3}/src/datachain/utils.py +0 -0
  196. {datachain-0.3.1 → datachain-0.3.3}/src/datachain.egg-info/dependency_links.txt +0 -0
  197. {datachain-0.3.1 → datachain-0.3.3}/src/datachain.egg-info/entry_points.txt +0 -0
  198. {datachain-0.3.1 → datachain-0.3.3}/src/datachain.egg-info/top_level.txt +0 -0
  199. {datachain-0.3.1 → datachain-0.3.3}/tests/__init__.py +0 -0
  200. {datachain-0.3.1 → datachain-0.3.3}/tests/benchmarks/__init__.py +0 -0
  201. {datachain-0.3.1 → datachain-0.3.3}/tests/benchmarks/test_ls.py +0 -0
  202. {datachain-0.3.1 → datachain-0.3.3}/tests/benchmarks/test_version.py +0 -0
  203. {datachain-0.3.1 → datachain-0.3.3}/tests/conftest.py +0 -0
  204. {datachain-0.3.1 → datachain-0.3.3}/tests/data.py +0 -0
  205. {datachain-0.3.1 → datachain-0.3.3}/tests/examples/__init__.py +0 -0
  206. {datachain-0.3.1 → datachain-0.3.3}/tests/examples/test_wds_e2e.py +0 -0
  207. {datachain-0.3.1 → datachain-0.3.3}/tests/examples/wds_data.py +0 -0
  208. {datachain-0.3.1 → datachain-0.3.3}/tests/func/__init__.py +0 -0
  209. {datachain-0.3.1 → datachain-0.3.3}/tests/func/test_catalog.py +0 -0
  210. {datachain-0.3.1 → datachain-0.3.3}/tests/func/test_client.py +0 -0
  211. {datachain-0.3.1 → datachain-0.3.3}/tests/func/test_datasets.py +0 -0
  212. {datachain-0.3.1 → datachain-0.3.3}/tests/func/test_feature_pickling.py +0 -0
  213. {datachain-0.3.1 → datachain-0.3.3}/tests/func/test_ls.py +0 -0
  214. {datachain-0.3.1 → datachain-0.3.3}/tests/func/test_pull.py +0 -0
  215. {datachain-0.3.1 → datachain-0.3.3}/tests/func/test_pytorch.py +0 -0
  216. {datachain-0.3.1 → datachain-0.3.3}/tests/func/test_query.py +0 -0
  217. {datachain-0.3.1 → datachain-0.3.3}/tests/scripts/feature_class.py +0 -0
  218. {datachain-0.3.1 → datachain-0.3.3}/tests/scripts/feature_class_parallel.py +0 -0
  219. {datachain-0.3.1 → datachain-0.3.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  220. {datachain-0.3.1 → datachain-0.3.3}/tests/scripts/name_len_slow.py +0 -0
  221. {datachain-0.3.1 → datachain-0.3.3}/tests/test_cli_e2e.py +0 -0
  222. {datachain-0.3.1 → datachain-0.3.3}/tests/test_query_e2e.py +0 -0
  223. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/__init__.py +0 -0
  224. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/__init__.py +0 -0
  225. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/conftest.py +0 -0
  226. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_arrow.py +0 -0
  227. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_clip.py +0 -0
  228. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  229. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_feature.py +0 -0
  230. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_feature_utils.py +0 -0
  231. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_file.py +0 -0
  232. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_image.py +0 -0
  233. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_signal_schema.py +0 -0
  234. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_sql_to_python.py +0 -0
  235. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_text.py +0 -0
  236. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_udf_signature.py +0 -0
  237. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_utils.py +0 -0
  238. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/lib/test_webdataset.py +0 -0
  239. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/sql/__init__.py +0 -0
  240. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/sql/sqlite/__init__.py +0 -0
  241. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
  242. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/sql/test_array.py +0 -0
  243. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/sql/test_conditional.py +0 -0
  244. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/sql/test_random.py +0 -0
  245. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/sql/test_selectable.py +0 -0
  246. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/sql/test_string.py +0 -0
  247. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_cache.py +0 -0
  248. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_catalog.py +0 -0
  249. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_cli_parsing.py +0 -0
  250. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_client.py +0 -0
  251. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_client_s3.py +0 -0
  252. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_dataset.py +0 -0
  253. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_dispatch.py +0 -0
  254. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_fileslice.py +0 -0
  255. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_id_generator.py +0 -0
  256. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_listing.py +0 -0
  257. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_metastore.py +0 -0
  258. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_module_exports.py +0 -0
  259. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_query_metrics.py +0 -0
  260. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_query_params.py +0 -0
  261. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_serializer.py +0 -0
  262. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_session.py +0 -0
  263. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_storage.py +0 -0
  264. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_udf.py +0 -0
  265. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_utils.py +0 -0
  266. {datachain-0.3.1 → datachain-0.3.3}/tests/unit/test_warehouse.py +0 -0
  267. {datachain-0.3.1 → datachain-0.3.3}/tests/utils.py +0 -0
@@ -5,23 +5,24 @@ on:
5
5
  - cron: '0 0 * * *'
6
6
  pull_request:
7
7
  types: [opened, reopened, labeled, synchronize]
8
- workflow_dispatch: {}
8
+ workflow_dispatch:
9
9
 
10
10
  env:
11
11
  FORCE_COLOR: "1"
12
12
 
13
+ concurrency:
14
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
15
+ cancel-in-progress: true
16
+
13
17
  jobs:
14
- build:
15
- if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') }}
18
+ run:
16
19
  runs-on: ubuntu-latest
17
-
18
20
  steps:
19
21
  - uses: actions/checkout@v4
20
-
21
- - name: Set up Python 3.10
22
+ - name: Set up Python 3.12
22
23
  uses: actions/setup-python@v5
23
24
  with:
24
- python-version: '3.10'
25
+ python-version: '3.12'
25
26
  cache: 'pip'
26
27
 
27
28
  - name: Upgrade nox and uv
@@ -30,5 +31,7 @@ jobs:
30
31
  nox --version
31
32
  uv --version
32
33
 
34
+ - run: uv pip install dvc[gs] --system
35
+ - run: dvc --cd tests/benchmarks/datasets pull
33
36
  - name: Run benchmarks
34
37
  run: nox -s bench
@@ -0,0 +1,103 @@
1
+ name: Studio Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ workflow_dispatch:
8
+
9
+ env:
10
+ FORCE_COLOR: "1"
11
+ BRANCH: ${{ github.head_ref || github.ref_name }}
12
+
13
+ concurrency:
14
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
15
+ cancel-in-progress: true
16
+
17
+ jobs:
18
+ studio:
19
+ if: '!github.event.pull_request.head.repo.fork'
20
+ runs-on: ubuntu-latest-16-cores
21
+ strategy:
22
+ matrix:
23
+ pyv: ['3.12']
24
+ group: [1, 2, 3, 4, 5, 6]
25
+ services:
26
+ postgres:
27
+ image: postgres:16.3
28
+ ports:
29
+ - 5432:5432
30
+ env:
31
+ POSTGRES_USER: test
32
+ POSTGRES_DB: database
33
+ POSTGRES_HOST_AUTH_METHOD: trust
34
+ clickhouse:
35
+ image: clickhouse/clickhouse-server:24
36
+ ports:
37
+ - 8123:8123
38
+ - 9010:9000
39
+ env:
40
+ CLICKHOUSE_DB: studio_local_db
41
+ CLICKHOUSE_USER: studio_local
42
+ CLICKHOUSE_PASSWORD: ch123456789!
43
+ CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
44
+ redis:
45
+ image: redis:7.2.5
46
+ ports:
47
+ - 6379:6379
48
+ steps:
49
+ - name: Studio branch name
50
+ env:
51
+ BRANCH: ${{ env.BRANCH }}
52
+ STUDIO_READ_ACCESS_TOKEN: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
53
+ run: |
54
+ echo "DataChain branch: $BRANCH"
55
+ if [[ "$BRANCH" == "main" ]]
56
+ then
57
+ STUDIO_BRANCH=develop
58
+ elif git ls-remote --heads https://"$STUDIO_READ_ACCESS_TOKEN"@github.com/iterative/studio.git "$BRANCH" | grep -F "$BRANCH" 2>&1>/dev/null
59
+ then
60
+ STUDIO_BRANCH="$BRANCH"
61
+ else
62
+ STUDIO_BRANCH=develop
63
+ fi
64
+ echo "STUDIO_BRANCH=$STUDIO_BRANCH" >> $GITHUB_ENV
65
+ echo "Studio branch: $STUDIO_BRANCH"
66
+
67
+ - name: Check out Studio
68
+ uses: actions/checkout@v4
69
+ with:
70
+ fetch-depth: 0
71
+ repository: iterative/studio
72
+ ref: ${{ env.STUDIO_BRANCH }}
73
+ token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
74
+
75
+ - name: Check out repository
76
+ uses: actions/checkout@v4
77
+ with:
78
+ path: './backend/datachain'
79
+ fetch-depth: 0
80
+
81
+ - name: Set up Python ${{ matrix.pyv }}
82
+ uses: actions/setup-python@v5
83
+ with:
84
+ python-version: ${{ matrix.pyv }}
85
+ cache: 'pip'
86
+
87
+ - name: Install uv
88
+ run: |
89
+ python -m pip install --upgrade uv
90
+ uv --version
91
+
92
+ - name: Install dependencies
93
+ run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
94
+
95
+ - name: Run tests
96
+ # Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
97
+ run: >
98
+ pytest
99
+ --config-file=pyproject.toml -rs
100
+ --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
101
+ -m 'not benchmark'
102
+ tests ../datachain/tests
103
+ working-directory: backend/datachain_server
@@ -8,7 +8,6 @@ on:
8
8
 
9
9
  env:
10
10
  FORCE_COLOR: "1"
11
- BRANCH: ${{ github.head_ref || github.ref_name }}
12
11
 
13
12
  concurrency:
14
13
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -18,7 +17,6 @@ jobs:
18
17
  lint:
19
18
  runs-on: ubuntu-latest
20
19
  steps:
21
-
22
20
  - name: Check out the repository
23
21
  uses: actions/checkout@v4
24
22
  with:
@@ -112,70 +110,17 @@ jobs:
112
110
  - name: Build docs
113
111
  run: nox -s docs
114
112
 
115
-
116
- studio:
117
- if: '!github.event.pull_request.head.repo.fork'
118
- runs-on: ubuntu-latest-16-cores
113
+ examples:
114
+ runs-on: ${{ matrix.os }}
115
+ timeout-minutes: 60
119
116
  strategy:
117
+ fail-fast: false
120
118
  matrix:
121
- pyv: ['3.12']
122
- group: [1, 2, 3, 4, 5, 6]
123
- services:
124
- postgres:
125
- image: postgres:16.3
126
- ports:
127
- - 5432:5432
128
- env:
129
- POSTGRES_USER: test
130
- POSTGRES_DB: database
131
- POSTGRES_HOST_AUTH_METHOD: trust
132
- clickhouse:
133
- image: clickhouse/clickhouse-server:24
134
- ports:
135
- - 8123:8123
136
- - 9010:9000
137
- env:
138
- CLICKHOUSE_DB: studio_local_db
139
- CLICKHOUSE_USER: studio_local
140
- CLICKHOUSE_PASSWORD: ch123456789!
141
- CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
142
- redis:
143
- image: redis:7.2.5
144
- ports:
145
- - 6379:6379
119
+ os: [ubuntu-latest-16-cores, macos-latest, windows-latest-8-cores]
120
+ pyv: ['3.9', '3.12']
121
+ group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
146
122
  steps:
147
-
148
- - name: Studio branch name
149
- env:
150
- BRANCH: ${{ env.BRANCH }}
151
- STUDIO_READ_ACCESS_TOKEN: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
152
- run: |
153
- echo "DataChain branch: $BRANCH"
154
- if [[ "$BRANCH" == "main" ]]
155
- then
156
- STUDIO_BRANCH=develop
157
- elif git ls-remote --heads https://"$STUDIO_READ_ACCESS_TOKEN"@github.com/iterative/studio.git "$BRANCH" | grep -F "$BRANCH" 2>&1>/dev/null
158
- then
159
- STUDIO_BRANCH="$BRANCH"
160
- else
161
- STUDIO_BRANCH=develop
162
- fi
163
- echo "STUDIO_BRANCH=$STUDIO_BRANCH" >> $GITHUB_ENV
164
- echo "Studio branch: $STUDIO_BRANCH"
165
-
166
- - name: Check out Studio
167
- uses: actions/checkout@v4
168
- with:
169
- fetch-depth: 0
170
- repository: iterative/studio
171
- ref: ${{ env.STUDIO_BRANCH }}
172
- token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
173
-
174
- - name: Check out repository
175
- uses: actions/checkout@v4
176
- with:
177
- path: './backend/datachain'
178
- fetch-depth: 0
123
+ - uses: actions/checkout@v4
179
124
 
180
125
  - name: Set up Python ${{ matrix.pyv }}
181
126
  uses: actions/setup-python@v5
@@ -183,19 +128,11 @@ jobs:
183
128
  python-version: ${{ matrix.pyv }}
184
129
  cache: 'pip'
185
130
 
186
- - name: Install uv
131
+ - name: Upgrade nox and uv
187
132
  run: |
188
- python -m pip install --upgrade uv
133
+ python -m pip install --upgrade 'nox[uv]'
134
+ nox --version
189
135
  uv --version
190
136
 
191
- - name: Install dependencies
192
- run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
193
-
194
- - name: Run tests
195
- # Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
196
- run: >
197
- pytest
198
- --config-file=pyproject.toml -rs
199
- --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
200
- tests ../datachain/tests
201
- working-directory: backend/datachain_server
137
+ - name: Run examples
138
+ run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.5.6'
27
+ rev: 'v0.5.7'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.1
3
+ Version: 0.3.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -76,11 +76,20 @@ Requires-Dist: aiotools>=1.7.0; extra == "tests"
76
76
  Requires-Dist: requests-mock; extra == "tests"
77
77
  Provides-Extra: dev
78
78
  Requires-Dist: datachain[docs,tests]; extra == "dev"
79
- Requires-Dist: mypy==1.10.1; extra == "dev"
79
+ Requires-Dist: mypy==1.11.1; extra == "dev"
80
80
  Requires-Dist: types-python-dateutil; extra == "dev"
81
81
  Requires-Dist: types-pytz; extra == "dev"
82
82
  Requires-Dist: types-PyYAML; extra == "dev"
83
83
  Requires-Dist: types-requests; extra == "dev"
84
+ Provides-Extra: examples
85
+ Requires-Dist: datachain[tests]; extra == "examples"
86
+ Requires-Dist: numpy<2,>=1; extra == "examples"
87
+ Requires-Dist: defusedxml; extra == "examples"
88
+ Requires-Dist: accelerate; extra == "examples"
89
+ Requires-Dist: unstructured[pdf]; extra == "examples"
90
+ Requires-Dist: pdfplumber==0.11.3; extra == "examples"
91
+ Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
92
+ Requires-Dist: nltk==3.8.1; extra == "examples"
84
93
 
85
94
  |PyPI| |Python Version| |Codecov| |Tests|
86
95
 
@@ -100,102 +109,78 @@ Requires-Dist: types-requests; extra == "dev"
100
109
  AI 🔗 DataChain
101
110
  ----------------
102
111
 
103
- DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
104
- AI engineers build a metadata layer on top of unstructured files and analyze data using
105
- this layer.
112
+ DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
113
+ It is made to organize your unstructured data into datasets and wrangle it at scale on
114
+ your local machine.
106
115
 
107
- 📂 **Raw Files Processing**
108
- Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
109
- Local), version and update datasets.
116
+ Key Features
117
+ ============
110
118
 
111
- 🌟 **Metadata layer.**
112
- Build a metadata layer on top of files using structured sources like CSV, Parquet,
113
- and JSON files.
119
+ 📂 **Storage as a Source of Truth.**
120
+ - Process unstructured data without redundant copies: S3, GCP, Azure, and local
121
+ file systems.
122
+ - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
123
+ - Join files and metadata together into persistent, versioned, columnar datasets.
114
124
 
115
- **Metadata enrichment.**
116
- Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
125
+ 🐍 **Python-friendly data pipelines.**
126
+ - Operate on Python objects and object fields.
127
+ - Built-in parallelization and out-of-memory compute without the need for SQL or
128
+ Spark jobs.
117
129
 
118
- 🛠️ **Data Transformation.**
119
- Transform metadata using traditional methods like filtering, grouping, joining, and
120
- others.
130
+ 🧠 **Data Enrichment and Processing.**
131
+ - Generate metadata columns using local AI models and LLM APIs.
132
+ - Filter, join, and group by AI metadata. Vector similarity search.
133
+ - Pass datasets to PyTorch and TensorFlow, or export back into storage.
121
134
 
122
- 🐍 **User-friendly interface.**
123
- Operate efficiently with familiar Python objects and object fields, eliminating the
124
- need for SQL.
135
+ 🚀 **Efficiency.**
136
+ - Parallelization, out-of-memory workloads and data caching.
137
+ - Vectorized operations on Python object fields: sum, count, avg, etc.
138
+ - Vector search on embeddings.
125
139
 
126
140
 
141
+ Quick Start
142
+ -----------
143
+
127
144
  .. code:: console
128
145
 
129
146
  $ pip install datachain
130
147
 
131
148
 
132
- Data Structures
133
- ===============
134
-
135
- DataChain introduces expressive data structures tailored for AI-specific workload:
136
-
137
- - **Dataset:** Preserves the file-references and meta-information. Takes care of Python
138
- object serialization, dataset versioning and difference. Operations on dataset:
139
-
140
- - **Transformations:** traditional data-frame or SQL operations such as filtering,
141
- grouping, joining.
142
- - **Enrichments:** mapping, aggregating and generating using customer’s Python
143
- code. This is needed to work with ML inference and LLM calls.
144
-
145
- - **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
146
- mode - only when needed.
147
-
148
- DataChain name comes from these major data structures: dataset and chaining.
149
-
149
+ Selecting files using JSON metadata
150
+ ======================================
150
151
 
151
- What’s new in DataChain?
152
- ========================
152
+ A storage consists of images of cats and dogs (`dog.1048.jpg`, `cat.1009.jpg`),
153
+ annotated with ground truth and model inferences in the 'json-pairs' format,
154
+ where each image has a matching JSON file like `cat.1009.json`:
153
155
 
154
- The project combines multiple ideas from different areas in order to simplify AI
155
- use-cases and at the same time to fit it into traditional data infrastructure.
156
+ .. code:: json
156
157
 
157
- - **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
158
- native language for AI. It’s powered by `Pydantic`_ data models.
159
- - **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
160
- group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
161
- needed for distributed computations.
162
- - **Resuming data processing** (in development). Introduces idempotent operations,
163
- allowing data processing to resume from the last successful process file/record/batch
164
- if it fails due to issues like failed LLM calls, ML inference or file download.
158
+ {
159
+ "class": "cat", "id": "1009", "num_annotators": 8,
160
+ "inference": {"class": "dog", "confidence": 0.68}
161
+ }
165
162
 
166
- Additional relatively new ideas:
163
+ Example of downloading only high-confidence cat images using JSON metadata:
167
164
 
168
- - **Functional style data processing.** Using a functional/chaining approach to data
169
- processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
170
- - **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
171
- and implements data versioning, extending ideas from DVC (developed by the same team).
172
165
 
166
+ .. code:: py
173
167
 
174
- What DataChain is NOT?
175
- ======================
176
-
177
- - **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
178
- `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
179
- version.
180
- - **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
181
- it delegates heavy data transformations to underlying data warehouses and focuses on
182
- AI specific data enrichments and orchestrating all the pieces together.
183
-
168
+ from datachain import Column, DataChain
184
169
 
185
- Quick Start
186
- -----------
170
+ meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
171
+ images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")
187
172
 
188
- Data curation with a local model
189
- =================================
173
+ images_id = images.map(id=lambda file: file.path.split('.')[-2])
174
+ annotated = images_id.merge(meta, on="id", right_on="meta.id")
190
175
 
191
- We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
192
- - 50 files total in this example.
193
- These dialogs involve users chatting with a bot while looking for better wireless plans.
194
- Our goal is to identify the successful dialogs.
176
+ likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
177
+ & (Column("meta.inference.class_") == "cat"))
178
+ likely_cats.export_files("high-confidence-cats/", signal="file")
195
179
 
196
- The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
197
180
 
198
- First, we'll show batch inference with a simple sentiment model using the `transformers` library:
181
+ Data curation with a local AI model
182
+ ===================================
183
+ Batch inference with a simple sentiment model using the `transformers` library:
199
184
 
200
185
  .. code:: shell
201
186
 
@@ -246,30 +231,30 @@ LLM judging chatbots
246
231
  =============================
247
232
 
248
233
  LLMs can work as efficient universal classifiers. In the example below,
249
- we employ a free API from Mistral to judge the chatbot performance. Please get a free
234
+ we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
250
235
  Mistral API key at https://console.mistral.ai
251
236
 
237
+
252
238
  .. code:: shell
253
239
 
254
- $ pip install mistralai
240
+ $ pip install "mistralai>=1.0.0"
255
241
  $ export MISTRAL_API_KEY=_your_key_
256
242
 
257
243
  DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
258
244
 
259
245
  .. code:: py
260
246
 
261
- from mistralai.client import MistralClient
262
- from mistralai.models.chat_completion import ChatMessage
247
+ from mistralai import Mistral
263
248
  from datachain import File, DataChain, Column
264
249
 
265
250
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
266
251
 
267
252
  def eval_dialogue(file: File) -> bool:
268
- client = MistralClient()
269
- response = client.chat(
253
+ client = Mistral()
254
+ response = client.chat.complete(
270
255
  model="open-mixtral-8x22b",
271
- messages=[ChatMessage(role="system", content=PROMPT),
272
- ChatMessage(role="user", content=file.read())])
256
+ messages=[{"role": "system", "content": PROMPT},
257
+ {"role": "user", "content": file.read()}])
273
258
  result = response.choices[0].message.content
274
259
  return result.lower().startswith("success")
275
260
 
@@ -309,8 +294,8 @@ Instead of extracting this information from the Mistral response data structure
309
294
 
310
295
  .. code:: py
311
296
 
312
- from mistralai.client import MistralClient
313
- from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
297
+ from mistralai import Mistral
298
+ from mistralai.models import ChatCompletionResponse
314
299
  from datachain import File, DataChain, Column
315
300
 
316
301
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
@@ -319,8 +304,8 @@ Instead of extracting this information from the Mistral response data structure
319
304
  client = Mistral()
320
305
  return client.chat.complete(
321
306
  model="open-mixtral-8x22b",
322
- messages=[ChatMessage(role="system", content=PROMPT),
323
- ChatMessage(role="user", content=file.read())])
307
+ messages=[{"role": "system", "content": PROMPT},
308
+ {"role": "user", "content": file.read()}])
324
309
 
325
310
  chain = (
326
311
  DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
@@ -438,7 +423,10 @@ Tutorials
438
423
  ---------
439
424
 
440
425
  * `Getting Started`_
441
- * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
426
+ * `Multimodal <https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`__)
427
+ * `LLM evaluations <https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`__)
428
+ * `Reading JSON metadata <https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`__)
429
+
442
430
 
443
431
  Contributions
444
432
  -------------