datachain 0.2.12__tar.gz → 0.2.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (255) hide show
  1. {datachain-0.2.12 → datachain-0.2.14}/.github/workflows/tests.yml +8 -23
  2. {datachain-0.2.12 → datachain-0.2.14}/.pre-commit-config.yaml +2 -0
  3. {datachain-0.2.12/src/datachain.egg-info → datachain-0.2.14}/PKG-INFO +42 -44
  4. {datachain-0.2.12 → datachain-0.2.14}/README.rst +39 -41
  5. {datachain-0.2.12 → datachain-0.2.14}/docs/index.md +2 -2
  6. datachain-0.2.14/examples/llm/llm_chatbot_evaluation.ipynb +772 -0
  7. {datachain-0.2.12 → datachain-0.2.14}/pyproject.toml +8 -3
  8. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/catalog/catalog.py +7 -1
  9. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/cli.py +11 -0
  10. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/metastore.py +0 -4
  11. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/schema.py +7 -3
  12. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/sqlite.py +1 -4
  13. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/warehouse.py +1 -24
  14. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/flatten.py +4 -4
  15. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/values_to_tuples.py +4 -1
  16. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/dc.py +100 -5
  17. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/file.py +23 -22
  18. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/meta_formats.py +6 -5
  19. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/dataset.py +29 -23
  20. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/sqlite/base.py +3 -3
  21. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/sqlite/types.py +5 -13
  22. {datachain-0.2.12 → datachain-0.2.14/src/datachain.egg-info}/PKG-INFO +42 -44
  23. {datachain-0.2.12 → datachain-0.2.14}/src/datachain.egg-info/SOURCES.txt +1 -0
  24. {datachain-0.2.12 → datachain-0.2.14}/src/datachain.egg-info/requires.txt +2 -2
  25. {datachain-0.2.12 → datachain-0.2.14}/tests/conftest.py +42 -26
  26. {datachain-0.2.12 → datachain-0.2.14}/tests/examples/test_wds_e2e.py +1 -1
  27. {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_catalog.py +39 -0
  28. {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_datachain.py +61 -7
  29. {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_dataset_query.py +29 -0
  30. {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_datasets.py +7 -6
  31. {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_feature_pickling.py +10 -3
  32. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_datachain.py +57 -1
  33. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_datachain_merge.py +19 -19
  34. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_feature.py +7 -7
  35. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_file.py +57 -1
  36. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_module_exports.py +25 -18
  37. {datachain-0.2.12 → datachain-0.2.14}/tests/utils.py +6 -0
  38. {datachain-0.2.12 → datachain-0.2.14}/.cruft.json +0 -0
  39. {datachain-0.2.12 → datachain-0.2.14}/.gitattributes +0 -0
  40. {datachain-0.2.12 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  41. {datachain-0.2.12 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  42. {datachain-0.2.12 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  43. {datachain-0.2.12 → datachain-0.2.14}/.github/codecov.yaml +0 -0
  44. {datachain-0.2.12 → datachain-0.2.14}/.github/dependabot.yml +0 -0
  45. {datachain-0.2.12 → datachain-0.2.14}/.github/workflows/benchmarks.yml +0 -0
  46. {datachain-0.2.12 → datachain-0.2.14}/.github/workflows/release.yml +0 -0
  47. {datachain-0.2.12 → datachain-0.2.14}/.github/workflows/update-template.yaml +0 -0
  48. {datachain-0.2.12 → datachain-0.2.14}/.gitignore +0 -0
  49. {datachain-0.2.12 → datachain-0.2.14}/CODE_OF_CONDUCT.rst +0 -0
  50. {datachain-0.2.12 → datachain-0.2.14}/CONTRIBUTING.rst +0 -0
  51. {datachain-0.2.12 → datachain-0.2.14}/LICENSE +0 -0
  52. {datachain-0.2.12 → datachain-0.2.14}/docs/assets/captioned_cartoons.png +0 -0
  53. {datachain-0.2.12 → datachain-0.2.14}/docs/assets/datachain.png +0 -0
  54. {datachain-0.2.12 → datachain-0.2.14}/docs/assets/flowchart.png +0 -0
  55. {datachain-0.2.12 → datachain-0.2.14}/docs/references/datachain.md +0 -0
  56. {datachain-0.2.12 → datachain-0.2.14}/docs/references/datatype.md +0 -0
  57. {datachain-0.2.12 → datachain-0.2.14}/docs/references/file.md +0 -0
  58. {datachain-0.2.12 → datachain-0.2.14}/docs/references/index.md +0 -0
  59. {datachain-0.2.12 → datachain-0.2.14}/docs/references/sql.md +0 -0
  60. {datachain-0.2.12 → datachain-0.2.14}/docs/references/torch.md +0 -0
  61. {datachain-0.2.12 → datachain-0.2.14}/docs/references/udf.md +0 -0
  62. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
  63. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
  64. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
  65. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
  66. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
  67. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
  68. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/README.md +0 -0
  69. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
  70. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
  71. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
  72. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
  73. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
  74. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
  75. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
  76. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
  77. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  78. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  79. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  80. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  81. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  82. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  83. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  84. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  85. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  86. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  87. {datachain-0.2.12 → datachain-0.2.14}/examples/computer_vision/openimage-detect.py +0 -0
  88. {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/common_sql_functions.py +0 -0
  89. {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/json-csv-reader.py +0 -0
  90. {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/torch-loader.py +0 -0
  91. {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/udfs/parallel.py +0 -0
  92. {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/udfs/simple.py +0 -0
  93. {datachain-0.2.12 → datachain-0.2.14}/examples/get_started/udfs/stateful.py +0 -0
  94. {datachain-0.2.12 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
  95. {datachain-0.2.12 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
  96. {datachain-0.2.12 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude.py +0 -0
  97. {datachain-0.2.12 → datachain-0.2.14}/examples/llm_and_nlp/unstructured-text.py +0 -0
  98. {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/clip.py +0 -0
  99. {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
  100. {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/hf_pipeline.py +0 -0
  101. {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/openai_image_desc_lib.py +0 -0
  102. {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/wds.py +0 -0
  103. {datachain-0.2.12 → datachain-0.2.14}/examples/multimodal/wds_filtered.py +0 -0
  104. {datachain-0.2.12 → datachain-0.2.14}/mkdocs.yml +0 -0
  105. {datachain-0.2.12 → datachain-0.2.14}/noxfile.py +0 -0
  106. {datachain-0.2.12 → datachain-0.2.14}/setup.cfg +0 -0
  107. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/__init__.py +0 -0
  108. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/__main__.py +0 -0
  109. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/asyn.py +0 -0
  110. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/cache.py +0 -0
  111. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/catalog/__init__.py +0 -0
  112. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/catalog/datasource.py +0 -0
  113. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/catalog/loader.py +0 -0
  114. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/catalog/subclass.py +0 -0
  115. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/cli_utils.py +0 -0
  116. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/__init__.py +0 -0
  117. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/azure.py +0 -0
  118. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/fileslice.py +0 -0
  119. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/fsspec.py +0 -0
  120. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/gcs.py +0 -0
  121. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/local.py +0 -0
  122. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/client/s3.py +0 -0
  123. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/config.py +0 -0
  124. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/__init__.py +0 -0
  125. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/db_engine.py +0 -0
  126. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/id_generator.py +0 -0
  127. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/job.py +0 -0
  128. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/data_storage/serializer.py +0 -0
  129. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/dataset.py +0 -0
  130. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/error.py +0 -0
  131. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/job.py +0 -0
  132. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/__init__.py +0 -0
  133. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/arrow.py +0 -0
  134. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/clip.py +0 -0
  135. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/__init__.py +0 -0
  136. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/python_to_sql.py +0 -0
  137. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/sql_to_python.py +0 -0
  138. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/convert/unflatten.py +0 -0
  139. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/data_model.py +0 -0
  140. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/dataset_info.py +0 -0
  141. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/image.py +0 -0
  142. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/model_store.py +0 -0
  143. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/pytorch.py +0 -0
  144. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/settings.py +0 -0
  145. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/signal_schema.py +0 -0
  146. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/text.py +0 -0
  147. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/udf.py +0 -0
  148. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/udf_signature.py +0 -0
  149. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/utils.py +0 -0
  150. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/vfile.py +0 -0
  151. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/webdataset.py +0 -0
  152. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/lib/webdataset_laion.py +0 -0
  153. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/listing.py +0 -0
  154. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/node.py +0 -0
  155. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/nodes_fetcher.py +0 -0
  156. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/nodes_thread_pool.py +0 -0
  157. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/progress.py +0 -0
  158. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/py.typed +0 -0
  159. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/__init__.py +0 -0
  160. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/batch.py +0 -0
  161. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/builtins.py +0 -0
  162. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/dispatch.py +0 -0
  163. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/metrics.py +0 -0
  164. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/params.py +0 -0
  165. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/schema.py +0 -0
  166. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/session.py +0 -0
  167. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/query/udf.py +0 -0
  168. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/remote/__init__.py +0 -0
  169. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/remote/studio.py +0 -0
  170. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/__init__.py +0 -0
  171. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/default/__init__.py +0 -0
  172. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/default/base.py +0 -0
  173. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/__init__.py +0 -0
  174. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/array.py +0 -0
  175. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/conditional.py +0 -0
  176. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/path.py +0 -0
  177. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/random.py +0 -0
  178. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/functions/string.py +0 -0
  179. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/selectable.py +0 -0
  180. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/sqlite/__init__.py +0 -0
  181. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/sqlite/vector.py +0 -0
  182. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/types.py +0 -0
  183. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/sql/utils.py +0 -0
  184. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/storage.py +0 -0
  185. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/torch/__init__.py +0 -0
  186. {datachain-0.2.12 → datachain-0.2.14}/src/datachain/utils.py +0 -0
  187. {datachain-0.2.12 → datachain-0.2.14}/src/datachain.egg-info/dependency_links.txt +0 -0
  188. {datachain-0.2.12 → datachain-0.2.14}/src/datachain.egg-info/entry_points.txt +0 -0
  189. {datachain-0.2.12 → datachain-0.2.14}/src/datachain.egg-info/top_level.txt +0 -0
  190. {datachain-0.2.12 → datachain-0.2.14}/tests/__init__.py +0 -0
  191. {datachain-0.2.12 → datachain-0.2.14}/tests/benchmarks/__init__.py +0 -0
  192. {datachain-0.2.12 → datachain-0.2.14}/tests/benchmarks/conftest.py +0 -0
  193. {datachain-0.2.12 → datachain-0.2.14}/tests/benchmarks/test_ls.py +0 -0
  194. {datachain-0.2.12 → datachain-0.2.14}/tests/benchmarks/test_version.py +0 -0
  195. {datachain-0.2.12 → datachain-0.2.14}/tests/data.py +0 -0
  196. {datachain-0.2.12 → datachain-0.2.14}/tests/examples/__init__.py +0 -0
  197. {datachain-0.2.12 → datachain-0.2.14}/tests/examples/wds_data.py +0 -0
  198. {datachain-0.2.12 → datachain-0.2.14}/tests/func/__init__.py +0 -0
  199. {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_client.py +0 -0
  200. {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_ls.py +0 -0
  201. {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_pull.py +0 -0
  202. {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_pytorch.py +0 -0
  203. {datachain-0.2.12 → datachain-0.2.14}/tests/func/test_query.py +0 -0
  204. {datachain-0.2.12 → datachain-0.2.14}/tests/scripts/feature_class.py +0 -0
  205. {datachain-0.2.12 → datachain-0.2.14}/tests/scripts/feature_class_parallel.py +0 -0
  206. {datachain-0.2.12 → datachain-0.2.14}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  207. {datachain-0.2.12 → datachain-0.2.14}/tests/scripts/name_len_normal.py +0 -0
  208. {datachain-0.2.12 → datachain-0.2.14}/tests/scripts/name_len_slow.py +0 -0
  209. {datachain-0.2.12 → datachain-0.2.14}/tests/test_cli_e2e.py +0 -0
  210. {datachain-0.2.12 → datachain-0.2.14}/tests/test_query_e2e.py +0 -0
  211. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/__init__.py +0 -0
  212. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/__init__.py +0 -0
  213. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/conftest.py +0 -0
  214. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_arrow.py +0 -0
  215. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_clip.py +0 -0
  216. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  217. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_feature_utils.py +0 -0
  218. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_image.py +0 -0
  219. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_signal_schema.py +0 -0
  220. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_text.py +0 -0
  221. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_udf_signature.py +0 -0
  222. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_utils.py +0 -0
  223. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/lib/test_webdataset.py +0 -0
  224. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/__init__.py +0 -0
  225. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/sqlite/__init__.py +0 -0
  226. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/sqlite/test_utils.py +0 -0
  227. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_array.py +0 -0
  228. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_conditional.py +0 -0
  229. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_path.py +0 -0
  230. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_random.py +0 -0
  231. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_selectable.py +0 -0
  232. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/sql/test_string.py +0 -0
  233. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_asyn.py +0 -0
  234. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_cache.py +0 -0
  235. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_catalog.py +0 -0
  236. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_catalog_loader.py +0 -0
  237. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_cli_parsing.py +0 -0
  238. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_client.py +0 -0
  239. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_client_s3.py +0 -0
  240. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_data_storage.py +0 -0
  241. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_database_engine.py +0 -0
  242. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_dataset.py +0 -0
  243. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_dispatch.py +0 -0
  244. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_fileslice.py +0 -0
  245. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_id_generator.py +0 -0
  246. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_listing.py +0 -0
  247. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_metastore.py +0 -0
  248. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_query_metrics.py +0 -0
  249. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_query_params.py +0 -0
  250. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_serializer.py +0 -0
  251. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_session.py +0 -0
  252. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_storage.py +0 -0
  253. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_udf.py +0 -0
  254. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_utils.py +0 -0
  255. {datachain-0.2.12 → datachain-0.2.14}/tests/unit/test_warehouse.py +0 -0
@@ -69,26 +69,6 @@ jobs:
69
69
  pyv: '3.12'
70
70
 
71
71
  steps:
72
-
73
- # https://github.com/iterative/pytest-servers/pull/122
74
- # https://github.com/abiosoft/colima/issues/468
75
- # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
76
- # colima v0.5.6 seems to run more stable than the latest - that has occasional network failures (ports are not open)
77
- # see: https://github.com/abiosoft/colima/issues/962
78
- - name: Use colima as default docker host on MacOS
79
- if: runner.os == 'macOS'
80
- run: |
81
- brew install docker lima || true # avoid non-zero exit code if brew link fails
82
- sudo curl -L -o /usr/local/bin/colima https://github.com/abiosoft/colima/releases/download/v0.5.6/colima-Darwin-x86_64
83
- sudo chmod +x /usr/local/bin/colima
84
- colima start
85
- sudo ln -vsf "${HOME}"/.colima/default/docker.sock /var/run/docker.sock
86
- env:
87
- HOMEBREW_NO_AUTO_UPDATE: true
88
- HOMEBREW_NO_INSTALL_CLEANUP: true
89
- HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK: true
90
- HOMEBREW_NO_INSTALL_UPGRADE: true
91
-
92
72
  - name: Check out the repository
93
73
  uses: actions/checkout@v4
94
74
  with:
@@ -106,12 +86,17 @@ jobs:
106
86
  nox --version
107
87
  uv --version
108
88
 
109
- - name: Skip flaky azure, gs remotes if unavailable on macos
89
+ - name: Skip flaky azure, gs remotes on macOS
110
90
  if: runner.os == 'macOS'
111
- run: echo 'DATACHAIN_TEST_SKIP_MISSING_REMOTES=azure,gs' >> "$GITHUB_ENV"
91
+ run: echo 'DISABLE_REMOTES_ARG=--disable-remotes=azure,gs' >> "$GITHUB_ENV"
92
+
93
+ - name: Skip all remotes on Windows
94
+ if: runner.os == 'Windows'
95
+ run: echo 'DISABLE_REMOTES_ARG=--disable-remotes=azure,gs' >> $env:GITHUB_ENV
112
96
 
113
97
  - name: Run tests
114
- run: nox -s tests-${{ matrix.pyv }}
98
+ run: nox -s tests-${{ matrix.pyv }} -- $DISABLE_REMOTES_ARG
99
+ shell: bash
115
100
 
116
101
  - name: Upload coverage report
117
102
  uses: codecov/codecov-action@v4
@@ -1,5 +1,7 @@
1
1
  default_language_version:
2
2
  python: python3
3
+ ci:
4
+ skip: [mypy]
3
5
  repos:
4
6
  - repo: https://github.com/pre-commit/pre-commit-hooks
5
7
  rev: v4.6.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.12
3
+ Version: 0.2.14
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -36,7 +36,7 @@ Requires-Dist: sqlalchemy>=2
36
36
  Requires-Dist: multiprocess==0.70.16
37
37
  Requires-Dist: dill==0.3.8
38
38
  Requires-Dist: cloudpickle
39
- Requires-Dist: ujson>=5.9.0
39
+ Requires-Dist: orjson>=3.10.5
40
40
  Requires-Dist: pydantic<3,>=2
41
41
  Requires-Dist: jmespath>=1.0
42
42
  Requires-Dist: datamodel-code-generator>=0.25
@@ -78,9 +78,9 @@ Provides-Extra: dev
78
78
  Requires-Dist: datachain[docs,tests]; extra == "dev"
79
79
  Requires-Dist: mypy==1.10.1; extra == "dev"
80
80
  Requires-Dist: types-python-dateutil; extra == "dev"
81
+ Requires-Dist: types-pytz; extra == "dev"
81
82
  Requires-Dist: types-PyYAML; extra == "dev"
82
83
  Requires-Dist: types-requests; extra == "dev"
83
- Requires-Dist: types-ujson; extra == "dev"
84
84
 
85
85
  |PyPI| |Python Version| |Codecov| |Tests|
86
86
 
@@ -103,20 +103,18 @@ AI 🔗 DataChain
103
103
  DataChain is an open-source Python library for processing and curating unstructured
104
104
  data at scale.
105
105
 
106
- 🤖 AI-Driven Data Curation: Use local ML models, LLM APIs calls to enrich your data.
106
+ 🤖 AI-Driven Data Curation: Use local ML models or LLM API calls to enrich your data.
107
107
 
108
- 🚀 GenAI Dataset scale: Handle 10s of milions of files or file snippets.
108
+ 🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
109
109
 
110
- 🐍 Python-friendly: Use strictly typed `Pydantic`_ objects instead of JSON.
110
+ 🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
111
111
 
112
112
 
113
- To ensure efficiency, Datachain supports parallel processing, parallel data
114
- downloads, and out-of-memory computing. It excels at optimizing batch operations.
115
- While most GenAI tools focus on online applications and realtime, DataChain is designed
116
- for offline data processing, data curation and ETL.
113
+ DataChain supports parallel processing, parallel data
114
+ downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
117
115
 
118
- The typical use cases are Computer Vision data curation, LLM analytics
119
- and validation.
116
+ The typical use cases include Computer Vision data curation, LLM analytics,
117
+ and validation of multimodal AI applications.
120
118
 
121
119
 
122
120
  .. code:: console
@@ -128,25 +126,25 @@ and validation.
128
126
  Quick Start
129
127
  -----------
130
128
 
131
- Basic evaluation
132
- ================
129
+ Data curation with a local model
130
+ =================================
133
131
 
134
132
  We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
135
- - 50 files total in the example.
136
- These dialogs involve users looking for better wireless plans chatting with bot.
137
- Our goal is to identify successful dialogs.
133
+ - 50 files total in this example.
134
+ These dialogs involve users chatting with a bot while looking for better wireless plans.
135
+ Our goal is to identify the successful dialogs.
138
136
 
139
- The data used in the examples is publicly available. Please feel free to run this code.
137
+ The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
140
138
 
141
- First, we'll use a simple sentiment analysis model. Please install transformers.
139
+ First, we'll show batch inference with a simple sentiment model using the `transformers` library:
142
140
 
143
141
  .. code:: shell
144
142
 
145
143
  pip install transformers
146
144
 
147
- The code below downloads files the cloud, applies function
148
- `is_positive_dialogue_ending()` to each. All files with a positive sentiment
149
- are copied to local directory `output/`.
145
+ The code below downloads files from the cloud, and applies a user-defined function
146
+ to each one of them. All files with a positive sentiment
147
+ detected are then copied to the local directory.
150
148
 
151
149
  .. code:: py
152
150
 
@@ -169,7 +167,7 @@ are copied to local directory `output/`.
169
167
  )
170
168
 
171
169
  positive_chain = chain.filter(Column("is_positive") == True)
172
- positive_chain.export_files("./output1")
170
+ positive_chain.export_files("./output")
173
171
 
174
172
  print(f"{positive_chain.count()} files were exported")
175
173
 
@@ -185,11 +183,11 @@ are copied to local directory `output/`.
185
183
  13
186
184
 
187
185
 
188
- LLM judging LLMs dialogs
189
- ==========================
186
+ LLM judging chatbots
187
+ =============================
190
188
 
191
- Finding good dialogs using an LLM can be more efficient. In this example,
192
- we use Mistral with a free API. Please install the package and get a free
189
+ LLMs can work as efficient universal classifiers. In the example below,
190
+ we employ a free API from Mistral to judge the chatbot performance. Please get a free
193
191
  Mistral API key at https://console.mistral.ai
194
192
 
195
193
  .. code:: shell
@@ -197,9 +195,7 @@ Mistral API key at https://console.mistral.ai
197
195
  $ pip install mistralai
198
196
  $ export MISTRAL_API_KEY=_your_key_
199
197
 
200
- Below is a similar code example, but this time using an LLM to evaluate the dialogs.
201
- Note, only 4 threads were used in this example `parallel=4` due to a limitation of
202
- the free LLM service.
198
+ DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
203
199
 
204
200
  .. code:: py
205
201
 
@@ -231,7 +227,7 @@ the free LLM service.
231
227
  print(f"{successful_chain.count()} files were exported")
232
228
 
233
229
 
234
- With the current prompt, we found 31 files considered successful dialogs:
230
+ With the instruction above, the Mistral model considers 31 of the 50 files to contain successful dialogs:
235
231
 
236
232
  .. code:: shell
237
233
 
@@ -245,11 +241,11 @@ With the current prompt, we found 31 files considered successful dialogs:
245
241
  Serializing Python-objects
246
242
  ==========================
247
243
 
248
- LLM responses contain valuable information for analytics, such as tokens used and the
249
- model. Preserving this information can be beneficial.
244
+ LLM responses may contain valuable information for analytics such as the number of tokens used, or the
245
+ model performance parameters.
250
246
 
251
- Instead of extracting this information from the Mistral data structure (class
252
- `ChatCompletionResponse`), we serialize the entire Python object to the internal DB.
247
+ Instead of extracting this information from the Mistral response data structure (class
248
+ `ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
253
249
 
254
250
 
255
251
  .. code:: py
@@ -297,21 +293,23 @@ Output:
297
293
  64.0% dialogs were successful
298
294
 
299
295
 
300
- Complex Python data structures
296
+ Iterating over Python data structures
301
297
  =============================================
302
298
 
303
- In the previous examples, a few dataset were saved in the embedded database
304
- (`SQLite`_ in directory `.datachain`).
305
- These datasets are versioned, and can be accessed using
299
+ In the previous examples, datasets were saved in the embedded database
300
+ (`SQLite`_ in folder `.datachain` of the working directory).
301
+ These datasets were automatically versioned, and can be accessed using
306
302
  `DataChain.from_dataset("dataset_name")`.
307
303
 
304
+ Here is how to retrieve a saved dataset and iterate over the objects:
305
+
308
306
  .. code:: py
309
307
 
310
308
  chain = DataChain.from_dataset("response")
311
309
 
312
- # Iterating one-by-one: out of memory
310
+ # Iterating one-by-one: supports out-of-memory workflows
313
311
  for file, response in chain.limit(5).collect("file", "response"):
314
- # You work with Python objects
312
+ # verify the collected Python objects
315
313
  assert isinstance(response, ChatCompletionResponse)
316
314
 
317
315
  status = response.choices[0].message.content[:7]
@@ -332,9 +330,8 @@ Output:
332
330
  Vectorized analytics over Python objects
333
331
  ========================================
334
332
 
335
- Some operations can be efficiently run inside the DB without deserializing Python objects.
336
- Let's calculate the cost of using LLM APIs in a vectorized way.
337
- Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
333
+ Some operations can run inside the DB without deserialization.
334
+ For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
338
335
 
339
336
  .. code:: py
340
337
 
@@ -406,6 +403,7 @@ Community and Support
406
403
  .. github-only
407
404
  .. _Contributor Guide: CONTRIBUTING.rst
408
405
  .. _Pydantic: https://github.com/pydantic/pydantic
406
+ .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
409
407
  .. _SQLite: https://www.sqlite.org/
410
408
  .. _Getting Started: https://datachain.dvc.ai/
411
409
  .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
@@ -19,20 +19,18 @@ AI 🔗 DataChain
19
19
  DataChain is an open-source Python library for processing and curating unstructured
20
20
  data at scale.
21
21
 
22
- 🤖 AI-Driven Data Curation: Use local ML models, LLM APIs calls to enrich your data.
22
+ 🤖 AI-Driven Data Curation: Use local ML models or LLM API calls to enrich your data.
23
23
 
24
- 🚀 GenAI Dataset scale: Handle 10s of milions of files or file snippets.
24
+ 🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
25
25
 
26
- 🐍 Python-friendly: Use strictly typed `Pydantic`_ objects instead of JSON.
26
+ 🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
27
27
 
28
28
 
29
- To ensure efficiency, Datachain supports parallel processing, parallel data
30
- downloads, and out-of-memory computing. It excels at optimizing batch operations.
31
- While most GenAI tools focus on online applications and realtime, DataChain is designed
32
- for offline data processing, data curation and ETL.
29
+ DataChain supports parallel processing, parallel data
30
+ downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
33
31
 
34
- The typical use cases are Computer Vision data curation, LLM analytics
35
- and validation.
32
+ The typical use cases include Computer Vision data curation, LLM analytics,
33
+ and validation of multimodal AI applications.
36
34
 
37
35
 
38
36
  .. code:: console
@@ -44,25 +42,25 @@ and validation.
44
42
  Quick Start
45
43
  -----------
46
44
 
47
- Basic evaluation
48
- ================
45
+ Data curation with a local model
46
+ =================================
49
47
 
50
48
  We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
51
- - 50 files total in the example.
52
- These dialogs involve users looking for better wireless plans chatting with bot.
53
- Our goal is to identify successful dialogs.
49
+ - 50 files total in this example.
50
+ These dialogs involve users chatting with a bot while looking for better wireless plans.
51
+ Our goal is to identify the successful dialogs.
54
52
 
55
- The data used in the examples is publicly available. Please feel free to run this code.
53
+ The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
56
54
 
57
- First, we'll use a simple sentiment analysis model. Please install transformers.
55
+ First, we'll show batch inference with a simple sentiment model using the `transformers` library:
58
56
 
59
57
  .. code:: shell
60
58
 
61
59
  pip install transformers
62
60
 
63
- The code below downloads files the cloud, applies function
64
- `is_positive_dialogue_ending()` to each. All files with a positive sentiment
65
- are copied to local directory `output/`.
61
+ The code below downloads files from the cloud, and applies a user-defined function
62
+ to each one of them. All files with a positive sentiment
63
+ detected are then copied to the local directory.
66
64
 
67
65
  .. code:: py
68
66
 
@@ -85,7 +83,7 @@ are copied to local directory `output/`.
85
83
  )
86
84
 
87
85
  positive_chain = chain.filter(Column("is_positive") == True)
88
- positive_chain.export_files("./output1")
86
+ positive_chain.export_files("./output")
89
87
 
90
88
  print(f"{positive_chain.count()} files were exported")
91
89
 
@@ -101,11 +99,11 @@ are copied to local directory `output/`.
101
99
  13
102
100
 
103
101
 
104
- LLM judging LLMs dialogs
105
- ==========================
102
+ LLM judging chatbots
103
+ =============================
106
104
 
107
- Finding good dialogs using an LLM can be more efficient. In this example,
108
- we use Mistral with a free API. Please install the package and get a free
105
+ LLMs can work as efficient universal classifiers. In the example below,
106
+ we employ a free API from Mistral to judge the chatbot performance. Please get a free
109
107
  Mistral API key at https://console.mistral.ai
110
108
 
111
109
  .. code:: shell
@@ -113,9 +111,7 @@ Mistral API key at https://console.mistral.ai
113
111
  $ pip install mistralai
114
112
  $ export MISTRAL_API_KEY=_your_key_
115
113
 
116
- Below is a similar code example, but this time using an LLM to evaluate the dialogs.
117
- Note, only 4 threads were used in this example `parallel=4` due to a limitation of
118
- the free LLM service.
114
+ DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
119
115
 
120
116
  .. code:: py
121
117
 
@@ -147,7 +143,7 @@ the free LLM service.
147
143
  print(f"{successful_chain.count()} files were exported")
148
144
 
149
145
 
150
- With the current prompt, we found 31 files considered successful dialogs:
146
+ With the instruction above, the Mistral model considers 31 of the 50 files to contain successful dialogs:
151
147
 
152
148
  .. code:: shell
153
149
 
@@ -161,11 +157,11 @@ With the current prompt, we found 31 files considered successful dialogs:
161
157
  Serializing Python-objects
162
158
  ==========================
163
159
 
164
- LLM responses contain valuable information for analytics, such as tokens used and the
165
- model. Preserving this information can be beneficial.
160
+ LLM responses may contain valuable information for analytics such as the number of tokens used, or the
161
+ model performance parameters.
166
162
 
167
- Instead of extracting this information from the Mistral data structure (class
168
- `ChatCompletionResponse`), we serialize the entire Python object to the internal DB.
163
+ Instead of extracting this information from the Mistral response data structure (class
164
+ `ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
169
165
 
170
166
 
171
167
  .. code:: py
@@ -213,21 +209,23 @@ Output:
213
209
  64.0% dialogs were successful
214
210
 
215
211
 
216
- Complex Python data structures
212
+ Iterating over Python data structures
217
213
  =============================================
218
214
 
219
- In the previous examples, a few dataset were saved in the embedded database
220
- (`SQLite`_ in directory `.datachain`).
221
- These datasets are versioned, and can be accessed using
215
+ In the previous examples, datasets were saved in the embedded database
216
+ (`SQLite`_ in folder `.datachain` of the working directory).
217
+ These datasets were automatically versioned, and can be accessed using
222
218
  `DataChain.from_dataset("dataset_name")`.
223
219
 
220
+ Here is how to retrieve a saved dataset and iterate over the objects:
221
+
224
222
  .. code:: py
225
223
 
226
224
  chain = DataChain.from_dataset("response")
227
225
 
228
- # Iterating one-by-one: out of memory
226
+ # Iterating one-by-one: supports out-of-memory workflows
229
227
  for file, response in chain.limit(5).collect("file", "response"):
230
- # You work with Python objects
228
+ # verify the collected Python objects
231
229
  assert isinstance(response, ChatCompletionResponse)
232
230
 
233
231
  status = response.choices[0].message.content[:7]
@@ -248,9 +246,8 @@ Output:
248
246
  Vectorized analytics over Python objects
249
247
  ========================================
250
248
 
251
- Some operations can be efficiently run inside the DB without deserializing Python objects.
252
- Let's calculate the cost of using LLM APIs in a vectorized way.
253
- Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
249
+ Some operations can run inside the DB without deserialization.
250
+ For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
254
251
 
255
252
  .. code:: py
256
253
 
@@ -322,6 +319,7 @@ Community and Support
322
319
  .. github-only
323
320
  .. _Contributor Guide: CONTRIBUTING.rst
324
321
  .. _Pydantic: https://github.com/pydantic/pydantic
322
+ .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
325
323
  .. _SQLite: https://www.sqlite.org/
326
324
  .. _Getting Started: https://datachain.dvc.ai/
327
325
  .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
@@ -58,8 +58,8 @@ def trim_text(text):
58
58
  match = re.search(r'[A-Z][^.]*\.', text)
59
59
  return match.group(0) if match else ''
60
60
 
61
- images = chain.collect_one("file")
62
- captions = chain.collect_one("scene")
61
+ images = chain.collect("file")
62
+ captions = chain.collect("scene")
63
63
  _ , axes = plt.subplots(1, len(captions), figsize=(15, 5))
64
64
 
65
65
  for ax, img, caption in zip(axes, images, captions):