datachain 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. See the release notes below for more details.

Files changed (259)
  1. {datachain-0.3.1 → datachain-0.3.2}/.github/workflows/benchmarks.yml +1 -1
  2. {datachain-0.3.1 → datachain-0.3.2}/.github/workflows/tests.yml +29 -0
  3. {datachain-0.3.1 → datachain-0.3.2}/.pre-commit-config.yaml +1 -1
  4. {datachain-0.3.1/src/datachain.egg-info → datachain-0.3.2}/PKG-INFO +74 -86
  5. {datachain-0.3.1 → datachain-0.3.2}/README.rst +64 -85
  6. {datachain-0.3.1 → datachain-0.3.2}/docs/index.md +5 -6
  7. {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/common_sql_functions.py +13 -11
  8. {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/torch-loader.py +3 -2
  9. {datachain-0.3.1 → datachain-0.3.2}/examples/llm_and_nlp/unstructured-text.py +15 -15
  10. {datachain-0.3.1 → datachain-0.3.2}/examples/multimodal/hf_pipeline.py +28 -19
  11. {datachain-0.3.1 → datachain-0.3.2}/examples/multimodal/wds.py +17 -6
  12. {datachain-0.3.1 → datachain-0.3.2}/examples/multimodal/wds_filtered.py +4 -2
  13. {datachain-0.3.1 → datachain-0.3.2}/noxfile.py +11 -0
  14. {datachain-0.3.1 → datachain-0.3.2}/pyproject.toml +17 -2
  15. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/catalog/catalog.py +10 -1
  16. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/schema.py +22 -8
  17. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/sqlite.py +5 -0
  18. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/dc.py +27 -13
  19. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/meta_formats.py +8 -2
  20. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/node.py +1 -1
  21. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/schema.py +4 -0
  22. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/default/base.py +3 -0
  23. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/sqlite/base.py +3 -0
  24. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/types.py +120 -11
  25. {datachain-0.3.1 → datachain-0.3.2/src/datachain.egg-info}/PKG-INFO +74 -86
  26. {datachain-0.3.1 → datachain-0.3.2}/src/datachain.egg-info/SOURCES.txt +2 -3
  27. {datachain-0.3.1 → datachain-0.3.2}/src/datachain.egg-info/requires.txt +10 -0
  28. datachain-0.3.2/tests/examples/test_examples.py +96 -0
  29. {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_datachain.py +20 -0
  30. {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_dataset_query.py +17 -38
  31. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_datachain.py +91 -1
  32. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_datachain_merge.py +8 -7
  33. datachain-0.3.2/tests/unit/lib/test_schema.py +22 -0
  34. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_data_storage.py +50 -1
  35. datachain-0.3.1/examples/get_started/json-metadata-tutorial.ipynb +0 -2020
  36. datachain-0.3.1/examples/llm/llm_chatbot_evaluation.ipynb +0 -683
  37. datachain-0.3.1/examples/multimodal/clip_fine_tuning.ipynb +0 -1948
  38. {datachain-0.3.1 → datachain-0.3.2}/.cruft.json +0 -0
  39. {datachain-0.3.1 → datachain-0.3.2}/.gitattributes +0 -0
  40. {datachain-0.3.1 → datachain-0.3.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  41. {datachain-0.3.1 → datachain-0.3.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  42. {datachain-0.3.1 → datachain-0.3.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  43. {datachain-0.3.1 → datachain-0.3.2}/.github/codecov.yaml +0 -0
  44. {datachain-0.3.1 → datachain-0.3.2}/.github/dependabot.yml +0 -0
  45. {datachain-0.3.1 → datachain-0.3.2}/.github/workflows/release.yml +0 -0
  46. {datachain-0.3.1 → datachain-0.3.2}/.github/workflows/update-template.yaml +0 -0
  47. {datachain-0.3.1 → datachain-0.3.2}/.gitignore +0 -0
  48. {datachain-0.3.1 → datachain-0.3.2}/CODE_OF_CONDUCT.rst +0 -0
  49. {datachain-0.3.1 → datachain-0.3.2}/CONTRIBUTING.rst +0 -0
  50. {datachain-0.3.1 → datachain-0.3.2}/LICENSE +0 -0
  51. {datachain-0.3.1 → datachain-0.3.2}/docs/assets/captioned_cartoons.png +0 -0
  52. {datachain-0.3.1 → datachain-0.3.2}/docs/assets/datachain.png +0 -0
  53. {datachain-0.3.1 → datachain-0.3.2}/docs/assets/flowchart.png +0 -0
  54. {datachain-0.3.1 → datachain-0.3.2}/docs/references/datachain.md +0 -0
  55. {datachain-0.3.1 → datachain-0.3.2}/docs/references/datatype.md +0 -0
  56. {datachain-0.3.1 → datachain-0.3.2}/docs/references/file.md +0 -0
  57. {datachain-0.3.1 → datachain-0.3.2}/docs/references/index.md +0 -0
  58. {datachain-0.3.1 → datachain-0.3.2}/docs/references/sql.md +0 -0
  59. {datachain-0.3.1 → datachain-0.3.2}/docs/references/torch.md +0 -0
  60. {datachain-0.3.1 → datachain-0.3.2}/docs/references/udf.md +0 -0
  61. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
  62. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
  63. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
  64. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
  65. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
  66. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
  67. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/README.md +0 -0
  68. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
  69. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
  70. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
  71. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
  72. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
  73. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
  74. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
  75. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
  76. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  77. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  78. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  79. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  80. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  81. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  82. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  83. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  84. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  85. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  86. {datachain-0.3.1 → datachain-0.3.2}/examples/computer_vision/openimage-detect.py +0 -0
  87. {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/json-csv-reader.py +0 -0
  88. {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/udfs/parallel.py +0 -0
  89. {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/udfs/simple.py +0 -0
  90. {datachain-0.3.1 → datachain-0.3.2}/examples/get_started/udfs/stateful.py +0 -0
  91. {datachain-0.3.1 → datachain-0.3.2}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
  92. {datachain-0.3.1 → datachain-0.3.2}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
  93. {datachain-0.3.1 → datachain-0.3.2}/examples/llm_and_nlp/llm-claude.py +0 -0
  94. {datachain-0.3.1 → datachain-0.3.2}/examples/multimodal/clip_inference.py +0 -0
  95. {datachain-0.3.1 → datachain-0.3.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
  96. {datachain-0.3.1 → datachain-0.3.2}/mkdocs.yml +0 -0
  97. {datachain-0.3.1 → datachain-0.3.2}/setup.cfg +0 -0
  98. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/__init__.py +0 -0
  99. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/__main__.py +0 -0
  100. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/asyn.py +0 -0
  101. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/cache.py +0 -0
  102. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/catalog/__init__.py +0 -0
  103. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/catalog/datasource.py +0 -0
  104. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/catalog/loader.py +0 -0
  105. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/catalog/subclass.py +0 -0
  106. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/cli.py +0 -0
  107. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/cli_utils.py +0 -0
  108. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/__init__.py +0 -0
  109. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/azure.py +0 -0
  110. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/fileslice.py +0 -0
  111. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/fsspec.py +0 -0
  112. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/gcs.py +0 -0
  113. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/local.py +0 -0
  114. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/client/s3.py +0 -0
  115. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/config.py +0 -0
  116. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/__init__.py +0 -0
  117. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/db_engine.py +0 -0
  118. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/id_generator.py +0 -0
  119. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/job.py +0 -0
  120. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/metastore.py +0 -0
  121. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/serializer.py +0 -0
  122. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/data_storage/warehouse.py +0 -0
  123. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/dataset.py +0 -0
  124. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/error.py +0 -0
  125. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/job.py +0 -0
  126. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/__init__.py +0 -0
  127. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/arrow.py +0 -0
  128. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/clip.py +0 -0
  129. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/__init__.py +0 -0
  130. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/flatten.py +0 -0
  131. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
  132. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
  133. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/unflatten.py +0 -0
  134. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  135. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/data_model.py +0 -0
  136. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/dataset_info.py +0 -0
  137. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/file.py +0 -0
  138. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/image.py +0 -0
  139. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/model_store.py +0 -0
  140. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/pytorch.py +0 -0
  141. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/settings.py +0 -0
  142. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/signal_schema.py +0 -0
  143. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/text.py +0 -0
  144. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/udf.py +0 -0
  145. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/udf_signature.py +0 -0
  146. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/utils.py +0 -0
  147. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/vfile.py +0 -0
  148. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/webdataset.py +0 -0
  149. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/lib/webdataset_laion.py +0 -0
  150. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/listing.py +0 -0
  151. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/nodes_fetcher.py +0 -0
  152. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/nodes_thread_pool.py +0 -0
  153. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/progress.py +0 -0
  154. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/py.typed +0 -0
  155. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/__init__.py +0 -0
  156. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/batch.py +0 -0
  157. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/builtins.py +0 -0
  158. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/dataset.py +0 -0
  159. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/dispatch.py +0 -0
  160. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/metrics.py +0 -0
  161. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/params.py +0 -0
  162. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/queue.py +0 -0
  163. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/session.py +0 -0
  164. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/query/udf.py +0 -0
  165. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/remote/__init__.py +0 -0
  166. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/remote/studio.py +0 -0
  167. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/__init__.py +0 -0
  168. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/default/__init__.py +0 -0
  169. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/__init__.py +0 -0
  170. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/array.py +0 -0
  171. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/conditional.py +0 -0
  172. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/path.py +0 -0
  173. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/random.py +0 -0
  174. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/functions/string.py +0 -0
  175. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/selectable.py +0 -0
  176. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/sqlite/__init__.py +0 -0
  177. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/sqlite/types.py +0 -0
  178. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/sqlite/vector.py +0 -0
  179. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/sql/utils.py +0 -0
  180. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/storage.py +0 -0
  181. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/torch/__init__.py +0 -0
  182. {datachain-0.3.1 → datachain-0.3.2}/src/datachain/utils.py +0 -0
  183. {datachain-0.3.1 → datachain-0.3.2}/src/datachain.egg-info/dependency_links.txt +0 -0
  184. {datachain-0.3.1 → datachain-0.3.2}/src/datachain.egg-info/entry_points.txt +0 -0
  185. {datachain-0.3.1 → datachain-0.3.2}/src/datachain.egg-info/top_level.txt +0 -0
  186. {datachain-0.3.1 → datachain-0.3.2}/tests/__init__.py +0 -0
  187. {datachain-0.3.1 → datachain-0.3.2}/tests/benchmarks/__init__.py +0 -0
  188. {datachain-0.3.1 → datachain-0.3.2}/tests/benchmarks/conftest.py +0 -0
  189. {datachain-0.3.1 → datachain-0.3.2}/tests/benchmarks/test_ls.py +0 -0
  190. {datachain-0.3.1 → datachain-0.3.2}/tests/benchmarks/test_version.py +0 -0
  191. {datachain-0.3.1 → datachain-0.3.2}/tests/conftest.py +0 -0
  192. {datachain-0.3.1 → datachain-0.3.2}/tests/data.py +0 -0
  193. {datachain-0.3.1 → datachain-0.3.2}/tests/examples/__init__.py +0 -0
  194. {datachain-0.3.1 → datachain-0.3.2}/tests/examples/test_wds_e2e.py +0 -0
  195. {datachain-0.3.1 → datachain-0.3.2}/tests/examples/wds_data.py +0 -0
  196. {datachain-0.3.1 → datachain-0.3.2}/tests/func/__init__.py +0 -0
  197. {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_catalog.py +0 -0
  198. {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_client.py +0 -0
  199. {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_datasets.py +0 -0
  200. {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_feature_pickling.py +0 -0
  201. {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_ls.py +0 -0
  202. {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_pull.py +0 -0
  203. {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_pytorch.py +0 -0
  204. {datachain-0.3.1 → datachain-0.3.2}/tests/func/test_query.py +0 -0
  205. {datachain-0.3.1 → datachain-0.3.2}/tests/scripts/feature_class.py +0 -0
  206. {datachain-0.3.1 → datachain-0.3.2}/tests/scripts/feature_class_parallel.py +0 -0
  207. {datachain-0.3.1 → datachain-0.3.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  208. {datachain-0.3.1 → datachain-0.3.2}/tests/scripts/name_len_slow.py +0 -0
  209. {datachain-0.3.1 → datachain-0.3.2}/tests/test_cli_e2e.py +0 -0
  210. {datachain-0.3.1 → datachain-0.3.2}/tests/test_query_e2e.py +0 -0
  211. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/__init__.py +0 -0
  212. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/__init__.py +0 -0
  213. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/conftest.py +0 -0
  214. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_arrow.py +0 -0
  215. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_clip.py +0 -0
  216. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  217. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_feature.py +0 -0
  218. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_feature_utils.py +0 -0
  219. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_file.py +0 -0
  220. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_image.py +0 -0
  221. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_signal_schema.py +0 -0
  222. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_sql_to_python.py +0 -0
  223. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_text.py +0 -0
  224. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_udf_signature.py +0 -0
  225. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_utils.py +0 -0
  226. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/lib/test_webdataset.py +0 -0
  227. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/__init__.py +0 -0
  228. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/sqlite/__init__.py +0 -0
  229. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
  230. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_array.py +0 -0
  231. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_conditional.py +0 -0
  232. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_path.py +0 -0
  233. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_random.py +0 -0
  234. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_selectable.py +0 -0
  235. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/sql/test_string.py +0 -0
  236. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_asyn.py +0 -0
  237. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_cache.py +0 -0
  238. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_catalog.py +0 -0
  239. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_catalog_loader.py +0 -0
  240. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_cli_parsing.py +0 -0
  241. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_client.py +0 -0
  242. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_client_s3.py +0 -0
  243. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_database_engine.py +0 -0
  244. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_dataset.py +0 -0
  245. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_dispatch.py +0 -0
  246. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_fileslice.py +0 -0
  247. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_id_generator.py +0 -0
  248. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_listing.py +0 -0
  249. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_metastore.py +0 -0
  250. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_module_exports.py +0 -0
  251. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_query_metrics.py +0 -0
  252. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_query_params.py +0 -0
  253. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_serializer.py +0 -0
  254. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_session.py +0 -0
  255. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_storage.py +0 -0
  256. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_udf.py +0 -0
  257. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_utils.py +0 -0
  258. {datachain-0.3.1 → datachain-0.3.2}/tests/unit/test_warehouse.py +0 -0
  259. {datachain-0.3.1 → datachain-0.3.2}/tests/utils.py +0 -0
@@ -11,7 +11,7 @@ env:
11
11
  FORCE_COLOR: "1"
12
12
 
13
13
  jobs:
14
- build:
14
+ run:
15
15
  if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') }}
16
16
  runs-on: ubuntu-latest
17
17
 
@@ -199,3 +199,32 @@ jobs:
199
199
  --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
200
200
  tests ../datachain/tests
201
201
  working-directory: backend/datachain_server
202
+
203
+
204
+ examples:
205
+ runs-on: ${{ matrix.os }}
206
+ timeout-minutes: 60
207
+ strategy:
208
+ fail-fast: false
209
+ matrix:
210
+ os: [ubuntu-latest-16-cores, macos-latest, windows-latest-8-cores]
211
+ pyv: ['3.9', '3.12']
212
+ group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
213
+ steps:
214
+
215
+ - uses: actions/checkout@v4
216
+
217
+ - name: Set up Python ${{ matrix.pyv }}
218
+ uses: actions/setup-python@v5
219
+ with:
220
+ python-version: ${{ matrix.pyv }}
221
+ cache: 'pip'
222
+
223
+ - name: Upgrade nox and uv
224
+ run: |
225
+ python -m pip install --upgrade 'nox[uv]'
226
+ nox --version
227
+ uv --version
228
+
229
+ - name: Run examples
230
+ run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.5.6'
27
+ rev: 'v0.5.7'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -81,6 +81,15 @@ Requires-Dist: types-python-dateutil; extra == "dev"
81
81
  Requires-Dist: types-pytz; extra == "dev"
82
82
  Requires-Dist: types-PyYAML; extra == "dev"
83
83
  Requires-Dist: types-requests; extra == "dev"
84
+ Provides-Extra: examples
85
+ Requires-Dist: datachain[tests]; extra == "examples"
86
+ Requires-Dist: numpy<2,>=1; extra == "examples"
87
+ Requires-Dist: defusedxml; extra == "examples"
88
+ Requires-Dist: accelerate; extra == "examples"
89
+ Requires-Dist: unstructured[pdf]; extra == "examples"
90
+ Requires-Dist: pdfplumber==0.11.3; extra == "examples"
91
+ Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
92
+ Requires-Dist: nltk==3.8.1; extra == "examples"
84
93
 
85
94
  |PyPI| |Python Version| |Codecov| |Tests|
86
95
 
@@ -100,102 +109,78 @@ Requires-Dist: types-requests; extra == "dev"
100
109
  AI 🔗 DataChain
101
110
  ----------------
102
111
 
103
- DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
104
- AI engineers build a metadata layer on top of unstructured files and analyze data using
105
- this layer.
112
+ DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
113
+ It is made to organize your unstructured data into datasets and wrangle it at scale on
114
+ your local machine.
106
115
 
107
- 📂 **Raw Files Processing**
108
- Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
109
- Local), version and update datasets.
116
+ Key Features
117
+ ============
110
118
 
111
- 🌟 **Metadata layer.**
112
- Build a metadata layer on top of files using structured sources like CSV, Parquet,
113
- and JSON files.
119
+ 📂 **Storage as a Source of Truth.**
120
+ - Process unstructured data without redundant copies: S3, GCP, Azure, and local
121
+ file systems.
122
+ - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
123
+ - Join files and metadata together into persistent, versioned, columnar datasets.
114
124
 
115
- **Metadata enrichment.**
116
- Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
125
+ 🐍 **Python-friendly data pipelines.**
126
+ - Operate on Python objects and object fields.
127
+ - Built-in parallelization and out-of-memory compute without a need in SQL or
128
+ Spark jobs.
117
129
 
118
- 🛠️ **Data Transformation.**
119
- Transform metadata using traditional methods like filtering, grouping, joining, and
120
- others.
130
+ 🧠 **Data Enrichment and Processing.**
131
+ - Generate metadata columns using local AI models and LLM APIs.
132
+ - Filter, join, and group by AI metadata. Vector similarity search.
133
+ - Pass datasets to Pytorch and Tensorflow, or export back into storage.
121
134
 
122
- 🐍 **User-friendly interface.**
123
- Operate efficiently with familiar Python objects and object fields, eliminating the
124
- need for SQL.
135
+ 🚀 **Efficiency.**
136
+ - Parallelization, out-of-memory workloads and data caching.
137
+ - Vectorized operations on Python object fields: sum, count, avg, etc.
138
+ - Vector search on embeddings.
125
139
 
126
140
 
141
+ Quick Start
142
+ -----------
143
+
127
144
  .. code:: console
128
145
 
129
146
  $ pip install datachain
130
147
 
131
148
 
132
- Data Structures
133
- ===============
134
-
135
- DataChain introduces expressive data structures tailored for AI-specific workload:
136
-
137
- - **Dataset:** Preserves the file-references and meta-information. Takes care of Python
138
- object serialization, dataset versioning and difference. Operations on dataset:
139
-
140
- - **Transformations:** traditional data-frame or SQL operations such as filtering,
141
- grouping, joining.
142
- - **Enrichments:** mapping, aggregating and generating using customer’s Python
143
- code. This is needed to work with ML inference and LLM calls.
144
-
145
- - **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
146
- mode - only when needed.
147
-
148
- DataChain name comes from these major data structures: dataset and chaining.
149
-
149
+ Selecting files using JSON metadata
150
+ ======================================
150
151
 
151
- What’s new in DataChain?
152
- ========================
152
+ A storage consists of images of cats and dogs (`dog.1048.jpg`, `cat.1009.jpg`),
153
+ annotated with ground truth and model inferences in the 'json-pairs' format,
154
+ where each image has a matching JSON file like `cat.1009.json`:
153
155
 
154
- The project combines multiple ideas from different areas in order to simplify AI
155
- use-cases and at the same time to fit it into traditional data infrastructure.
156
+ .. code:: json
156
157
 
157
- - **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
158
- native language for AI. It’s powered by `Pydantic`_ data models.
159
- - **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
160
- group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
161
- needed for distributed computations.
162
- - **Resuming data processing** (in development). Introduces idempotent operations,
163
- allowing data processing to resume from the last successful process file/record/batch
164
- if it fails due to issues like failed LLM calls, ML inference or file download.
158
+ {
159
+ "class": "cat", "id": "1009", "num_annotators": 8,
160
+ "inference": {"class": "dog", "confidence": 0.68}
161
+ }
165
162
 
166
- Additional relatively new ideas:
163
+ Example of downloading only high-confidence cat images using JSON metadata:
167
164
 
168
- - **Functional style data processing.** Using a functional/chaining approach to data
169
- processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
170
- - **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
171
- and implements data versioning, extending ideas from DVC (developed by the same team).
172
165
 
166
+ .. code:: py
173
167
 
174
- What DataChain is NOT?
175
- ======================
176
-
177
- - **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
178
- `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
179
- version.
180
- - **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
181
- it delegates heavy data transformations to underlying data warehouses and focuses on
182
- AI specific data enrichments and orchestrating all the pieces together.
183
-
168
+ from datachain import Column, DataChain
184
169
 
185
- Quick Start
186
- -----------
170
+ meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
171
+ images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")
187
172
 
188
- Data curation with a local model
189
- =================================
173
+ images_id = images.map(id=lambda file: file.path.split('.')[-2])
174
+ annotated = images_id.merge(meta, on="id", right_on="meta.id")
190
175
 
191
- We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
192
- - 50 files total in this example.
193
- These dialogs involve users chatting with a bot while looking for better wireless plans.
194
- Our goal is to identify the successful dialogs.
176
+ likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
177
+ & (Column("meta.inference.class_") == "cat"))
178
+ likely_cats.export_files("high-confidence-cats/", signal="file")
195
179
 
196
- The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
197
180
 
198
- First, we'll show batch inference with a simple sentiment model using the `transformers` library:
181
+ Data curation with a local AI model
182
+ ===================================
183
+ Batch inference with a simple sentiment model using the `transformers` library:
199
184
 
200
185
  .. code:: shell
201
186
 
@@ -246,30 +231,30 @@ LLM judging chatbots
246
231
  =============================
247
232
 
248
233
  LLMs can work as efficient universal classifiers. In the example below,
249
- we employ a free API from Mistral to judge the chatbot performance. Please get a free
234
+ we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
250
235
  Mistral API key at https://console.mistral.ai
251
236
 
237
+
252
238
  .. code:: shell
253
239
 
254
- $ pip install mistralai
240
+ $ pip install 'mistralai>=1.0.0'
255
241
  $ export MISTRAL_API_KEY=_your_key_
256
242
 
257
243
  DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
258
244
 
259
245
  .. code:: py
260
246
 
261
- from mistralai.client import MistralClient
262
- from mistralai.models.chat_completion import ChatMessage
247
+ from mistralai import Mistral
263
248
  from datachain import File, DataChain, Column
264
249
 
265
250
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
266
251
 
267
252
  def eval_dialogue(file: File) -> bool:
268
- client = MistralClient()
269
- response = client.chat(
253
+ client = Mistral()
254
+ response = client.chat.complete(
270
255
  model="open-mixtral-8x22b",
271
- messages=[ChatMessage(role="system", content=PROMPT),
272
- ChatMessage(role="user", content=file.read())])
256
+ messages=[{"role": "system", "content": PROMPT},
257
+ {"role": "user", "content": file.read()}])
273
258
  result = response.choices[0].message.content
274
259
  return result.lower().startswith("success")
275
260
 
@@ -309,8 +294,8 @@ Instead of extracting this information from the Mistral response data structure
309
294
 
310
295
  .. code:: py
311
296
 
312
- from mistralai.client import MistralClient
313
- from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
297
+ from mistralai import Mistral
298
+ from mistralai.models import ChatCompletionResponse
314
299
  from datachain import File, DataChain, Column
315
300
 
316
301
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
@@ -319,8 +304,8 @@ Instead of extracting this information from the Mistral response data structure
319
304
  client = Mistral()
320
305
  return client.chat.complete(
321
306
  model="open-mixtral-8x22b",
322
- messages=[ChatMessage(role="system", content=PROMPT),
323
- ChatMessage(role="user", content=file.read())])
307
+ messages=[{"role": "system", "content": PROMPT},
308
+ {"role": "user", "content": file.read()}])
324
309
 
325
310
  chain = (
326
311
  DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
@@ -438,7 +423,10 @@ Tutorials
438
423
  ---------
439
424
 
440
425
  * `Getting Started`_
441
- * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
426
+ * `Multimodal <https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`__)
427
+ * `LLM evaluations <https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`__)
428
+ * `Reading JSON metadata <https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`__)
429
+
442
430
 
443
431
  Contributions
444
432
  -------------
@@ -16,102 +16,78 @@
16
16
  AI 🔗 DataChain
17
17
  ----------------
18
18
 
19
- DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
20
- AI engineers build a metadata layer on top of unstructured files and analyze data using
21
- this layer.
19
+ DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
20
+ It is made to organize your unstructured data into datasets and wrangle it at scale on
21
+ your local machine.
22
22
 
23
- 📂 **Raw Files Processing**
24
- Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
25
- Local), version and update datasets.
23
+ Key Features
24
+ ============
26
25
 
27
- 🌟 **Metadata layer.**
28
- Build a metadata layer on top of files using structured sources like CSV, Parquet,
29
- and JSON files.
26
+ 📂 **Storage as a Source of Truth.**
27
+ - Process unstructured data without redundant copies: S3, GCP, Azure, and local
28
+ file systems.
29
+ - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
30
+ - Join files and metadata together into persistent, versioned, columnar datasets.
30
31
 
31
- **Metadata enrichment.**
32
- Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
32
+ 🐍 **Python-friendly data pipelines.**
33
+ - Operate on Python objects and object fields.
34
+ - Built-in parallelization and out-of-memory compute without a need in SQL or
35
+ Spark jobs.
33
36
 
34
- 🛠️ **Data Transformation.**
35
- Transform metadata using traditional methods like filtering, grouping, joining, and
36
- others.
37
+ 🧠 **Data Enrichment and Processing.**
38
+ - Generate metadata columns using local AI models and LLM APIs.
39
+ - Filter, join, and group by AI metadata. Vector similarity search.
40
+ - Pass datasets to Pytorch and Tensorflow, or export back into storage.
37
41
 
38
- 🐍 **User-friendly interface.**
39
- Operate efficiently with familiar Python objects and object fields, eliminating the
40
- need for SQL.
42
+ 🚀 **Efficiency.**
43
+ - Parallelization, out-of-memory workloads and data caching.
44
+ - Vectorized operations on Python object fields: sum, count, avg, etc.
45
+ - Vector search on embeddings.
41
46
 
42
47
 
48
+ Quick Start
49
+ -----------
50
+
43
51
  .. code:: console
44
52
 
45
53
  $ pip install datachain
46
54
 
47
55
 
48
- Data Structures
49
- ===============
50
-
51
- DataChain introduces expressive data structures tailored for AI-specific workload:
52
-
53
- - **Dataset:** Preserves the file-references and meta-information. Takes care of Python
54
- object serialization, dataset versioning and difference. Operations on dataset:
55
-
56
- - **Transformations:** traditional data-frame or SQL operations such as filtering,
57
- grouping, joining.
58
- - **Enrichments:** mapping, aggregating and generating using customer’s Python
59
- code. This is needed to work with ML inference and LLM calls.
60
-
61
- - **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
62
- mode - only when needed.
63
-
64
- DataChain name comes from these major data structures: dataset and chaining.
65
-
56
+ Selecting files using JSON metadata
57
+ ======================================
66
58
 
67
- What’s new in DataChain?
68
- ========================
59
+ A storage consists of images of cats and dogs (`dog.1048.jpg`, `cat.1009.jpg`),
60
+ annotated with ground truth and model inferences in the 'json-pairs' format,
61
+ where each image has a matching JSON file like `cat.1009.json`:
69
62
 
70
- The project combines multiple ideas from different areas in order to simplify AI
71
- use-cases and at the same time to fit it into traditional data infrastructure.
63
+ .. code:: json
72
64
 
73
- - **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
74
- native language for AI. It’s powered by `Pydantic`_ data models.
75
- - **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
76
- group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
77
- needed for distributed computations.
78
- - **Resuming data processing** (in development). Introduces idempotent operations,
79
- allowing data processing to resume from the last successful process file/record/batch
80
- if it fails due to issues like failed LLM calls, ML inference or file download.
65
+ {
66
+ "class": "cat", "id": "1009", "num_annotators": 8,
67
+ "inference": {"class": "dog", "confidence": 0.68}
68
+ }
81
69
 
82
- Additional relatively new ideas:
70
+ Example of downloading only high-confidence cat images using JSON metadata:
83
71
 
84
- - **Functional style data processing.** Using a functional/chaining approach to data
85
- processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
86
- - **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
87
- and implements data versioning, extending ideas from DVC (developed by the same team).
88
72
 
73
+ .. code:: py
89
74
 
90
- What DataChain is NOT?
91
- ======================
92
-
93
- - **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
94
- `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
95
- version.
96
- - **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
97
- it delegates heavy data transformations to underlying data warehouses and focuses on
98
- AI specific data enrichments and orchestrating all the pieces together.
99
-
75
+ from datachain import Column, DataChain
100
76
 
101
- Quick Start
102
- -----------
77
+ meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
78
+ images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")
103
79
 
104
- Data curation with a local model
105
- =================================
80
+ images_id = images.map(id=lambda file: file.path.split('.')[-2])
81
+ annotated = images_id.merge(meta, on="id", right_on="meta.id")
106
82
 
107
- We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
108
- - 50 files total in this example.
109
- These dialogs involve users chatting with a bot while looking for better wireless plans.
110
- Our goal is to identify the successful dialogs.
83
+ likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
84
+ & (Column("meta.inference.class_") == "cat"))
85
+ likely_cats.export_files("high-confidence-cats/", signal="file")
111
86
 
112
- The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
113
87
 
114
- First, we'll show batch inference with a simple sentiment model using the `transformers` library:
88
+ Data curation with a local AI model
89
+ ===================================
90
+ Batch inference with a simple sentiment model using the `transformers` library:
115
91
 
116
92
  .. code:: shell
117
93
 
@@ -162,30 +138,30 @@ LLM judging chatbots
162
138
  =============================
163
139
 
164
140
  LLMs can work as efficient universal classifiers. In the example below,
165
- we employ a free API from Mistral to judge the chatbot performance. Please get a free
141
+ we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
166
142
  Mistral API key at https://console.mistral.ai
167
143
 
144
+
168
145
  .. code:: shell
169
146
 
170
- $ pip install mistralai
147
+ $ pip install 'mistralai>=1.0.0'
171
148
  $ export MISTRAL_API_KEY=_your_key_
172
149
 
173
150
  DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
174
151
 
175
152
  .. code:: py
176
153
 
177
- from mistralai.client import MistralClient
178
- from mistralai.models.chat_completion import ChatMessage
154
+ from mistralai import Mistral
179
155
  from datachain import File, DataChain, Column
180
156
 
181
157
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
182
158
 
183
159
  def eval_dialogue(file: File) -> bool:
184
- client = MistralClient()
185
- response = client.chat(
160
+ client = Mistral()
161
+ response = client.chat.complete(
186
162
  model="open-mixtral-8x22b",
187
- messages=[ChatMessage(role="system", content=PROMPT),
188
- ChatMessage(role="user", content=file.read())])
163
+ messages=[{"role": "system", "content": PROMPT},
164
+ {"role": "user", "content": file.read()}])
189
165
  result = response.choices[0].message.content
190
166
  return result.lower().startswith("success")
191
167
 
@@ -225,8 +201,8 @@ Instead of extracting this information from the Mistral response data structure
225
201
 
226
202
  .. code:: py
227
203
 
228
- from mistralai.client import MistralClient
229
- from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
204
+ from mistralai import Mistral
205
+ from mistralai.models import ChatCompletionResponse
230
206
  from datachain import File, DataChain, Column
231
207
 
232
208
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
@@ -235,8 +211,8 @@ Instead of extracting this information from the Mistral response data structure
235
211
  client = Mistral()
236
212
  return client.chat.complete(
237
213
  model="open-mixtral-8x22b",
238
- messages=[ChatMessage(role="system", content=PROMPT),
239
- ChatMessage(role="user", content=file.read())])
214
+ messages=[{"role": "system", "content": PROMPT},
215
+ {"role": "user", "content": file.read()}])
240
216
 
241
217
  chain = (
242
218
  DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
@@ -354,7 +330,10 @@ Tutorials
354
330
  ---------
355
331
 
356
332
  * `Getting Started`_
357
- * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
333
+ * `Multimodal <https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`__)
334
+ * `LLM evaluations <https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`__)
335
+ * `Reading JSON metadata <https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`__)
336
+
358
337
 
359
338
  Contributions
360
339
  -------------
@@ -24,8 +24,7 @@ For example, let us consider the New Yorker Cartoon caption contest dataset, whe
24
24
  # pip install transformers
25
25
  #
26
26
 
27
- from datachain.lib.dc import Column, DataChain
28
- from datachain.lib.file import File
27
+ from datachain.lib.dc import Column, DataChain, File
29
28
  from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
30
29
 
31
30
  images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
@@ -75,7 +74,7 @@ plt.show()
75
74
 
76
75
  If interested to see more multimodal examples for DataChain, please follow this tutorial:
77
76
 
78
- [https://github.com/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb](https://github.com/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb)
77
+ [https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb)
79
78
 
80
79
  ### Handling Python objects
81
80
 
@@ -134,7 +133,7 @@ chain = (
134
133
 
135
134
  If you are interested in more LLM evaluation examples for DataChain, please follow this tutorial:
136
135
 
137
- [https://github.com/iterative/datachain/blob/main/examples/llm/llm_chatbot_evaluation.ipynb](https://github.com/iterative/datachain/blob/main/examples/llm/llm_chatbot_evaluation.ipynb)
136
+ [https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb](https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb)
138
137
 
139
138
  ### Vectorized analytics
140
139
 
@@ -280,7 +279,7 @@ images_with_dogs.select("annotations", "file.name").show()
280
279
  ```
281
280
  For in-depth review of working with JSON metadata, please follow this tutorial:
282
281
 
283
- [https://github.com/iterative/datachain/blob/main/examples/get_started/json-metadata-tutorial.ipynb](https://github.com/iterative/datachain/blob/main/examples/get_started/json-metadata-tutorial.ipynb)
282
+ [https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb](https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb)
284
283
 
285
284
  ### Passing data to training
286
285
 
@@ -300,4 +299,4 @@ train(loader, model, optimizer)
300
299
 
301
300
  See a larger example for CLIP fine-tuning here:
302
301
 
303
- [https://github.com/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb](https://github.com/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb)
302
+ [https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb)
@@ -10,13 +10,13 @@ def num_chars_udf(file):
10
10
  return ([],)
11
11
 
12
12
 
13
- ds = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/")
14
- ds.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
13
+ dc = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/")
14
+ dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
15
15
  "file.path", "num_chars"
16
16
  ).show(5)
17
17
 
18
18
  (
19
- ds.mutate(
19
+ dc.mutate(
20
20
  length=string.length(path.name(C("file.path"))),
21
21
  parts=string.split(path.name(C("file.path")), literal(".")),
22
22
  )
@@ -25,7 +25,7 @@ ds.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
25
25
  )
26
26
 
27
27
  (
28
- ds.mutate(
28
+ dc.mutate(
29
29
  stem=path.file_stem(path.name(C("file.path"))),
30
30
  ext=path.file_ext(path.name(C("file.path"))),
31
31
  )
@@ -33,14 +33,16 @@ ds.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
33
33
  .show(5)
34
34
  )
35
35
 
36
+
37
+ chain = dc.mutate(
38
+ a=array.length(string.split(C("file.path"), literal("/"))),
39
+ b=array.length(string.split(path.name(C("file.path")), literal("0"))),
40
+ )
41
+
36
42
  (
37
- ds.mutate(
38
- a=array.length(string.split(C("file.path"), literal("/"))),
39
- b=array.length(string.split(path.name(C("file.path")), literal("0"))),
40
- )
41
- .mutate(
42
- greatest=greatest(C("a"), C("b")),
43
- least=least(C("a"), C("b")),
43
+ chain.mutate(
44
+ greatest=greatest(chain.column("a"), C("b")),
45
+ least=least(chain.column("a"), C("b")),
44
46
  )
45
47
  .select("a", "b", "greatest", "least")
46
48
  .show(10)
@@ -1,5 +1,6 @@
1
1
  # pip install Pillow torchvision
2
2
 
3
+ import os
3
4
  from posixpath import basename
4
5
 
5
6
  import torch
@@ -11,6 +12,7 @@ from datachain import C, DataChain
11
12
  from datachain.torch import label_to_int
12
13
 
13
14
  STORAGE = "gs://datachain-demo/dogs-and-cats/"
15
+ NUM_EPOCHS = os.getenv("NUM_EPOCHS", "3")
14
16
 
15
17
  # Define transformation for data preprocessing
16
18
  transform = v2.Compose(
@@ -66,8 +68,7 @@ if __name__ == "__main__":
66
68
  optimizer = optim.Adam(model.parameters(), lr=0.001)
67
69
 
68
70
  # Train the model
69
- num_epochs = 3
70
- for epoch in range(num_epochs):
71
+ for epoch in range(int(NUM_EPOCHS)):
71
72
  for i, data in enumerate(train_loader):
72
73
  inputs, labels = data
73
74
  optimizer.zero_grad()