datachain 0.2.8__tar.gz → 0.2.10__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (276)
  1. {datachain-0.2.8 → datachain-0.2.10}/.github/workflows/tests.yml +71 -1
  2. {datachain-0.2.8/src/datachain.egg-info → datachain-0.2.10}/PKG-INFO +13 -12
  3. {datachain-0.2.8 → datachain-0.2.10}/README.rst +6 -5
  4. {datachain-0.2.8 → datachain-0.2.10}/examples/json-csv-reader.py +4 -2
  5. datachain-0.2.10/examples/llm-claude-aggregate-query.py +60 -0
  6. {datachain-0.2.8 → datachain-0.2.10}/examples/llm-claude-simple-query.py +31 -11
  7. datachain-0.2.10/examples/llm-claude.py +42 -0
  8. {datachain-0.2.8 → datachain-0.2.10}/examples/multimodal/clip_fine_tuning.ipynb +114 -111
  9. {datachain-0.2.8 → datachain-0.2.10}/examples/openimage-detect.py +1 -1
  10. {datachain-0.2.8 → datachain-0.2.10}/examples/pose_detection.py +1 -2
  11. {datachain-0.2.8 → datachain-0.2.10}/examples/wds.py +3 -6
  12. {datachain-0.2.8 → datachain-0.2.10}/mkdocs.yml +0 -3
  13. {datachain-0.2.8 → datachain-0.2.10}/pyproject.toml +4 -4
  14. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/__init__.py +17 -8
  15. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/catalog/catalog.py +5 -5
  16. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/cli.py +0 -2
  17. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/data_storage/schema.py +5 -5
  18. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/data_storage/sqlite.py +1 -1
  19. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/data_storage/warehouse.py +7 -7
  20. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/arrow.py +25 -8
  21. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/clip.py +6 -11
  22. datachain-0.2.10/src/datachain/lib/convert/flatten.py +67 -0
  23. datachain-0.2.10/src/datachain/lib/convert/type_converter.py +96 -0
  24. datachain-0.2.10/src/datachain/lib/convert/unflatten.py +69 -0
  25. datachain-0.2.10/src/datachain/lib/convert/values_to_tuples.py +85 -0
  26. datachain-0.2.10/src/datachain/lib/data_model.py +74 -0
  27. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/dc.py +192 -167
  28. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/feature_registry.py +36 -10
  29. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/file.py +41 -41
  30. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/gpt4_vision.py +1 -9
  31. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/hf_image_to_text.py +9 -17
  32. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/hf_pipeline.py +4 -12
  33. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/image.py +2 -18
  34. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/image_transform.py +0 -1
  35. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/iptc_exif_xmp.py +8 -15
  36. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/meta_formats.py +1 -5
  37. datachain-0.2.10/src/datachain/lib/model_store.py +77 -0
  38. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/pytorch.py +9 -21
  39. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/signal_schema.py +120 -58
  40. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/text.py +5 -16
  41. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/udf.py +114 -30
  42. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/udf_signature.py +5 -5
  43. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/webdataset.py +3 -4
  44. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/webdataset_laion.py +2 -3
  45. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/node.py +4 -4
  46. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/query/batch.py +1 -1
  47. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/query/dataset.py +40 -60
  48. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/query/dispatch.py +28 -17
  49. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/query/udf.py +46 -26
  50. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/remote/studio.py +1 -9
  51. datachain-0.2.10/src/datachain/torch/__init__.py +21 -0
  52. {datachain-0.2.8 → datachain-0.2.10/src/datachain.egg-info}/PKG-INFO +13 -12
  53. {datachain-0.2.8 → datachain-0.2.10}/src/datachain.egg-info/SOURCES.txt +8 -7
  54. {datachain-0.2.8 → datachain-0.2.10}/src/datachain.egg-info/requires.txt +7 -7
  55. {datachain-0.2.8 → datachain-0.2.10}/tests/conftest.py +1 -1
  56. {datachain-0.2.8 → datachain-0.2.10}/tests/examples/test_wds_e2e.py +1 -1
  57. {datachain-0.2.8 → datachain-0.2.10}/tests/func/test_catalog.py +2 -2
  58. {datachain-0.2.8 → datachain-0.2.10}/tests/func/test_datachain.py +21 -3
  59. {datachain-0.2.8 → datachain-0.2.10}/tests/func/test_dataset_query.py +40 -40
  60. {datachain-0.2.8 → datachain-0.2.10}/tests/func/test_datasets.py +2 -2
  61. {datachain-0.2.8 → datachain-0.2.10}/tests/func/test_pull.py +3 -3
  62. {datachain-0.2.8 → datachain-0.2.10}/tests/func/test_pytorch.py +5 -12
  63. {datachain-0.2.8 → datachain-0.2.10}/tests/scripts/feature_class.py +3 -2
  64. {datachain-0.2.8 → datachain-0.2.10}/tests/scripts/feature_class_parallel.py +5 -4
  65. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_arrow.py +17 -3
  66. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_datachain.py +173 -89
  67. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_datachain_bootstrap.py +5 -5
  68. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_datachain_merge.py +15 -15
  69. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_feature.py +86 -152
  70. datachain-0.2.10/tests/unit/lib/test_feature_utils.py +109 -0
  71. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_image.py +1 -1
  72. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_signal_schema.py +15 -27
  73. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_udf_signature.py +6 -5
  74. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_utils.py +5 -5
  75. datachain-0.2.10/tests/unit/sql/sqlite/__init__.py +0 -0
  76. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_dataset.py +3 -3
  77. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_listing.py +2 -2
  78. datachain-0.2.10/tests/unit/test_module_exports.py +93 -0
  79. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_udf.py +14 -60
  80. datachain-0.2.8/docs/tutorials/cv_intro.md +0 -217
  81. datachain-0.2.8/docs/tutorials/udfs.md +0 -94
  82. datachain-0.2.8/examples/llm-claude-aggregate-query.py +0 -40
  83. datachain-0.2.8/examples/llm-claude.py +0 -21
  84. datachain-0.2.8/src/datachain/image/__init__.py +0 -3
  85. datachain-0.2.8/src/datachain/lib/cached_stream.py +0 -38
  86. datachain-0.2.8/src/datachain/lib/claude.py +0 -69
  87. datachain-0.2.8/src/datachain/lib/feature.py +0 -412
  88. datachain-0.2.8/src/datachain/lib/feature_utils.py +0 -154
  89. datachain-0.2.8/tests/unit/lib/test_feature_utils.py +0 -142
  90. datachain-0.2.8/tests/unit/test_module_exports.py +0 -30
  91. {datachain-0.2.8 → datachain-0.2.10}/.cruft.json +0 -0
  92. {datachain-0.2.8 → datachain-0.2.10}/.gitattributes +0 -0
  93. {datachain-0.2.8 → datachain-0.2.10}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  94. {datachain-0.2.8 → datachain-0.2.10}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  95. {datachain-0.2.8 → datachain-0.2.10}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  96. {datachain-0.2.8 → datachain-0.2.10}/.github/codecov.yaml +0 -0
  97. {datachain-0.2.8 → datachain-0.2.10}/.github/dependabot.yml +0 -0
  98. {datachain-0.2.8 → datachain-0.2.10}/.github/workflows/benchmarks.yml +0 -0
  99. {datachain-0.2.8 → datachain-0.2.10}/.github/workflows/release.yml +0 -0
  100. {datachain-0.2.8 → datachain-0.2.10}/.github/workflows/update-template.yaml +0 -0
  101. {datachain-0.2.8 → datachain-0.2.10}/.gitignore +0 -0
  102. {datachain-0.2.8 → datachain-0.2.10}/.pre-commit-config.yaml +0 -0
  103. {datachain-0.2.8 → datachain-0.2.10}/.reuse/dep5 +0 -0
  104. {datachain-0.2.8 → datachain-0.2.10}/CODE_OF_CONDUCT.rst +0 -0
  105. {datachain-0.2.8 → datachain-0.2.10}/CONTRIBUTING.rst +0 -0
  106. {datachain-0.2.8 → datachain-0.2.10}/LICENSE +0 -0
  107. {datachain-0.2.8 → datachain-0.2.10}/LICENSES/Apache-2.0.txt +0 -0
  108. {datachain-0.2.8 → datachain-0.2.10}/LICENSES/BSD-3-Clause.txt +0 -0
  109. {datachain-0.2.8 → datachain-0.2.10}/LICENSES/Python-2.0.txt +0 -0
  110. {datachain-0.2.8 → datachain-0.2.10}/docs/assets/datachain.png +0 -0
  111. {datachain-0.2.8 → datachain-0.2.10}/docs/index.md +0 -0
  112. {datachain-0.2.8 → datachain-0.2.10}/docs/references/catalog.md +0 -0
  113. {datachain-0.2.8 → datachain-0.2.10}/docs/references/datachain.md +0 -0
  114. {datachain-0.2.8 → datachain-0.2.10}/examples/blip2_image_desc_lib.py +0 -0
  115. {datachain-0.2.8 → datachain-0.2.10}/examples/clip.py +0 -0
  116. {datachain-0.2.8 → datachain-0.2.10}/examples/common_sql_functions.py +0 -0
  117. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
  118. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
  119. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
  120. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/README.md +0 -0
  121. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
  122. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
  123. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
  124. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
  125. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
  126. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
  127. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  128. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  129. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  130. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  131. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  132. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  133. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  134. {datachain-0.2.8 → datachain-0.2.10}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  135. {datachain-0.2.8 → datachain-0.2.10}/examples/hf_pipeline.py +0 -0
  136. {datachain-0.2.8 → datachain-0.2.10}/examples/iptc_exif_xmp_lib.py +0 -0
  137. {datachain-0.2.8 → datachain-0.2.10}/examples/llava2_image_desc_lib.py +0 -0
  138. {datachain-0.2.8 → datachain-0.2.10}/examples/loader.py +0 -0
  139. {datachain-0.2.8 → datachain-0.2.10}/examples/neurips/README +0 -0
  140. {datachain-0.2.8 → datachain-0.2.10}/examples/neurips/distance_to_query.py +0 -0
  141. {datachain-0.2.8 → datachain-0.2.10}/examples/neurips/llm_chat.py +0 -0
  142. {datachain-0.2.8 → datachain-0.2.10}/examples/neurips/requirements.txt +0 -0
  143. {datachain-0.2.8 → datachain-0.2.10}/examples/neurips/single_query.py +0 -0
  144. {datachain-0.2.8 → datachain-0.2.10}/examples/neurips/text_loaders.py +0 -0
  145. {datachain-0.2.8 → datachain-0.2.10}/examples/openai_image_desc_lib.py +0 -0
  146. {datachain-0.2.8 → datachain-0.2.10}/examples/torch-loader.py +0 -0
  147. {datachain-0.2.8 → datachain-0.2.10}/examples/udfs/batching.py +0 -0
  148. {datachain-0.2.8 → datachain-0.2.10}/examples/udfs/image_transformation.py +0 -0
  149. {datachain-0.2.8 → datachain-0.2.10}/examples/udfs/parallel.py +0 -0
  150. {datachain-0.2.8 → datachain-0.2.10}/examples/udfs/simple.py +0 -0
  151. {datachain-0.2.8 → datachain-0.2.10}/examples/udfs/stateful.py +0 -0
  152. {datachain-0.2.8 → datachain-0.2.10}/examples/udfs/stateful_similarity.py +0 -0
  153. {datachain-0.2.8 → datachain-0.2.10}/examples/unstructured-text.py +0 -0
  154. {datachain-0.2.8 → datachain-0.2.10}/examples/wds_filtered.py +0 -0
  155. {datachain-0.2.8 → datachain-0.2.10}/examples/zalando/zalando_clip.py +0 -0
  156. {datachain-0.2.8 → datachain-0.2.10}/examples/zalando/zalando_dir_as_class.py +0 -0
  157. {datachain-0.2.8 → datachain-0.2.10}/examples/zalando/zalando_splits_and_classes_ds.py +0 -0
  158. {datachain-0.2.8 → datachain-0.2.10}/examples/zalando/zalando_splits_and_classes_output.py +0 -0
  159. {datachain-0.2.8 → datachain-0.2.10}/noxfile.py +0 -0
  160. {datachain-0.2.8 → datachain-0.2.10}/setup.cfg +0 -0
  161. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/__main__.py +0 -0
  162. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/asyn.py +0 -0
  163. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/cache.py +0 -0
  164. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/catalog/__init__.py +0 -0
  165. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/catalog/datasource.py +0 -0
  166. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/catalog/loader.py +0 -0
  167. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/catalog/subclass.py +0 -0
  168. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/cli_utils.py +0 -0
  169. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/client/__init__.py +0 -0
  170. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/client/azure.py +0 -0
  171. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/client/fileslice.py +0 -0
  172. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/client/fsspec.py +0 -0
  173. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/client/gcs.py +0 -0
  174. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/client/local.py +0 -0
  175. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/client/s3.py +0 -0
  176. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/config.py +0 -0
  177. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/data_storage/__init__.py +0 -0
  178. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/data_storage/db_engine.py +0 -0
  179. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/data_storage/id_generator.py +0 -0
  180. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/data_storage/job.py +0 -0
  181. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/data_storage/metastore.py +0 -0
  182. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/data_storage/serializer.py +0 -0
  183. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/dataset.py +0 -0
  184. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/error.py +0 -0
  185. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/__init__.py +0 -0
  186. {datachain-0.2.8/src/datachain/remote → datachain-0.2.10/src/datachain/lib/convert}/__init__.py +0 -0
  187. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/settings.py +0 -0
  188. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/unstructured.py +0 -0
  189. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/utils.py +0 -0
  190. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/lib/vfile.py +0 -0
  191. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/listing.py +0 -0
  192. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/nodes_fetcher.py +0 -0
  193. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/nodes_thread_pool.py +0 -0
  194. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/progress.py +0 -0
  195. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/py.typed +0 -0
  196. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/query/__init__.py +0 -0
  197. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/query/builtins.py +0 -0
  198. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/query/metrics.py +0 -0
  199. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/query/params.py +0 -0
  200. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/query/schema.py +0 -0
  201. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/query/session.py +0 -0
  202. {datachain-0.2.8/tests/benchmarks → datachain-0.2.10/src/datachain/remote}/__init__.py +0 -0
  203. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/__init__.py +0 -0
  204. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/default/__init__.py +0 -0
  205. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/default/base.py +0 -0
  206. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/functions/__init__.py +0 -0
  207. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/functions/array.py +0 -0
  208. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/functions/conditional.py +0 -0
  209. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/functions/path.py +0 -0
  210. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/functions/random.py +0 -0
  211. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/functions/string.py +0 -0
  212. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/selectable.py +0 -0
  213. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/sqlite/__init__.py +0 -0
  214. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/sqlite/base.py +0 -0
  215. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/sqlite/types.py +0 -0
  216. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/sqlite/vector.py +0 -0
  217. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/types.py +0 -0
  218. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/sql/utils.py +0 -0
  219. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/storage.py +0 -0
  220. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/text/__init__.py +0 -0
  221. {datachain-0.2.8 → datachain-0.2.10}/src/datachain/utils.py +0 -0
  222. {datachain-0.2.8 → datachain-0.2.10}/src/datachain.egg-info/dependency_links.txt +0 -0
  223. {datachain-0.2.8 → datachain-0.2.10}/src/datachain.egg-info/entry_points.txt +0 -0
  224. {datachain-0.2.8 → datachain-0.2.10}/src/datachain.egg-info/top_level.txt +0 -0
  225. {datachain-0.2.8 → datachain-0.2.10}/tests/__init__.py +0 -0
  226. {datachain-0.2.8/tests/examples → datachain-0.2.10/tests/benchmarks}/__init__.py +0 -0
  227. {datachain-0.2.8 → datachain-0.2.10}/tests/benchmarks/conftest.py +0 -0
  228. {datachain-0.2.8 → datachain-0.2.10}/tests/benchmarks/test_ls.py +0 -0
  229. {datachain-0.2.8 → datachain-0.2.10}/tests/benchmarks/test_version.py +0 -0
  230. {datachain-0.2.8 → datachain-0.2.10}/tests/data.py +0 -0
  231. {datachain-0.2.8/tests/func → datachain-0.2.10/tests/examples}/__init__.py +0 -0
  232. {datachain-0.2.8 → datachain-0.2.10}/tests/examples/wds_data.py +0 -0
  233. {datachain-0.2.8/tests/unit → datachain-0.2.10/tests/func}/__init__.py +0 -0
  234. {datachain-0.2.8 → datachain-0.2.10}/tests/func/test_client.py +0 -0
  235. {datachain-0.2.8 → datachain-0.2.10}/tests/func/test_ls.py +0 -0
  236. {datachain-0.2.8 → datachain-0.2.10}/tests/func/test_query.py +0 -0
  237. {datachain-0.2.8 → datachain-0.2.10}/tests/scripts/name_len_normal.py +0 -0
  238. {datachain-0.2.8 → datachain-0.2.10}/tests/scripts/name_len_slow.py +0 -0
  239. {datachain-0.2.8 → datachain-0.2.10}/tests/test_cli_e2e.py +0 -0
  240. {datachain-0.2.8 → datachain-0.2.10}/tests/test_query_e2e.py +0 -0
  241. {datachain-0.2.8/tests/unit/lib → datachain-0.2.10/tests/unit}/__init__.py +0 -0
  242. {datachain-0.2.8/tests/unit/sql → datachain-0.2.10/tests/unit/lib}/__init__.py +0 -0
  243. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/conftest.py +0 -0
  244. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_clip.py +0 -0
  245. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_file.py +0 -0
  246. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_text.py +0 -0
  247. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/lib/test_webdataset.py +0 -0
  248. {datachain-0.2.8/tests/unit/sql/sqlite → datachain-0.2.10/tests/unit/sql}/__init__.py +0 -0
  249. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/sql/sqlite/test_utils.py +0 -0
  250. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/sql/test_array.py +0 -0
  251. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/sql/test_conditional.py +0 -0
  252. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/sql/test_path.py +0 -0
  253. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/sql/test_random.py +0 -0
  254. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/sql/test_selectable.py +0 -0
  255. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/sql/test_string.py +0 -0
  256. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_asyn.py +0 -0
  257. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_cache.py +0 -0
  258. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_catalog.py +0 -0
  259. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_catalog_loader.py +0 -0
  260. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_cli_parsing.py +0 -0
  261. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_client.py +0 -0
  262. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_client_s3.py +0 -0
  263. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_data_storage.py +0 -0
  264. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_database_engine.py +0 -0
  265. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_dispatch.py +0 -0
  266. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_fileslice.py +0 -0
  267. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_id_generator.py +0 -0
  268. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_metastore.py +0 -0
  269. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_query_metrics.py +0 -0
  270. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_query_params.py +0 -0
  271. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_serializer.py +0 -0
  272. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_session.py +0 -0
  273. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_storage.py +0 -0
  274. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_utils.py +0 -0
  275. {datachain-0.2.8 → datachain-0.2.10}/tests/unit/test_warehouse.py +0 -0
  276. {datachain-0.2.8 → datachain-0.2.10}/tests/utils.py +0 -0
@@ -50,7 +50,7 @@ jobs:
50
50
  - name: Lint code
51
51
  run: nox -s lint
52
52
 
53
- tests:
53
+ datachain:
54
54
  timeout-minutes: 25
55
55
  runs-on: ${{ matrix.os }}
56
56
  strategy:
@@ -125,3 +125,73 @@ jobs:
125
125
 
126
126
  - name: Build docs
127
127
  run: nox -s docs
128
+
129
+
130
+ studio:
131
+ if: '!github.event.pull_request.head.repo.fork'
132
+ runs-on: ubuntu-latest-16-cores
133
+ strategy:
134
+ matrix:
135
+ pyv: ['3.12']
136
+ group: [1, 2, 3, 4, 5, 6]
137
+ services:
138
+ postgres:
139
+ image: postgres:16.3
140
+ ports:
141
+ - 5432:5432
142
+ env:
143
+ POSTGRES_USER: test
144
+ POSTGRES_DB: database
145
+ POSTGRES_HOST_AUTH_METHOD: trust
146
+ clickhouse:
147
+ image: clickhouse/clickhouse-server:24
148
+ ports:
149
+ - 8123:8123
150
+ - 9010:9000
151
+ env:
152
+ CLICKHOUSE_DB: studio_local_db
153
+ CLICKHOUSE_USER: studio_local
154
+ CLICKHOUSE_PASSWORD: ch123456789!
155
+ CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
156
+ redis:
157
+ image: redis:7.2.5
158
+ ports:
159
+ - 6379:6379
160
+ steps:
161
+
162
+ - name: Check out Studio
163
+ uses: actions/checkout@v4
164
+ with:
165
+ fetch-depth: 0
166
+ repository: iterative/studio
167
+ ref: develop
168
+ token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
169
+
170
+ - name: Check out repository
171
+ uses: actions/checkout@v4
172
+ with:
173
+ path: './backend/datachain'
174
+ fetch-depth: 0
175
+
176
+ - name: Set up Python ${{ matrix.pyv }}
177
+ uses: actions/setup-python@v5
178
+ with:
179
+ python-version: ${{ matrix.pyv }}
180
+ cache: 'pip'
181
+
182
+ - name: Install uv
183
+ run: |
184
+ python -m pip install --upgrade uv
185
+ uv --version
186
+
187
+ - name: Install dependencies
188
+ run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
189
+
190
+ - name: Run tests
191
+ # Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
192
+ run: >
193
+ pytest
194
+ --config-file=pyproject.toml -rsx
195
+ --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
196
+ tests ../datachain/tests
197
+ working-directory: backend/datachain_server
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.8
3
+ Version: 0.2.10
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -39,6 +39,7 @@ Requires-Dist: ujson>=5.9.0
39
39
  Requires-Dist: pydantic<3,>=2
40
40
  Requires-Dist: jmespath>=1.0
41
41
  Requires-Dist: datamodel-code-generator>=0.25
42
+ Requires-Dist: Pillow<11,>=10.0.0
42
43
  Provides-Extra: docs
43
44
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
44
45
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -46,11 +47,10 @@ Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
46
47
  Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
47
48
  Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
48
49
  Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
49
- Provides-Extra: cv
50
- Requires-Dist: Pillow<11,>=10.0.0; extra == "cv"
51
- Requires-Dist: torch>=2.1.0; extra == "cv"
52
- Requires-Dist: torchvision; extra == "cv"
53
- Requires-Dist: transformers>=4.36.0; extra == "cv"
50
+ Provides-Extra: torch
51
+ Requires-Dist: torch>=2.1.0; extra == "torch"
52
+ Requires-Dist: torchvision; extra == "torch"
53
+ Requires-Dist: transformers>=4.36.0; extra == "torch"
54
54
  Provides-Extra: remote
55
55
  Requires-Dist: lz4; extra == "remote"
56
56
  Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
@@ -58,7 +58,7 @@ Requires-Dist: requests>=2.22.0; extra == "remote"
58
58
  Provides-Extra: vector
59
59
  Requires-Dist: usearch; extra == "vector"
60
60
  Provides-Extra: tests
61
- Requires-Dist: datachain[cv,remote,vector]; extra == "tests"
61
+ Requires-Dist: datachain[remote,torch,vector]; extra == "tests"
62
62
  Requires-Dist: pytest<9,>=8; extra == "tests"
63
63
  Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
64
64
  Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
@@ -89,11 +89,11 @@ Requires-Dist: types-ujson; extra == "dev"
89
89
  .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
90
90
  :target: https://pypi.org/project/datachain
91
91
  :alt: Python Version
92
- .. |Codecov| image:: https://codecov.io/gh/iterative/dvcx/branch/main/graph/badge.svg?token=VSCP2T9R5X
93
- :target: https://app.codecov.io/gh/iterative/dvcx
92
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
93
+ :target: https://codecov.io/gh/iterative/datachain
94
94
  :alt: Codecov
95
- .. |Tests| image:: https://github.com/iterative/dvcx/workflows/Tests/badge.svg
96
- :target: https://github.com/iterative/dvcx/actions?workflow=Tests
95
+ .. |Tests| image:: https://github.com/iterative/datachain/workflows/Tests/badge.svg
96
+ :target: https://github.com/iterative/datachain/actions?workflow=Tests
97
97
  :alt: Tests
98
98
 
99
99
  AI 🔗 DataChain
@@ -397,7 +397,8 @@ Chain results can be exported or passed directly to Pytorch dataloader. For exam
397
397
  Tutorials
398
398
  ------------------
399
399
 
400
- * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvclive/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
400
+ * `Computer Vision <examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`__)
401
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
401
402
 
402
403
  Contributions
403
404
  --------------------
@@ -6,11 +6,11 @@
6
6
  .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
7
7
  :target: https://pypi.org/project/datachain
8
8
  :alt: Python Version
9
- .. |Codecov| image:: https://codecov.io/gh/iterative/dvcx/branch/main/graph/badge.svg?token=VSCP2T9R5X
10
- :target: https://app.codecov.io/gh/iterative/dvcx
9
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
10
+ :target: https://codecov.io/gh/iterative/datachain
11
11
  :alt: Codecov
12
- .. |Tests| image:: https://github.com/iterative/dvcx/workflows/Tests/badge.svg
13
- :target: https://github.com/iterative/dvcx/actions?workflow=Tests
12
+ .. |Tests| image:: https://github.com/iterative/datachain/workflows/Tests/badge.svg
13
+ :target: https://github.com/iterative/datachain/actions?workflow=Tests
14
14
  :alt: Tests
15
15
 
16
16
  AI 🔗 DataChain
@@ -314,7 +314,8 @@ Chain results can be exported or passed directly to Pytorch dataloader. For exam
314
314
  Tutorials
315
315
  ------------------
316
316
 
317
- * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvclive/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
317
+ * `Computer Vision <examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`__)
318
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
318
319
 
319
320
  Contributions
320
321
  --------------------
@@ -103,7 +103,8 @@ def main():
103
103
  print("========================================================================")
104
104
  print("static CSV with header schema test parsing 3.5K objects")
105
105
  print("========================================================================")
106
- static_csv_ds = DataChain.from_csv(uri, spec=ChatFeature)
106
+ static_csv_ds = DataChain.from_csv(uri, output=ChatFeature, object_name="chat")
107
+ static_csv_ds.print_schema()
107
108
  print(static_csv_ds.to_pandas())
108
109
 
109
110
  uri = "gs://datachain-demo/laion-aesthetics-csv"
@@ -111,7 +112,8 @@ def main():
111
112
  print("========================================================================")
112
113
  print("dynamic CSV with header schema test parsing 3M objects")
113
114
  print("========================================================================")
114
- dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", show_schema=True)
115
+ dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion")
116
+ dynamic_csv_ds.print_schema()
115
117
  print(dynamic_csv_ds.to_pandas())
116
118
 
117
119
 
@@ -0,0 +1,60 @@
1
+ import os
2
+
3
+ import anthropic
4
+ import pandas as pd
5
+ from anthropic.types import Message
6
+
7
+ from datachain import Column, DataChain
8
+ from datachain.sql.functions import path
9
+
10
+ DATA = "gs://dvcx-datalakes/chatbot-public"
11
+ MODEL = "claude-3-opus-20240229"
12
+ PROMPT = """Consider the following dialogues between the 'user' and the 'bot' separated \
13
+ by '====='. The 'user' is a human trying to find the best mobile plan. The 'bot' is a \
14
+ chatbot designed to query the user and offer the best solution. The dialog is \
15
+ successful if the 'bot' is able to gather the information and offer a plan, or inform \
16
+ the user that such plan does not exist. The dialog is not successful if the \
17
+ conversation ends early or the 'user' requests additional functions the 'bot' \
18
+ cannot perform. Read the dialogues and classify them into a fixed number of concise \
19
+ failure reasons covering most failure cases. Present output as JSON list of reason \
20
+ strings and nothing else.
21
+ """
22
+
23
+ TEMPERATURE = 0.9
24
+ DEFAULT_OUTPUT_TOKENS = 1024
25
+
26
+ API_KEY = os.environ.get("ANTHROPIC_API_KEY")
27
+
28
+
29
+ chain = (
30
+ DataChain.from_storage(DATA, type="text")
31
+ .filter(Column("file.name").glob("*.txt"))
32
+ .limit(5)
33
+ .settings(parallel=4, cache=True)
34
+ .agg(
35
+ dialogues=lambda file: ["\n=====\n".join(f.read() for f in file)],
36
+ output=str,
37
+ partition_by=path.file_ext(Column("name")),
38
+ )
39
+ .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
40
+ .map(
41
+ claude=lambda client, dialogues: client.messages.create(
42
+ model=MODEL,
43
+ system=PROMPT,
44
+ messages=[
45
+ {"role": "user", "content": dialogues},
46
+ ],
47
+ temperature=TEMPERATURE,
48
+ max_tokens=DEFAULT_OUTPUT_TOKENS,
49
+ ),
50
+ output=Message,
51
+ )
52
+ .map(
53
+ res=lambda claude: claude.content[0].text if claude.content else [],
54
+ output=str,
55
+ )
56
+ )
57
+
58
+ with pd.option_context("display.max_columns", None):
59
+ df = chain.to_pandas()
60
+ print(df)
@@ -1,12 +1,14 @@
1
1
  import json
2
+ import os
2
3
 
4
+ import anthropic
3
5
  import pandas as pd
6
+ from anthropic.types import Message
7
+ from pydantic import BaseModel
4
8
 
5
- from datachain.lib.claude import claude_processor
6
- from datachain.lib.dc import C, DataChain
7
- from datachain.lib.feature import Feature
9
+ from datachain import Column, DataChain, File
8
10
 
9
- SOURCE = "gs://dvcx-datalakes/chatbot-public"
11
+ DATA = "gs://dvcx-datalakes/chatbot-public"
10
12
  MODEL = "claude-3-opus-20240229"
11
13
  PROMPT = """Consider the dialogue between the 'user' and the 'bot'. \
12
14
  The 'user' is a human trying to find the best mobile plan. \
@@ -20,19 +22,38 @@ if it is successful, and 'Failure' if not. After that, provide \
20
22
  one-sentence explanation of the reasons for this rating. Use only \
21
23
  JSON object as output with the keys 'status', and 'explanation'.
22
24
  """
25
+ TEMPERATURE = 0.9
26
+ DEFAULT_OUTPUT_TOKENS = 1024
23
27
 
28
+ API_KEY = os.environ.get("ANTHROPIC_API_KEY")
24
29
 
25
- class Rating(Feature):
30
+
31
+ class Rating(BaseModel):
26
32
  status: str = ""
27
33
  explanation: str = ""
28
34
 
29
35
 
30
36
  chain = (
31
- DataChain.from_storage(SOURCE, type="text")
32
- .filter(C.name.glob("*.txt"))
33
- .settings(parallel=3)
37
+ DataChain.from_storage(DATA, type="text")
38
+ .filter(Column("file.name").glob("*.txt"))
34
39
  .limit(5)
35
- .map(claude=claude_processor(prompt=PROMPT, model=MODEL))
40
+ .settings(parallel=4, cache=True)
41
+ .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
42
+ .map(
43
+ claude=lambda client, file: client.messages.create(
44
+ model=MODEL,
45
+ system=PROMPT,
46
+ messages=[
47
+ {
48
+ "role": "user",
49
+ "content": file.read() if isinstance(file, File) else file,
50
+ },
51
+ ],
52
+ temperature=TEMPERATURE,
53
+ max_tokens=DEFAULT_OUTPUT_TOKENS,
54
+ ),
55
+ output=Message,
56
+ )
36
57
  .map(
37
58
  rating=lambda claude: Rating(
38
59
  **(json.loads(claude.content[0].text) if claude.content else {})
@@ -41,7 +62,6 @@ chain = (
41
62
  )
42
63
  )
43
64
 
44
- df = chain.to_pandas()
45
-
46
65
  with pd.option_context("display.max_columns", None):
66
+ df = chain.to_pandas()
47
67
  print(df)
@@ -0,0 +1,42 @@
1
+ import os
2
+
3
+ import anthropic
4
+ import pandas as pd
5
+ from anthropic.types import Message
6
+
7
+ from datachain import Column, DataChain, File
8
+
9
+ DATA = "gs://dvcx-datalakes/chatbot-public"
10
+ MODEL = "claude-3-opus-20240229"
11
+ PROMPT = """Summarise the dialog in a sentence"""
12
+ TEMPERATURE = 0.9
13
+ DEFAULT_OUTPUT_TOKENS = 1024
14
+
15
+ API_KEY = os.environ.get("ANTHROPIC_API_KEY")
16
+
17
+ chain = (
18
+ DataChain.from_storage(DATA, type="text")
19
+ .filter(Column("file.name").glob("*.txt"))
20
+ .limit(5)
21
+ .settings(parallel=4, cache=True)
22
+ .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
23
+ .map(
24
+ claude=lambda client, file: client.messages.create(
25
+ model=MODEL,
26
+ system=PROMPT,
27
+ messages=[
28
+ {
29
+ "role": "user",
30
+ "content": file.read() if isinstance(file, File) else file,
31
+ },
32
+ ],
33
+ temperature=TEMPERATURE,
34
+ max_tokens=DEFAULT_OUTPUT_TOKENS,
35
+ ),
36
+ output=Message,
37
+ )
38
+ )
39
+
40
+ with pd.option_context("display.max_columns", None):
41
+ df = chain.to_pandas()
42
+ print(df)