datachain 0.2.9__tar.gz → 0.2.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (278)
  1. {datachain-0.2.9 → datachain-0.2.11}/.github/workflows/tests.yml +71 -1
  2. {datachain-0.2.9/src/datachain.egg-info → datachain-0.2.11}/PKG-INFO +14 -12
  3. {datachain-0.2.9 → datachain-0.2.11}/README.rst +6 -5
  4. {datachain-0.2.9 → datachain-0.2.11}/examples/json-csv-reader.py +4 -2
  5. datachain-0.2.11/examples/llm-claude-aggregate-query.py +57 -0
  6. {datachain-0.2.9 → datachain-0.2.11}/examples/llm-claude-simple-query.py +31 -14
  7. datachain-0.2.11/examples/llm-claude.py +39 -0
  8. {datachain-0.2.9 → datachain-0.2.11}/examples/multimodal/clip_fine_tuning.ipynb +114 -111
  9. {datachain-0.2.9 → datachain-0.2.11}/examples/openimage-detect.py +1 -1
  10. {datachain-0.2.9 → datachain-0.2.11}/examples/pose_detection.py +1 -2
  11. {datachain-0.2.9 → datachain-0.2.11}/examples/wds.py +3 -6
  12. {datachain-0.2.9 → datachain-0.2.11}/mkdocs.yml +0 -3
  13. {datachain-0.2.9 → datachain-0.2.11}/pyproject.toml +5 -4
  14. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/__init__.py +17 -8
  15. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/catalog/catalog.py +5 -5
  16. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/cli.py +0 -2
  17. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/schema.py +5 -5
  18. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/sqlite.py +1 -1
  19. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/warehouse.py +7 -7
  20. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/arrow.py +25 -8
  21. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/clip.py +6 -11
  22. datachain-0.2.11/src/datachain/lib/convert/flatten.py +67 -0
  23. datachain-0.2.11/src/datachain/lib/convert/type_converter.py +96 -0
  24. datachain-0.2.11/src/datachain/lib/convert/unflatten.py +69 -0
  25. datachain-0.2.11/src/datachain/lib/convert/values_to_tuples.py +85 -0
  26. datachain-0.2.11/src/datachain/lib/data_model.py +74 -0
  27. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/dc.py +225 -168
  28. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/file.py +41 -41
  29. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/gpt4_vision.py +1 -9
  30. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/hf_image_to_text.py +9 -17
  31. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/hf_pipeline.py +4 -12
  32. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/image.py +2 -18
  33. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/image_transform.py +0 -1
  34. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/iptc_exif_xmp.py +8 -15
  35. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/meta_formats.py +1 -5
  36. datachain-0.2.11/src/datachain/lib/model_store.py +77 -0
  37. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/pytorch.py +9 -21
  38. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/signal_schema.py +139 -60
  39. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/text.py +5 -16
  40. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/udf.py +114 -30
  41. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/udf_signature.py +5 -5
  42. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/webdataset.py +3 -3
  43. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/webdataset_laion.py +2 -3
  44. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/node.py +4 -4
  45. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/batch.py +1 -1
  46. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/dataset.py +51 -178
  47. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/dispatch.py +43 -30
  48. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/udf.py +46 -26
  49. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/remote/studio.py +1 -9
  50. datachain-0.2.11/src/datachain/torch/__init__.py +21 -0
  51. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/utils.py +39 -0
  52. {datachain-0.2.9 → datachain-0.2.11/src/datachain.egg-info}/PKG-INFO +14 -12
  53. {datachain-0.2.9 → datachain-0.2.11}/src/datachain.egg-info/SOURCES.txt +10 -8
  54. {datachain-0.2.9 → datachain-0.2.11}/src/datachain.egg-info/requires.txt +8 -7
  55. {datachain-0.2.9 → datachain-0.2.11}/tests/conftest.py +1 -1
  56. {datachain-0.2.9 → datachain-0.2.11}/tests/examples/test_wds_e2e.py +1 -1
  57. {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_catalog.py +2 -2
  58. {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_datachain.py +21 -3
  59. {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_dataset_query.py +40 -53
  60. {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_datasets.py +2 -2
  61. datachain-0.2.11/tests/func/test_feature_pickling.py +209 -0
  62. {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_pull.py +3 -3
  63. {datachain-0.2.9 → datachain-0.2.11}/tests/scripts/feature_class.py +3 -2
  64. {datachain-0.2.9 → datachain-0.2.11}/tests/scripts/feature_class_parallel.py +5 -5
  65. datachain-0.2.11/tests/scripts/feature_class_parallel_data_model.py +28 -0
  66. {datachain-0.2.9 → datachain-0.2.11}/tests/test_query_e2e.py +55 -14
  67. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_arrow.py +17 -3
  68. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_datachain.py +230 -133
  69. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_datachain_bootstrap.py +5 -5
  70. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_datachain_merge.py +15 -15
  71. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_feature.py +86 -152
  72. datachain-0.2.11/tests/unit/lib/test_feature_utils.py +109 -0
  73. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_image.py +1 -1
  74. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_signal_schema.py +22 -27
  75. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_udf_signature.py +6 -5
  76. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_utils.py +5 -5
  77. datachain-0.2.11/tests/unit/sql/sqlite/__init__.py +0 -0
  78. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_dataset.py +3 -3
  79. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_listing.py +2 -2
  80. datachain-0.2.11/tests/unit/test_module_exports.py +93 -0
  81. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_udf.py +14 -60
  82. datachain-0.2.9/docs/tutorials/cv_intro.md +0 -217
  83. datachain-0.2.9/docs/tutorials/udfs.md +0 -94
  84. datachain-0.2.9/examples/llm-claude-aggregate-query.py +0 -40
  85. datachain-0.2.9/examples/llm-claude.py +0 -21
  86. datachain-0.2.9/src/datachain/image/__init__.py +0 -3
  87. datachain-0.2.9/src/datachain/lib/cached_stream.py +0 -38
  88. datachain-0.2.9/src/datachain/lib/claude.py +0 -69
  89. datachain-0.2.9/src/datachain/lib/feature.py +0 -412
  90. datachain-0.2.9/src/datachain/lib/feature_registry.py +0 -51
  91. datachain-0.2.9/src/datachain/lib/feature_utils.py +0 -154
  92. datachain-0.2.9/tests/unit/lib/test_feature_utils.py +0 -142
  93. datachain-0.2.9/tests/unit/test_module_exports.py +0 -30
  94. {datachain-0.2.9 → datachain-0.2.11}/.cruft.json +0 -0
  95. {datachain-0.2.9 → datachain-0.2.11}/.gitattributes +0 -0
  96. {datachain-0.2.9 → datachain-0.2.11}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  97. {datachain-0.2.9 → datachain-0.2.11}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  98. {datachain-0.2.9 → datachain-0.2.11}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  99. {datachain-0.2.9 → datachain-0.2.11}/.github/codecov.yaml +0 -0
  100. {datachain-0.2.9 → datachain-0.2.11}/.github/dependabot.yml +0 -0
  101. {datachain-0.2.9 → datachain-0.2.11}/.github/workflows/benchmarks.yml +0 -0
  102. {datachain-0.2.9 → datachain-0.2.11}/.github/workflows/release.yml +0 -0
  103. {datachain-0.2.9 → datachain-0.2.11}/.github/workflows/update-template.yaml +0 -0
  104. {datachain-0.2.9 → datachain-0.2.11}/.gitignore +0 -0
  105. {datachain-0.2.9 → datachain-0.2.11}/.pre-commit-config.yaml +0 -0
  106. {datachain-0.2.9 → datachain-0.2.11}/.reuse/dep5 +0 -0
  107. {datachain-0.2.9 → datachain-0.2.11}/CODE_OF_CONDUCT.rst +0 -0
  108. {datachain-0.2.9 → datachain-0.2.11}/CONTRIBUTING.rst +0 -0
  109. {datachain-0.2.9 → datachain-0.2.11}/LICENSE +0 -0
  110. {datachain-0.2.9 → datachain-0.2.11}/LICENSES/Apache-2.0.txt +0 -0
  111. {datachain-0.2.9 → datachain-0.2.11}/LICENSES/BSD-3-Clause.txt +0 -0
  112. {datachain-0.2.9 → datachain-0.2.11}/LICENSES/Python-2.0.txt +0 -0
  113. {datachain-0.2.9 → datachain-0.2.11}/docs/assets/datachain.png +0 -0
  114. {datachain-0.2.9 → datachain-0.2.11}/docs/index.md +0 -0
  115. {datachain-0.2.9 → datachain-0.2.11}/docs/references/catalog.md +0 -0
  116. {datachain-0.2.9 → datachain-0.2.11}/docs/references/datachain.md +0 -0
  117. {datachain-0.2.9 → datachain-0.2.11}/examples/blip2_image_desc_lib.py +0 -0
  118. {datachain-0.2.9 → datachain-0.2.11}/examples/clip.py +0 -0
  119. {datachain-0.2.9 → datachain-0.2.11}/examples/common_sql_functions.py +0 -0
  120. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
  121. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
  122. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
  123. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/README.md +0 -0
  124. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
  125. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
  126. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
  127. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
  128. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
  129. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
  130. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  131. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  132. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  133. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  134. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  135. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  136. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  137. {datachain-0.2.9 → datachain-0.2.11}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  138. {datachain-0.2.9 → datachain-0.2.11}/examples/hf_pipeline.py +0 -0
  139. {datachain-0.2.9 → datachain-0.2.11}/examples/iptc_exif_xmp_lib.py +0 -0
  140. {datachain-0.2.9 → datachain-0.2.11}/examples/llava2_image_desc_lib.py +0 -0
  141. {datachain-0.2.9 → datachain-0.2.11}/examples/loader.py +0 -0
  142. {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/README +0 -0
  143. {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/distance_to_query.py +0 -0
  144. {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/llm_chat.py +0 -0
  145. {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/requirements.txt +0 -0
  146. {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/single_query.py +0 -0
  147. {datachain-0.2.9 → datachain-0.2.11}/examples/neurips/text_loaders.py +0 -0
  148. {datachain-0.2.9 → datachain-0.2.11}/examples/openai_image_desc_lib.py +0 -0
  149. {datachain-0.2.9 → datachain-0.2.11}/examples/torch-loader.py +0 -0
  150. {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/batching.py +0 -0
  151. {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/image_transformation.py +0 -0
  152. {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/parallel.py +0 -0
  153. {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/simple.py +0 -0
  154. {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/stateful.py +0 -0
  155. {datachain-0.2.9 → datachain-0.2.11}/examples/udfs/stateful_similarity.py +0 -0
  156. {datachain-0.2.9 → datachain-0.2.11}/examples/unstructured-text.py +0 -0
  157. {datachain-0.2.9 → datachain-0.2.11}/examples/wds_filtered.py +0 -0
  158. {datachain-0.2.9 → datachain-0.2.11}/examples/zalando/zalando_clip.py +0 -0
  159. {datachain-0.2.9 → datachain-0.2.11}/examples/zalando/zalando_dir_as_class.py +0 -0
  160. {datachain-0.2.9 → datachain-0.2.11}/examples/zalando/zalando_splits_and_classes_ds.py +0 -0
  161. {datachain-0.2.9 → datachain-0.2.11}/examples/zalando/zalando_splits_and_classes_output.py +0 -0
  162. {datachain-0.2.9 → datachain-0.2.11}/noxfile.py +0 -0
  163. {datachain-0.2.9 → datachain-0.2.11}/setup.cfg +0 -0
  164. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/__main__.py +0 -0
  165. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/asyn.py +0 -0
  166. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/cache.py +0 -0
  167. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/catalog/__init__.py +0 -0
  168. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/catalog/datasource.py +0 -0
  169. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/catalog/loader.py +0 -0
  170. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/catalog/subclass.py +0 -0
  171. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/cli_utils.py +0 -0
  172. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/__init__.py +0 -0
  173. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/azure.py +0 -0
  174. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/fileslice.py +0 -0
  175. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/fsspec.py +0 -0
  176. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/gcs.py +0 -0
  177. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/local.py +0 -0
  178. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/client/s3.py +0 -0
  179. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/config.py +0 -0
  180. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/__init__.py +0 -0
  181. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/db_engine.py +0 -0
  182. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/id_generator.py +0 -0
  183. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/job.py +0 -0
  184. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/metastore.py +0 -0
  185. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/data_storage/serializer.py +0 -0
  186. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/dataset.py +0 -0
  187. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/error.py +0 -0
  188. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/__init__.py +0 -0
  189. {datachain-0.2.9/src/datachain/remote → datachain-0.2.11/src/datachain/lib/convert}/__init__.py +0 -0
  190. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/settings.py +0 -0
  191. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/unstructured.py +0 -0
  192. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/utils.py +0 -0
  193. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/lib/vfile.py +0 -0
  194. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/listing.py +0 -0
  195. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/nodes_fetcher.py +0 -0
  196. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/nodes_thread_pool.py +0 -0
  197. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/progress.py +0 -0
  198. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/py.typed +0 -0
  199. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/__init__.py +0 -0
  200. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/builtins.py +0 -0
  201. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/metrics.py +0 -0
  202. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/params.py +0 -0
  203. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/schema.py +0 -0
  204. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/query/session.py +0 -0
  205. {datachain-0.2.9/tests/benchmarks → datachain-0.2.11/src/datachain/remote}/__init__.py +0 -0
  206. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/__init__.py +0 -0
  207. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/default/__init__.py +0 -0
  208. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/default/base.py +0 -0
  209. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/__init__.py +0 -0
  210. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/array.py +0 -0
  211. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/conditional.py +0 -0
  212. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/path.py +0 -0
  213. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/random.py +0 -0
  214. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/functions/string.py +0 -0
  215. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/selectable.py +0 -0
  216. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/sqlite/__init__.py +0 -0
  217. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/sqlite/base.py +0 -0
  218. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/sqlite/types.py +0 -0
  219. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/sqlite/vector.py +0 -0
  220. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/types.py +0 -0
  221. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/sql/utils.py +0 -0
  222. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/storage.py +0 -0
  223. {datachain-0.2.9 → datachain-0.2.11}/src/datachain/text/__init__.py +0 -0
  224. {datachain-0.2.9 → datachain-0.2.11}/src/datachain.egg-info/dependency_links.txt +0 -0
  225. {datachain-0.2.9 → datachain-0.2.11}/src/datachain.egg-info/entry_points.txt +0 -0
  226. {datachain-0.2.9 → datachain-0.2.11}/src/datachain.egg-info/top_level.txt +0 -0
  227. {datachain-0.2.9 → datachain-0.2.11}/tests/__init__.py +0 -0
  228. {datachain-0.2.9/tests/examples → datachain-0.2.11/tests/benchmarks}/__init__.py +0 -0
  229. {datachain-0.2.9 → datachain-0.2.11}/tests/benchmarks/conftest.py +0 -0
  230. {datachain-0.2.9 → datachain-0.2.11}/tests/benchmarks/test_ls.py +0 -0
  231. {datachain-0.2.9 → datachain-0.2.11}/tests/benchmarks/test_version.py +0 -0
  232. {datachain-0.2.9 → datachain-0.2.11}/tests/data.py +0 -0
  233. {datachain-0.2.9/tests/func → datachain-0.2.11/tests/examples}/__init__.py +0 -0
  234. {datachain-0.2.9 → datachain-0.2.11}/tests/examples/wds_data.py +0 -0
  235. {datachain-0.2.9/tests/unit → datachain-0.2.11/tests/func}/__init__.py +0 -0
  236. {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_client.py +0 -0
  237. {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_ls.py +0 -0
  238. {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_pytorch.py +0 -0
  239. {datachain-0.2.9 → datachain-0.2.11}/tests/func/test_query.py +0 -0
  240. {datachain-0.2.9 → datachain-0.2.11}/tests/scripts/name_len_normal.py +0 -0
  241. {datachain-0.2.9 → datachain-0.2.11}/tests/scripts/name_len_slow.py +0 -0
  242. {datachain-0.2.9 → datachain-0.2.11}/tests/test_cli_e2e.py +0 -0
  243. {datachain-0.2.9/tests/unit/lib → datachain-0.2.11/tests/unit}/__init__.py +0 -0
  244. {datachain-0.2.9/tests/unit/sql → datachain-0.2.11/tests/unit/lib}/__init__.py +0 -0
  245. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/conftest.py +0 -0
  246. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_clip.py +0 -0
  247. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_file.py +0 -0
  248. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_text.py +0 -0
  249. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/lib/test_webdataset.py +0 -0
  250. {datachain-0.2.9/tests/unit/sql/sqlite → datachain-0.2.11/tests/unit/sql}/__init__.py +0 -0
  251. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/sqlite/test_utils.py +0 -0
  252. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_array.py +0 -0
  253. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_conditional.py +0 -0
  254. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_path.py +0 -0
  255. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_random.py +0 -0
  256. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_selectable.py +0 -0
  257. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/sql/test_string.py +0 -0
  258. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_asyn.py +0 -0
  259. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_cache.py +0 -0
  260. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_catalog.py +0 -0
  261. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_catalog_loader.py +0 -0
  262. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_cli_parsing.py +0 -0
  263. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_client.py +0 -0
  264. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_client_s3.py +0 -0
  265. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_data_storage.py +0 -0
  266. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_database_engine.py +0 -0
  267. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_dispatch.py +0 -0
  268. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_fileslice.py +0 -0
  269. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_id_generator.py +0 -0
  270. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_metastore.py +0 -0
  271. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_query_metrics.py +0 -0
  272. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_query_params.py +0 -0
  273. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_serializer.py +0 -0
  274. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_session.py +0 -0
  275. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_storage.py +0 -0
  276. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_utils.py +0 -0
  277. {datachain-0.2.9 → datachain-0.2.11}/tests/unit/test_warehouse.py +0 -0
  278. {datachain-0.2.9 → datachain-0.2.11}/tests/utils.py +0 -0
@@ -50,7 +50,7 @@ jobs:
50
50
  - name: Lint code
51
51
  run: nox -s lint
52
52
 
53
- tests:
53
+ datachain:
54
54
  timeout-minutes: 25
55
55
  runs-on: ${{ matrix.os }}
56
56
  strategy:
@@ -125,3 +125,73 @@ jobs:
125
125
 
126
126
  - name: Build docs
127
127
  run: nox -s docs
128
+
129
+
130
+ studio:
131
+ if: '!github.event.pull_request.head.repo.fork'
132
+ runs-on: ubuntu-latest-16-cores
133
+ strategy:
134
+ matrix:
135
+ pyv: ['3.12']
136
+ group: [1, 2, 3, 4, 5, 6]
137
+ services:
138
+ postgres:
139
+ image: postgres:16.3
140
+ ports:
141
+ - 5432:5432
142
+ env:
143
+ POSTGRES_USER: test
144
+ POSTGRES_DB: database
145
+ POSTGRES_HOST_AUTH_METHOD: trust
146
+ clickhouse:
147
+ image: clickhouse/clickhouse-server:24
148
+ ports:
149
+ - 8123:8123
150
+ - 9010:9000
151
+ env:
152
+ CLICKHOUSE_DB: studio_local_db
153
+ CLICKHOUSE_USER: studio_local
154
+ CLICKHOUSE_PASSWORD: ch123456789!
155
+ CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
156
+ redis:
157
+ image: redis:7.2.5
158
+ ports:
159
+ - 6379:6379
160
+ steps:
161
+
162
+ - name: Check out Studio
163
+ uses: actions/checkout@v4
164
+ with:
165
+ fetch-depth: 0
166
+ repository: iterative/studio
167
+ ref: develop
168
+ token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
169
+
170
+ - name: Check out repository
171
+ uses: actions/checkout@v4
172
+ with:
173
+ path: './backend/datachain'
174
+ fetch-depth: 0
175
+
176
+ - name: Set up Python ${{ matrix.pyv }}
177
+ uses: actions/setup-python@v5
178
+ with:
179
+ python-version: ${{ matrix.pyv }}
180
+ cache: 'pip'
181
+
182
+ - name: Install uv
183
+ run: |
184
+ python -m pip install --upgrade uv
185
+ uv --version
186
+
187
+ - name: Install dependencies
188
+ run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
189
+
190
+ - name: Run tests
191
+ # Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
192
+ run: >
193
+ pytest
194
+ --config-file=pyproject.toml -rsx
195
+ --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
196
+ tests ../datachain/tests
197
+ working-directory: backend/datachain_server
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.9
3
+ Version: 0.2.11
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -35,10 +35,12 @@ Requires-Dist: shtab<2,>=1.3.4
35
35
  Requires-Dist: sqlalchemy>=2
36
36
  Requires-Dist: multiprocess==0.70.16
37
37
  Requires-Dist: dill==0.3.8
38
+ Requires-Dist: cloudpickle
38
39
  Requires-Dist: ujson>=5.9.0
39
40
  Requires-Dist: pydantic<3,>=2
40
41
  Requires-Dist: jmespath>=1.0
41
42
  Requires-Dist: datamodel-code-generator>=0.25
43
+ Requires-Dist: Pillow<11,>=10.0.0
42
44
  Provides-Extra: docs
43
45
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
44
46
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -46,11 +48,10 @@ Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
46
48
  Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
47
49
  Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
48
50
  Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
49
- Provides-Extra: cv
50
- Requires-Dist: Pillow<11,>=10.0.0; extra == "cv"
51
- Requires-Dist: torch>=2.1.0; extra == "cv"
52
- Requires-Dist: torchvision; extra == "cv"
53
- Requires-Dist: transformers>=4.36.0; extra == "cv"
51
+ Provides-Extra: torch
52
+ Requires-Dist: torch>=2.1.0; extra == "torch"
53
+ Requires-Dist: torchvision; extra == "torch"
54
+ Requires-Dist: transformers>=4.36.0; extra == "torch"
54
55
  Provides-Extra: remote
55
56
  Requires-Dist: lz4; extra == "remote"
56
57
  Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
@@ -58,7 +59,7 @@ Requires-Dist: requests>=2.22.0; extra == "remote"
58
59
  Provides-Extra: vector
59
60
  Requires-Dist: usearch; extra == "vector"
60
61
  Provides-Extra: tests
61
- Requires-Dist: datachain[cv,remote,vector]; extra == "tests"
62
+ Requires-Dist: datachain[remote,torch,vector]; extra == "tests"
62
63
  Requires-Dist: pytest<9,>=8; extra == "tests"
63
64
  Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
64
65
  Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
@@ -89,11 +90,11 @@ Requires-Dist: types-ujson; extra == "dev"
89
90
  .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
90
91
  :target: https://pypi.org/project/datachain
91
92
  :alt: Python Version
92
- .. |Codecov| image:: https://codecov.io/gh/iterative/dvcx/branch/main/graph/badge.svg?token=VSCP2T9R5X
93
- :target: https://app.codecov.io/gh/iterative/dvcx
93
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
94
+ :target: https://codecov.io/gh/iterative/datachain
94
95
  :alt: Codecov
95
- .. |Tests| image:: https://github.com/iterative/dvcx/workflows/Tests/badge.svg
96
- :target: https://github.com/iterative/dvcx/actions?workflow=Tests
96
+ .. |Tests| image:: https://github.com/iterative/datachain/workflows/Tests/badge.svg
97
+ :target: https://github.com/iterative/datachain/actions?workflow=Tests
97
98
  :alt: Tests
98
99
 
99
100
  AI 🔗 DataChain
@@ -397,7 +398,8 @@ Chain results can be exported or passed directly to Pytorch dataloader. For exam
397
398
  Tutorials
398
399
  ------------------
399
400
 
400
- * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvclive/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
401
+ * `Computer Vision <examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`__)
402
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
401
403
 
402
404
  Contributions
403
405
  --------------------
@@ -6,11 +6,11 @@
6
6
  .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
7
7
  :target: https://pypi.org/project/datachain
8
8
  :alt: Python Version
9
- .. |Codecov| image:: https://codecov.io/gh/iterative/dvcx/branch/main/graph/badge.svg?token=VSCP2T9R5X
10
- :target: https://app.codecov.io/gh/iterative/dvcx
9
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
10
+ :target: https://codecov.io/gh/iterative/datachain
11
11
  :alt: Codecov
12
- .. |Tests| image:: https://github.com/iterative/dvcx/workflows/Tests/badge.svg
13
- :target: https://github.com/iterative/dvcx/actions?workflow=Tests
12
+ .. |Tests| image:: https://github.com/iterative/datachain/workflows/Tests/badge.svg
13
+ :target: https://github.com/iterative/datachain/actions?workflow=Tests
14
14
  :alt: Tests
15
15
 
16
16
  AI 🔗 DataChain
@@ -314,7 +314,8 @@ Chain results can be exported or passed directly to Pytorch dataloader. For exam
314
314
  Tutorials
315
315
  ------------------
316
316
 
317
- * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/dvclive/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
317
+ * `Computer Vision <examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/computer_vision/fashion_product_images/1-quick-start.ipynb>`__)
318
+ * `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
318
319
 
319
320
  Contributions
320
321
  --------------------
@@ -103,7 +103,8 @@ def main():
103
103
  print("========================================================================")
104
104
  print("static CSV with header schema test parsing 3.5K objects")
105
105
  print("========================================================================")
106
- static_csv_ds = DataChain.from_csv(uri, spec=ChatFeature)
106
+ static_csv_ds = DataChain.from_csv(uri, output=ChatFeature, object_name="chat")
107
+ static_csv_ds.print_schema()
107
108
  print(static_csv_ds.to_pandas())
108
109
 
109
110
  uri = "gs://datachain-demo/laion-aesthetics-csv"
@@ -111,7 +112,8 @@ def main():
111
112
  print("========================================================================")
112
113
  print("dynamic CSV with header schema test parsing 3M objects")
113
114
  print("========================================================================")
114
- dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", show_schema=True)
115
+ dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion")
116
+ dynamic_csv_ds.print_schema()
115
117
  print(dynamic_csv_ds.to_pandas())
116
118
 
117
119
 
@@ -0,0 +1,57 @@
1
+ import os
2
+
3
+ import anthropic
4
+ from anthropic.types import Message
5
+
6
+ from datachain import Column, DataChain
7
+ from datachain.sql.functions import path
8
+
9
+ DATA = "gs://dvcx-datalakes/chatbot-public"
10
+ MODEL = "claude-3-opus-20240229"
11
+ PROMPT = """Consider the following dialogues between the 'user' and the 'bot' separated\
12
+ by '===='. The 'user' is a human trying to find the best mobile plan. The 'bot' is a \
13
+ chatbot designed to query the user and offer the best solution. The dialog is \
14
+ successful if the 'bot' is able to gather the information and offer a plan, or inform \
15
+ the user that such plan does not exist. The dialog is not successful if the \
16
+ conversation ends early or the 'user' requests additional functions the 'bot' \
17
+ cannot perform. Read the dialogues and classify them into a fixed number of concise \
18
+ failure reasons covering most failure cases. Present output as JSON list of reason \
19
+ strings and nothing else.
20
+ """
21
+
22
+ TEMPERATURE = 0.9
23
+ DEFAULT_OUTPUT_TOKENS = 1024
24
+
25
+ API_KEY = os.environ.get("ANTHROPIC_API_KEY")
26
+
27
+
28
+ chain = (
29
+ DataChain.from_storage(DATA, type="text")
30
+ .filter(Column("file.name").glob("*.txt"))
31
+ .limit(5)
32
+ .settings(parallel=4, cache=True)
33
+ .agg(
34
+ dialogues=lambda file: ["\n=====\n".join(f.read() for f in file)],
35
+ output=str,
36
+ partition_by=path.file_ext(Column("name")),
37
+ )
38
+ .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
39
+ .map(
40
+ claude=lambda client, dialogues: client.messages.create(
41
+ model=MODEL,
42
+ system=PROMPT,
43
+ messages=[
44
+ {"role": "user", "content": dialogues},
45
+ ],
46
+ temperature=TEMPERATURE,
47
+ max_tokens=DEFAULT_OUTPUT_TOKENS,
48
+ ),
49
+ output=Message,
50
+ )
51
+ .map(
52
+ res=lambda claude: claude.content[0].text if claude.content else [],
53
+ output=str,
54
+ )
55
+ )
56
+
57
+ chain.show()
@@ -1,12 +1,13 @@
1
1
  import json
2
+ import os
2
3
 
3
- import pandas as pd
4
+ import anthropic
5
+ from anthropic.types import Message
6
+ from pydantic import BaseModel
4
7
 
5
- from datachain.lib.claude import claude_processor
6
- from datachain.lib.dc import C, DataChain
7
- from datachain.lib.feature import Feature
8
+ from datachain import Column, DataChain, File
8
9
 
9
- SOURCE = "gs://dvcx-datalakes/chatbot-public"
10
+ DATA = "gs://dvcx-datalakes/chatbot-public"
10
11
  MODEL = "claude-3-opus-20240229"
11
12
  PROMPT = """Consider the dialogue between the 'user' and the 'bot'. \
12
13
  The 'user' is a human trying to find the best mobile plan. \
@@ -20,19 +21,38 @@ if it is successful, and 'Failure' if not. After that, provide \
20
21
  one-sentence explanation of the reasons for this rating. Use only \
21
22
  JSON object as output with the keys 'status', and 'explanation'.
22
23
  """
24
+ TEMPERATURE = 0.9
25
+ DEFAULT_OUTPUT_TOKENS = 1024
23
26
 
27
+ API_KEY = os.environ.get("ANTHROPIC_API_KEY")
24
28
 
25
- class Rating(Feature):
29
+
30
+ class Rating(BaseModel):
26
31
  status: str = ""
27
32
  explanation: str = ""
28
33
 
29
34
 
30
35
  chain = (
31
- DataChain.from_storage(SOURCE, type="text")
32
- .filter(C.name.glob("*.txt"))
33
- .settings(parallel=3)
36
+ DataChain.from_storage(DATA, type="text")
37
+ .filter(Column("file.name").glob("*.txt"))
34
38
  .limit(5)
35
- .map(claude=claude_processor(prompt=PROMPT, model=MODEL))
39
+ .settings(parallel=4, cache=True)
40
+ .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
41
+ .map(
42
+ claude=lambda client, file: client.messages.create(
43
+ model=MODEL,
44
+ system=PROMPT,
45
+ messages=[
46
+ {
47
+ "role": "user",
48
+ "content": file.read() if isinstance(file, File) else file,
49
+ },
50
+ ],
51
+ temperature=TEMPERATURE,
52
+ max_tokens=DEFAULT_OUTPUT_TOKENS,
53
+ ),
54
+ output=Message,
55
+ )
36
56
  .map(
37
57
  rating=lambda claude: Rating(
38
58
  **(json.loads(claude.content[0].text) if claude.content else {})
@@ -41,7 +61,4 @@ chain = (
41
61
  )
42
62
  )
43
63
 
44
- df = chain.to_pandas()
45
-
46
- with pd.option_context("display.max_columns", None):
47
- print(df)
64
+ chain.show()
@@ -0,0 +1,39 @@
1
+ import os
2
+
3
+ import anthropic
4
+ from anthropic.types import Message
5
+
6
+ from datachain import Column, DataChain, File
7
+
8
+ DATA = "gs://dvcx-datalakes/chatbot-public"
9
+ MODEL = "claude-3-opus-20240229"
10
+ PROMPT = """Summarise the dialog in a sentence"""
11
+ TEMPERATURE = 0.9
12
+ DEFAULT_OUTPUT_TOKENS = 1024
13
+
14
+ API_KEY = os.environ.get("ANTHROPIC_API_KEY")
15
+
16
+ chain = (
17
+ DataChain.from_storage(DATA, type="text")
18
+ .filter(Column("file.name").glob("*.txt"))
19
+ .limit(5)
20
+ .settings(parallel=4, cache=True)
21
+ .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
22
+ .map(
23
+ claude=lambda client, file: client.messages.create(
24
+ model=MODEL,
25
+ system=PROMPT,
26
+ messages=[
27
+ {
28
+ "role": "user",
29
+ "content": file.read() if isinstance(file, File) else file,
30
+ },
31
+ ],
32
+ temperature=TEMPERATURE,
33
+ max_tokens=DEFAULT_OUTPUT_TOKENS,
34
+ ),
35
+ output=Message,
36
+ )
37
+ )
38
+
39
+ chain.show()