datachain 0.13.1__tar.gz → 0.14.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (353) hide show
  1. {datachain-0.13.1 → datachain-0.14.0}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.13.1 → datachain-0.14.0}/PKG-INFO +10 -10
  3. {datachain-0.13.1 → datachain-0.14.0}/README.rst +9 -9
  4. {datachain-0.13.1 → datachain-0.14.0}/docs/examples.md +15 -16
  5. {datachain-0.13.1 → datachain-0.14.0}/docs/quick-start.md +23 -20
  6. {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/file.md +4 -4
  7. {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/imagefile.md +3 -3
  8. {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/textfile.md +3 -3
  9. {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/videofile.md +3 -3
  10. {datachain-0.13.1 → datachain-0.14.0}/docs/references/datachain.md +22 -0
  11. {datachain-0.13.1 → datachain-0.14.0}/docs/references/remotes.md +4 -4
  12. {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/iptc_exif_xmp_lib.py +4 -4
  13. {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/llava2_image_desc_lib.py +4 -4
  14. {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/openimage-detect.py +4 -3
  15. {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/ultralytics-bbox.py +4 -4
  16. {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/ultralytics-pose.py +4 -4
  17. {datachain-0.13.1 → datachain-0.14.0}/examples/computer_vision/ultralytics-segment.py +4 -4
  18. {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/common_sql_functions.py +14 -14
  19. {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/json-csv-reader.py +8 -12
  20. {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/torch-loader.py +3 -3
  21. {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/udfs/parallel.py +2 -2
  22. {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/udfs/simple.py +2 -2
  23. {datachain-0.13.1 → datachain-0.14.0}/examples/get_started/udfs/stateful.py +4 -4
  24. {datachain-0.13.1 → datachain-0.14.0}/examples/llm_and_nlp/claude-query.py +7 -7
  25. {datachain-0.13.1 → datachain-0.14.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +5 -7
  26. {datachain-0.13.1 → datachain-0.14.0}/examples/multimodal/clip_inference.py +6 -8
  27. {datachain-0.13.1 → datachain-0.14.0}/examples/multimodal/hf_pipeline.py +10 -10
  28. {datachain-0.13.1 → datachain-0.14.0}/examples/multimodal/openai_image_desc_lib.py +3 -3
  29. {datachain-0.13.1 → datachain-0.14.0}/examples/multimodal/wds.py +4 -8
  30. {datachain-0.13.1 → datachain-0.14.0}/examples/multimodal/wds_filtered.py +5 -4
  31. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/__init__.py +28 -1
  32. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/catalog/catalog.py +5 -9
  33. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/ls.py +2 -2
  34. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/show.py +2 -3
  35. datachain-0.14.0/src/datachain/lib/dc/__init__.py +32 -0
  36. datachain-0.14.0/src/datachain/lib/dc/csv.py +127 -0
  37. datachain-0.13.1/src/datachain/lib/dc.py → datachain-0.14.0/src/datachain/lib/dc/datachain.py +144 -733
  38. datachain-0.14.0/src/datachain/lib/dc/datasets.py +149 -0
  39. datachain-0.14.0/src/datachain/lib/dc/hf.py +73 -0
  40. datachain-0.14.0/src/datachain/lib/dc/json.py +91 -0
  41. datachain-0.14.0/src/datachain/lib/dc/listings.py +43 -0
  42. datachain-0.14.0/src/datachain/lib/dc/pandas.py +56 -0
  43. datachain-0.14.0/src/datachain/lib/dc/parquet.py +65 -0
  44. datachain-0.14.0/src/datachain/lib/dc/records.py +90 -0
  45. datachain-0.14.0/src/datachain/lib/dc/storage.py +118 -0
  46. datachain-0.14.0/src/datachain/lib/dc/utils.py +128 -0
  47. datachain-0.14.0/src/datachain/lib/dc/values.py +53 -0
  48. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/meta_formats.py +2 -4
  49. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/pytorch.py +2 -2
  50. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/udf.py +3 -3
  51. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/toolkit/split.py +2 -2
  52. {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/PKG-INFO +10 -10
  53. {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/SOURCES.txt +13 -1
  54. {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/test_datachain.py +2 -2
  55. {datachain-0.13.1 → datachain-0.14.0}/tests/conftest.py +52 -4
  56. {datachain-0.13.1 → datachain-0.14.0}/tests/examples/test_wds_e2e.py +8 -8
  57. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_catalog.py +15 -15
  58. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_cloud_transfer.py +2 -2
  59. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_data_storage.py +2 -2
  60. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_datachain.py +193 -195
  61. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_datachain_merge.py +5 -5
  62. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_datasets.py +9 -13
  63. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_feature_pickling.py +11 -11
  64. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_file.py +3 -3
  65. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_hidden_field.py +6 -6
  66. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_listing.py +4 -4
  67. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_ls.py +2 -2
  68. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_pull.py +2 -2
  69. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_pytorch.py +3 -3
  70. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_query.py +5 -4
  71. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_session.py +4 -4
  72. {datachain-0.13.1 → datachain-0.14.0}/tests/scripts/feature_class.py +3 -3
  73. {datachain-0.13.1 → datachain-0.14.0}/tests/scripts/feature_class_exception.py +6 -6
  74. {datachain-0.13.1 → datachain-0.14.0}/tests/scripts/feature_class_parallel.py +3 -3
  75. {datachain-0.13.1 → datachain-0.14.0}/tests/scripts/feature_class_parallel_data_model.py +3 -2
  76. {datachain-0.13.1 → datachain-0.14.0}/tests/scripts/name_len_slow.py +3 -3
  77. {datachain-0.13.1 → datachain-0.14.0}/tests/test_import_time.py +10 -10
  78. {datachain-0.13.1 → datachain-0.14.0}/tests/test_telemetry.py +2 -2
  79. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_arrow.py +3 -3
  80. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_datachain.py +328 -358
  81. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_datachain_bootstrap.py +6 -5
  82. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_datachain_merge.py +23 -24
  83. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_diff.py +36 -38
  84. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_feature_utils.py +12 -12
  85. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_schema.py +4 -4
  86. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_func.py +169 -115
  87. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_listing.py +4 -8
  88. {datachain-0.13.1 → datachain-0.14.0}/tests/utils.py +17 -5
  89. {datachain-0.13.1 → datachain-0.14.0}/.cruft.json +0 -0
  90. {datachain-0.13.1 → datachain-0.14.0}/.gitattributes +0 -0
  91. {datachain-0.13.1 → datachain-0.14.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  92. {datachain-0.13.1 → datachain-0.14.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  93. {datachain-0.13.1 → datachain-0.14.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  94. {datachain-0.13.1 → datachain-0.14.0}/.github/codecov.yaml +0 -0
  95. {datachain-0.13.1 → datachain-0.14.0}/.github/dependabot.yml +0 -0
  96. {datachain-0.13.1 → datachain-0.14.0}/.github/workflows/benchmarks.yml +0 -0
  97. {datachain-0.13.1 → datachain-0.14.0}/.github/workflows/release.yml +0 -0
  98. {datachain-0.13.1 → datachain-0.14.0}/.github/workflows/tests-studio.yml +0 -0
  99. {datachain-0.13.1 → datachain-0.14.0}/.github/workflows/tests.yml +0 -0
  100. {datachain-0.13.1 → datachain-0.14.0}/.github/workflows/update-template.yaml +0 -0
  101. {datachain-0.13.1 → datachain-0.14.0}/.gitignore +0 -0
  102. {datachain-0.13.1 → datachain-0.14.0}/CODE_OF_CONDUCT.rst +0 -0
  103. {datachain-0.13.1 → datachain-0.14.0}/LICENSE +0 -0
  104. {datachain-0.13.1 → datachain-0.14.0}/docs/assets/captioned_cartoons.png +0 -0
  105. {datachain-0.13.1 → datachain-0.14.0}/docs/assets/datachain-white.svg +0 -0
  106. {datachain-0.13.1 → datachain-0.14.0}/docs/assets/datachain.svg +0 -0
  107. {datachain-0.13.1 → datachain-0.14.0}/docs/contributing.md +0 -0
  108. {datachain-0.13.1 → datachain-0.14.0}/docs/css/github-permalink-style.css +0 -0
  109. {datachain-0.13.1 → datachain-0.14.0}/docs/index.md +0 -0
  110. {datachain-0.13.1 → datachain-0.14.0}/docs/overrides/main.html +0 -0
  111. {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/arrowrow.md +0 -0
  112. {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/bbox.md +0 -0
  113. {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/index.md +0 -0
  114. {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/pose.md +0 -0
  115. {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/segment.md +0 -0
  116. {datachain-0.13.1 → datachain-0.14.0}/docs/references/data-types/tarvfile.md +0 -0
  117. {datachain-0.13.1 → datachain-0.14.0}/docs/references/func.md +0 -0
  118. {datachain-0.13.1 → datachain-0.14.0}/docs/references/index.md +0 -0
  119. {datachain-0.13.1 → datachain-0.14.0}/docs/references/toolkit.md +0 -0
  120. {datachain-0.13.1 → datachain-0.14.0}/docs/references/torch.md +0 -0
  121. {datachain-0.13.1 → datachain-0.14.0}/docs/references/udf.md +0 -0
  122. {datachain-0.13.1 → datachain-0.14.0}/docs/tutorials.md +0 -0
  123. {datachain-0.13.1 → datachain-0.14.0}/mkdocs.yml +0 -0
  124. {datachain-0.13.1 → datachain-0.14.0}/noxfile.py +0 -0
  125. {datachain-0.13.1 → datachain-0.14.0}/pyproject.toml +0 -0
  126. {datachain-0.13.1 → datachain-0.14.0}/setup.cfg +0 -0
  127. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/__main__.py +0 -0
  128. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/asyn.py +0 -0
  129. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cache.py +0 -0
  130. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/catalog/__init__.py +0 -0
  131. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/catalog/datasource.py +0 -0
  132. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/catalog/loader.py +0 -0
  133. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/__init__.py +0 -0
  134. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/__init__.py +0 -0
  135. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/datasets.py +0 -0
  136. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/du.py +0 -0
  137. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/index.py +0 -0
  138. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/misc.py +0 -0
  139. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/commands/query.py +0 -0
  140. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/parser/__init__.py +0 -0
  141. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/parser/job.py +0 -0
  142. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/parser/studio.py +0 -0
  143. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/parser/utils.py +0 -0
  144. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/cli/utils.py +0 -0
  145. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/__init__.py +0 -0
  146. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/azure.py +0 -0
  147. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/fileslice.py +0 -0
  148. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/fsspec.py +0 -0
  149. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/gcs.py +0 -0
  150. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/hf.py +0 -0
  151. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/local.py +0 -0
  152. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/client/s3.py +0 -0
  153. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/config.py +0 -0
  154. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/__init__.py +0 -0
  155. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/db_engine.py +0 -0
  156. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/job.py +0 -0
  157. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/metastore.py +0 -0
  158. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/schema.py +0 -0
  159. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/serializer.py +0 -0
  160. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/sqlite.py +0 -0
  161. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/data_storage/warehouse.py +0 -0
  162. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/dataset.py +0 -0
  163. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/diff/__init__.py +0 -0
  164. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/error.py +0 -0
  165. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/fs/__init__.py +0 -0
  166. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/fs/reference.py +0 -0
  167. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/fs/utils.py +0 -0
  168. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/__init__.py +0 -0
  169. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/aggregate.py +0 -0
  170. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/array.py +0 -0
  171. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/base.py +0 -0
  172. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/conditional.py +0 -0
  173. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/func.py +0 -0
  174. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/numeric.py +0 -0
  175. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/path.py +0 -0
  176. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/random.py +0 -0
  177. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/string.py +0 -0
  178. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/func/window.py +0 -0
  179. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/job.py +0 -0
  180. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/__init__.py +0 -0
  181. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/arrow.py +0 -0
  182. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/clip.py +0 -0
  183. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/__init__.py +0 -0
  184. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/flatten.py +0 -0
  185. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  186. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  187. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/unflatten.py +0 -0
  188. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  189. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/data_model.py +0 -0
  190. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/dataset_info.py +0 -0
  191. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/file.py +0 -0
  192. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/hf.py +0 -0
  193. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/image.py +0 -0
  194. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/listing.py +0 -0
  195. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/listing_info.py +0 -0
  196. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/model_store.py +0 -0
  197. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/settings.py +0 -0
  198. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/signal_schema.py +0 -0
  199. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/tar.py +0 -0
  200. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/text.py +0 -0
  201. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/udf_signature.py +0 -0
  202. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/utils.py +0 -0
  203. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/video.py +0 -0
  204. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/webdataset.py +0 -0
  205. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/lib/webdataset_laion.py +0 -0
  206. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/listing.py +0 -0
  207. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/__init__.py +0 -0
  208. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/bbox.py +0 -0
  209. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/pose.py +0 -0
  210. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/segment.py +0 -0
  211. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  212. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  213. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/ultralytics/pose.py +0 -0
  214. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/ultralytics/segment.py +0 -0
  215. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/model/utils.py +0 -0
  216. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/node.py +0 -0
  217. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/nodes_fetcher.py +0 -0
  218. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/nodes_thread_pool.py +0 -0
  219. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/progress.py +0 -0
  220. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/py.typed +0 -0
  221. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/__init__.py +0 -0
  222. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/batch.py +0 -0
  223. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/dataset.py +0 -0
  224. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/dispatch.py +0 -0
  225. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/metrics.py +0 -0
  226. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/params.py +0 -0
  227. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/queue.py +0 -0
  228. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/schema.py +0 -0
  229. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/session.py +0 -0
  230. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/udf.py +0 -0
  231. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/query/utils.py +0 -0
  232. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/remote/__init__.py +0 -0
  233. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/remote/studio.py +0 -0
  234. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/script_meta.py +0 -0
  235. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/__init__.py +0 -0
  236. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/default/__init__.py +0 -0
  237. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/default/base.py +0 -0
  238. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/__init__.py +0 -0
  239. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/aggregate.py +0 -0
  240. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/array.py +0 -0
  241. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/conditional.py +0 -0
  242. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/numeric.py +0 -0
  243. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/path.py +0 -0
  244. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/random.py +0 -0
  245. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/functions/string.py +0 -0
  246. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/selectable.py +0 -0
  247. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  248. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/sqlite/base.py +0 -0
  249. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/sqlite/types.py +0 -0
  250. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/sqlite/vector.py +0 -0
  251. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/types.py +0 -0
  252. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/sql/utils.py +0 -0
  253. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/studio.py +0 -0
  254. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/telemetry.py +0 -0
  255. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/toolkit/__init__.py +0 -0
  256. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/torch/__init__.py +0 -0
  257. {datachain-0.13.1 → datachain-0.14.0}/src/datachain/utils.py +0 -0
  258. {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  259. {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/entry_points.txt +0 -0
  260. {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/requires.txt +0 -0
  261. {datachain-0.13.1 → datachain-0.14.0}/src/datachain.egg-info/top_level.txt +0 -0
  262. {datachain-0.13.1 → datachain-0.14.0}/tests/__init__.py +0 -0
  263. {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/__init__.py +0 -0
  264. {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/conftest.py +0 -0
  265. {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  266. {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  267. {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/datasets/.gitignore +0 -0
  268. {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  269. {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/test_ls.py +0 -0
  270. {datachain-0.13.1 → datachain-0.14.0}/tests/benchmarks/test_version.py +0 -0
  271. {datachain-0.13.1 → datachain-0.14.0}/tests/data.py +0 -0
  272. {datachain-0.13.1 → datachain-0.14.0}/tests/examples/__init__.py +0 -0
  273. {datachain-0.13.1 → datachain-0.14.0}/tests/examples/test_examples.py +0 -0
  274. {datachain-0.13.1 → datachain-0.14.0}/tests/examples/wds_data.py +0 -0
  275. {datachain-0.13.1 → datachain-0.14.0}/tests/func/__init__.py +0 -0
  276. {datachain-0.13.1 → datachain-0.14.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  277. {datachain-0.13.1 → datachain-0.14.0}/tests/func/data/lena.jpg +0 -0
  278. {datachain-0.13.1 → datachain-0.14.0}/tests/func/fake-service-account-credentials.json +0 -0
  279. {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/__init__.py +0 -0
  280. {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/data/running-mask0.png +0 -0
  281. {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/data/running-mask1.png +0 -0
  282. {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/data/running.jpg +0 -0
  283. {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/data/ships.jpg +0 -0
  284. {datachain-0.13.1 → datachain-0.14.0}/tests/func/model/test_yolo.py +0 -0
  285. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_client.py +0 -0
  286. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_dataset_query.py +0 -0
  287. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_hf.py +0 -0
  288. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_image.py +0 -0
  289. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_meta_formats.py +0 -0
  290. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_metrics.py +0 -0
  291. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_toolkit.py +0 -0
  292. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_video.py +0 -0
  293. {datachain-0.13.1 → datachain-0.14.0}/tests/func/test_warehouse.py +0 -0
  294. {datachain-0.13.1 → datachain-0.14.0}/tests/test_atomicity.py +0 -0
  295. {datachain-0.13.1 → datachain-0.14.0}/tests/test_cli_e2e.py +0 -0
  296. {datachain-0.13.1 → datachain-0.14.0}/tests/test_cli_studio.py +0 -0
  297. {datachain-0.13.1 → datachain-0.14.0}/tests/test_query_e2e.py +0 -0
  298. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/__init__.py +0 -0
  299. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/__init__.py +0 -0
  300. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/conftest.py +0 -0
  301. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_clip.py +0 -0
  302. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_feature.py +0 -0
  303. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_file.py +0 -0
  304. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_hf.py +0 -0
  305. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_image.py +0 -0
  306. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_listing_info.py +0 -0
  307. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  308. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_signal_schema.py +0 -0
  309. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  310. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_text.py +0 -0
  311. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_udf_signature.py +0 -0
  312. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_utils.py +0 -0
  313. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/lib/test_webdataset.py +0 -0
  314. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/model/__init__.py +0 -0
  315. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/model/test_bbox.py +0 -0
  316. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/model/test_pose.py +0 -0
  317. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/model/test_segment.py +0 -0
  318. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/model/test_utils.py +0 -0
  319. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/__init__.py +0 -0
  320. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  321. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  322. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  323. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_array.py +0 -0
  324. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_conditional.py +0 -0
  325. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_path.py +0 -0
  326. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_random.py +0 -0
  327. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_selectable.py +0 -0
  328. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/sql/test_string.py +0 -0
  329. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_asyn.py +0 -0
  330. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_cache.py +0 -0
  331. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_catalog.py +0 -0
  332. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_catalog_loader.py +0 -0
  333. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_cli_parsing.py +0 -0
  334. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_client.py +0 -0
  335. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_client_gcs.py +0 -0
  336. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_client_s3.py +0 -0
  337. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_config.py +0 -0
  338. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_data_storage.py +0 -0
  339. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_database_engine.py +0 -0
  340. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_dataset.py +0 -0
  341. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_dispatch.py +0 -0
  342. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_fileslice.py +0 -0
  343. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_metastore.py +0 -0
  344. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_module_exports.py +0 -0
  345. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_pytorch.py +0 -0
  346. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_query.py +0 -0
  347. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_query_metrics.py +0 -0
  348. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_query_params.py +0 -0
  349. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_script_meta.py +0 -0
  350. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_serializer.py +0 -0
  351. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_session.py +0 -0
  352. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_utils.py +0 -0
  353. {datachain-0.13.1 → datachain-0.14.0}/tests/unit/test_warehouse.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.11.0'
27
+ rev: 'v0.11.2'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.13.1
3
+ Version: 0.14.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -169,16 +169,16 @@ high confidence scores.
169
169
 
170
170
  .. code:: py
171
171
 
172
- from datachain import Column, DataChain
172
+ import datachain as dc
173
173
 
174
- meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
175
- images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
174
+ meta = dc.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
175
+ images = dc.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
176
176
 
177
177
  images_id = images.map(id=lambda file: file.path.split('.')[-2])
178
178
  annotated = images_id.merge(meta, on="id", right_on="meta.id")
179
179
 
180
- likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
181
- & (Column("meta.inference.class_") == "cat"))
180
+ likely_cats = annotated.filter((dc.Column("meta.inference.confidence") > 0.93) \
181
+ & (dc.Column("meta.inference.class_") == "cat"))
182
182
  likely_cats.to_storage("high-confidence-cats/", signal="file")
183
183
 
184
184
 
@@ -199,11 +199,11 @@ Python code:
199
199
 
200
200
  import os
201
201
  from mistralai import Mistral
202
- from datachain import File, DataChain, Column
202
+ import datachain as dc
203
203
 
204
204
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
205
205
 
206
- def eval_dialogue(file: File) -> bool:
206
+ def eval_dialogue(file: dc.File) -> bool:
207
207
  client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
208
208
  response = client.chat.complete(
209
209
  model="open-mixtral-8x22b",
@@ -213,13 +213,13 @@ Python code:
213
213
  return result.lower().startswith("success")
214
214
 
215
215
  chain = (
216
- DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
216
+ dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
217
217
  .settings(parallel=4, cache=True)
218
218
  .map(is_success=eval_dialogue)
219
219
  .save("mistral_files")
220
220
  )
221
221
 
222
- successful_chain = chain.filter(Column("is_success") == True)
222
+ successful_chain = chain.filter(dc.Column("is_success") == True)
223
223
  successful_chain.to_storage("./output_mistral")
224
224
 
225
225
  print(f"{successful_chain.count()} files were exported")
@@ -58,16 +58,16 @@ high confidence scores.
58
58
 
59
59
  .. code:: py
60
60
 
61
- from datachain import Column, DataChain
61
+ import datachain as dc
62
62
 
63
- meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
64
- images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
63
+ meta = dc.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
64
+ images = dc.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
65
65
 
66
66
  images_id = images.map(id=lambda file: file.path.split('.')[-2])
67
67
  annotated = images_id.merge(meta, on="id", right_on="meta.id")
68
68
 
69
- likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
70
- & (Column("meta.inference.class_") == "cat"))
69
+ likely_cats = annotated.filter((dc.Column("meta.inference.confidence") > 0.93) \
70
+ & (dc.Column("meta.inference.class_") == "cat"))
71
71
  likely_cats.to_storage("high-confidence-cats/", signal="file")
72
72
 
73
73
 
@@ -88,11 +88,11 @@ Python code:
88
88
 
89
89
  import os
90
90
  from mistralai import Mistral
91
- from datachain import File, DataChain, Column
91
+ import datachain as dc
92
92
 
93
93
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
94
94
 
95
- def eval_dialogue(file: File) -> bool:
95
+ def eval_dialogue(file: dc.File) -> bool:
96
96
  client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
97
97
  response = client.chat.complete(
98
98
  model="open-mixtral-8x22b",
@@ -102,13 +102,13 @@ Python code:
102
102
  return result.lower().startswith("success")
103
103
 
104
104
  chain = (
105
- DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
105
+ dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
106
106
  .settings(parallel=4, cache=True)
107
107
  .map(is_success=eval_dialogue)
108
108
  .save("mistral_files")
109
109
  )
110
110
 
111
- successful_chain = chain.filter(Column("is_success") == True)
111
+ successful_chain = chain.filter(dc.Column("is_success") == True)
112
112
  successful_chain.to_storage("./output_mistral")
113
113
 
114
114
  print(f"{successful_chain.count()} files were exported")
@@ -13,10 +13,10 @@ title: Examples
13
13
  For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. The code below takes images from the cloud, applies the PaliGemma model to caption the first five of them, and puts the results in the column “scene”:
14
14
 
15
15
  ```python
16
- from datachain import Column, DataChain, File # (1)!
16
+ import datachain as dc # (1)!
17
17
  from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)!
18
18
 
19
- images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
19
+ images = dc.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image")
20
20
 
21
21
  model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-mix-224")
22
22
  processor = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
@@ -80,7 +80,7 @@ In the below example, we are calling a Mixtral 8x22b model to judge the “servi
80
80
  # $ export MISTRAL_API_KEY='your key'
81
81
 
82
82
  import os
83
- from datachain import Column, DataChain, DataModel, Feature
83
+ import datachain as dc
84
84
  from mistralai.client import MistralClient
85
85
  from mistralai.models.chat_completion import ChatMessage
86
86
  from mistralai.models.chat_completion import ChatCompletionResponse as MistralModel
@@ -89,12 +89,12 @@ prompt = "Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in
89
89
  api_key = os.environ["MISTRAL_API_KEY"]
90
90
 
91
91
  ## register the data model ###
92
- DataModel.register(MistralModel)
92
+ dc.DataModel.register(MistralModel)
93
93
 
94
94
  chain = (
95
- DataChain
95
+ dc
96
96
  .from_storage("gs://datachain-demo/chatbot-KiT/", type="text")
97
- .filter(Column("file.name").glob("*.txt"))
97
+ .filter(dc.Column("file.name").glob("*.txt"))
98
98
  .limit(5)
99
99
  .settings(parallel=4, cache=True)
100
100
  .map(
@@ -145,13 +145,13 @@ The cost of 5 calls to Mixtral 8x22b : $0.0142
145
145
  The “save” operation makes the chain dataset persistent in the current (working) directory of the query. A hidden folder `.datachain/` holds the records. A persistent dataset can be accessed later to start a derivative chain:
146
146
 
147
147
  ```python
148
- DataChain.from_dataset("rating").limit(2).save("dialog-rating")
148
+ dc.from_dataset("rating").limit(2).save("dialog-rating")
149
149
  ```
150
150
 
151
151
  Persistent datasets are immutable and automatically versioned. Here is how to access the dataset registry:
152
152
 
153
153
  ```python
154
- mydatasets = DataChain.datasets()
154
+ mydatasets = dc.datasets()
155
155
  for ds in mydatasets.collect("dataset"):
156
156
  print(f"{ds.name}@v{ds.version}")
157
157
 
@@ -167,7 +167,7 @@ dialog-rating@v2
167
167
  By default, when a saved dataset is loaded, the latest version is fetched but another version can be requested:
168
168
 
169
169
  ```python
170
- ds = DataChain.from_dataset("dialog-rating", version = 1)
170
+ ds = dc.from_dataset("dialog-rating", version = 1)
171
171
  ```
172
172
 
173
173
  ### Chain execution, optimization and parallelism
@@ -190,7 +190,7 @@ Here is an example of reading a simple CSV file where schema is heuristically de
190
190
  import datachain as dc
191
191
 
192
192
  uri="gs://datachain-demo/chatbot-csv/"
193
- csv_dataset = DataChain.from_csv(uri)
193
+ csv_dataset = dc.from_csv(uri)
194
194
 
195
195
  print(csv_dataset.to_pandas())
196
196
  ```
@@ -231,15 +231,14 @@ Note how complicated the setup is. Every image is references by the name, and th
231
231
  However, DataChain can easily parse the entire COCO structure via several reading and merging operators:
232
232
 
233
233
  ```python
234
-
235
- from datachain import Column, DataChain
234
+ import datachain as dc
236
235
 
237
236
  images_uri="gs://datachain-demo/coco2017/images/val/"
238
237
  captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json"
239
238
 
240
- images = DataChain.from_storage(images_uri)
241
- meta = DataChain.from_json(captions_uri, jmespath = "images")
242
- captions = DataChain.from_json(captions_uri, jmespath = "annotations")
239
+ images = dc.from_storage(images_uri)
240
+ meta = dc.from_json(captions_uri, jmespath = "images")
241
+ captions = dc.from_json(captions_uri, jmespath = "annotations")
243
242
 
244
243
  images_meta = images.merge(meta, on="file.name", right_on="images.file_name")
245
244
  captioned_images = images_meta.merge(captions, on="images.id", right_on="annotations.image_id")
@@ -248,7 +247,7 @@ captioned_images = images_meta.merge(captions, on="images.id", right_on="annotat
248
247
  The resulting dataset has image entries as files decorated with all the metadata and captions:
249
248
 
250
249
  ```python
251
- images_with_dogs = captioned_images.filter(Column("annotations.caption").glob("*dog*"))
250
+ images_with_dogs = captioned_images.filter(dc.Column("annotations.caption").glob("*dog*"))
252
251
  images_with_dogs.select("annotations", "file.name").show()
253
252
  ```
254
253
 
@@ -37,16 +37,16 @@ Example of downloading only _`high-confidence cat`_ inferred images
37
37
  using JSON metadata:
38
38
 
39
39
  ``` py
40
- from datachain import Column, DataChain
40
+ import datachain as dc
41
41
 
42
- meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
43
- images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
42
+ meta = dc.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
43
+ images = dc.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
44
44
 
45
45
  images_id = images.map(id=lambda file: file.path.split('.')[-2])
46
46
  annotated = images_id.merge(meta, on="id", right_on="meta.id")
47
47
 
48
- likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
49
- & (Column("meta.inference.class_") == "cat"))
48
+ likely_cats = annotated.filter((dc.Column("meta.inference.confidence") > 0.93) \
49
+ & (dc.Column("meta.inference.class_") == "cat"))
50
50
  likely_cats.to_storage("high-confidence-cats/", signal="file")
51
51
  ```
52
52
 
@@ -67,7 +67,7 @@ sentiment detected are then copied to the local directory.
67
67
 
68
68
  ``` py
69
69
  from transformers import pipeline
70
- from datachain import DataChain, Column
70
+ import datachain as dc
71
71
 
72
72
  classifier = pipeline("sentiment-analysis", device="cpu",
73
73
  model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
@@ -77,7 +77,7 @@ def is_positive_dialogue_ending(file) -> bool:
77
77
  return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
78
78
 
79
79
  chain = (
80
- DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
80
+ dc.from_storage("gs://datachain-demo/chatbot-KiT/",
81
81
  object_name="file", type="text", anon=True)
82
82
  .settings(parallel=8, cache=True)
83
83
  .map(is_positive=is_positive_dialogue_ending)
@@ -118,11 +118,11 @@ to 4 requests at the same time.
118
118
  ``` py
119
119
  import os
120
120
  from mistralai import Mistral
121
- from datachain import File, DataChain, Column
121
+ import datachain as dc
122
122
 
123
123
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
124
124
 
125
- def eval_dialogue(file: File) -> bool:
125
+ def eval_dialogue(file: dc.File) -> bool:
126
126
  client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
127
127
  response = client.chat.complete(
128
128
  model="open-mixtral-8x22b",
@@ -132,12 +132,12 @@ def eval_dialogue(file: File) -> bool:
132
132
  return result.lower().startswith("success")
133
133
 
134
134
  chain = (
135
- DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
135
+ dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
136
136
  .map(is_success=eval_dialogue)
137
137
  .save("mistral_files")
138
138
  )
139
139
 
140
- successful_chain = chain.filter(Column("is_success") == True)
140
+ successful_chain = chain.filter(dc.Column("is_success") == True)
141
141
  successful_chain.to_storage("./output_mistral")
142
142
 
143
143
  print(f"{successful_chain.count()} files were exported")
@@ -165,11 +165,11 @@ serialize the entire LLM response to the internal DB:
165
165
  ``` py
166
166
  from mistralai import Mistral
167
167
  from mistralai.models import ChatCompletionResponse
168
- from datachain import File, DataChain, Column
168
+ import datachain as dc
169
169
 
170
170
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
171
171
 
172
- def eval_dialog(file: File) -> ChatCompletionResponse:
172
+ def eval_dialog(file: dc.File) -> ChatCompletionResponse:
173
173
  client = MistralClient()
174
174
  return client.chat(
175
175
  model="open-mixtral-8x22b",
@@ -177,7 +177,7 @@ def eval_dialog(file: File) -> ChatCompletionResponse:
177
177
  {"role": "user", "content": file.read()}])
178
178
 
179
179
  chain = (
180
- DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
180
+ dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
181
181
  .settings(parallel=4, cache=True)
182
182
  .map(response=eval_dialog)
183
183
  .map(status=lambda response: response.choices[0].message.content.lower()[:7])
@@ -186,7 +186,7 @@ chain = (
186
186
 
187
187
  chain.select("file.name", "status", "response.usage").show(5)
188
188
 
189
- success_rate = chain.filter(Column("status") == "success").count() / chain.count()
189
+ success_rate = chain.filter(dc.Column("status") == "success").count() / chain.count()
190
190
  print(f"{100*success_rate:.1f}% dialogs were successful")
191
191
  ```
192
192
 
@@ -210,12 +210,14 @@ name usage usage usage
210
210
 
211
211
  In the previous examples, datasets were saved in the embedded database
212
212
  (`SQLite` in folder `.datachain` of the working directory). These datasets were automatically versioned, and
213
- can be accessed using `DataChain.from_dataset("dataset_name")`.
213
+ can be accessed using `dc.from_dataset("dataset_name")`.
214
214
 
215
215
  Here is how to retrieve a saved dataset and iterate over the objects:
216
216
 
217
217
  ``` py
218
- chain = DataChain.from_dataset("response")
218
+ import datachain as dc
219
+
220
+ chain = dc.from_dataset("response")
219
221
 
220
222
  # Iterating one-by-one: support out-of-memory workflow
221
223
  for file, response in chain.limit(5).collect("file", "response"):
@@ -245,7 +247,8 @@ assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M
245
247
  output tokens:
246
248
 
247
249
  ``` py
248
- chain = DataChain.from_dataset("mistral_dataset")
250
+ import datachain as dc
251
+ chain = dc.from_dataset("mistral_dataset")
249
252
 
250
253
  cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
251
254
  + chain.sum("response.usage.completion_tokens")*0.000006
@@ -268,12 +271,12 @@ file name suffix, the following code will do it:
268
271
  from torch.utils.data import DataLoader
269
272
  from transformers import CLIPProcessor
270
273
 
271
- from datachain import C, DataChain
274
+ import datachain as dc
272
275
 
273
276
  processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
274
277
 
275
278
  chain = (
276
- DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
279
+ dc.from_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
277
280
  .map(label=lambda name: name.split(".")[0], params=["file.name"])
278
281
  .select("file", "label").to_pytorch(
279
282
  transform=processor.image_processor,
@@ -2,13 +2,13 @@
2
2
 
3
3
  `File` is a special [`DataModel`](index.md#datachain.lib.data_model.DataModel),
4
4
  which is automatically generated when a `DataChain` is created from files,
5
- such as in [`DataChain.from_storage`](../datachain.md#datachain.lib.dc.DataChain.from_storage):
5
+ such as in [`dc.from_storage`](../datachain.md#datachain.lib.dc.storage.from_storage):
6
6
 
7
7
  ```python
8
- from datachain import DataChain
8
+ import datachain as dc
9
9
 
10
- dc = DataChain.from_storage("gs://datachain-demo/dogs-and-cats")
11
- dc.print_schema()
10
+ chain = dc.from_storage("gs://datachain-demo/dogs-and-cats")
11
+ chain.print_schema()
12
12
  ```
13
13
 
14
14
  Output:
@@ -2,12 +2,12 @@
2
2
 
3
3
  `ImageFile` is inherited from [`File`](file.md) with additional methods for working with image files.
4
4
 
5
- `ImageFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.DataChain.from_storage), using `type="image"` param:
5
+ `ImageFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.storage.from_storage), using `type="image"` param:
6
6
 
7
7
  ```python
8
- from datachain import DataChain
8
+ import datachain as dc
9
9
 
10
- dc = DataChain.from_storage("s3://bucket-name/", type="image")
10
+ chain = dc.from_storage("s3://bucket-name/", type="image")
11
11
  ```
12
12
 
13
13
  ::: datachain.lib.file.ImageFile
@@ -2,12 +2,12 @@
2
2
 
3
3
  `TextFile` is inherited from [`File`](file.md) with additional methods for working with text files.
4
4
 
5
- `TextFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.DataChain.from_storage), using `type="text"` param:
5
+ `TextFile` is generated when a `DataChain` is created [from storage](../datachain.md#datachain.lib.dc.storage.from_storage), using `type="text"` param:
6
6
 
7
7
  ```python
8
- from datachain import DataChain
8
+ import datachain as dc
9
9
 
10
- dc = DataChain.from_storage("s3://bucket-name/", type="text")
10
+ chain = dc.from_storage("s3://bucket-name/", type="text")
11
11
  ```
12
12
 
13
13
  ::: datachain.lib.file.TextFile
@@ -2,12 +2,12 @@
2
2
 
3
3
  `VideoFile` extends [`File`](file.md) and provides additional methods for working with video files.
4
4
 
5
- `VideoFile` instances are created when a `DataChain` is initialized [from storage](../datachain.md#datachain.lib.dc.DataChain.from_storage) with the `type="video"` parameter:
5
+ `VideoFile` instances are created when a `DataChain` is initialized [from storage](../datachain.md#datachain.lib.dc.storage.from_storage) with the `type="video"` parameter:
6
6
 
7
7
  ```python
8
- from datachain import DataChain
8
+ import datachain as dc
9
9
 
10
- dc = DataChain.from_storage("s3://bucket-name/", type="video")
10
+ chain = dc.from_storage("s3://bucket-name/", type="video")
11
11
  ```
12
12
 
13
13
  There are additional models for working with video files:
@@ -9,6 +9,28 @@ for examples of how to create a chain.
9
9
 
10
10
  ::: datachain.query.schema.Column
11
11
 
12
+ ::: datachain.lib.dc.csv.from_csv
13
+
14
+ ::: datachain.lib.dc.datasets.from_dataset
15
+
16
+ ::: datachain.lib.dc.datasets.datasets
17
+
18
+ ::: datachain.lib.dc.hf.from_hf
19
+
20
+ ::: datachain.lib.dc.json.from_json
21
+
22
+ ::: datachain.lib.dc.listings.listings
23
+
24
+ ::: datachain.lib.dc.pandas.from_pandas
25
+
26
+ ::: datachain.lib.dc.parquet.from_parquet
27
+
28
+ ::: datachain.lib.dc.records.from_records
29
+
30
+ ::: datachain.lib.dc.storage.from_storage
31
+
32
+ ::: datachain.lib.dc.values.from_values
33
+
12
34
  ::: datachain.lib.dc.DataChain
13
35
 
14
36
  ::: datachain.lib.utils.DataChainError
@@ -1,13 +1,13 @@
1
1
  # Interacting with remote storage
2
2
 
3
- DataChain supports reading and writing data from different remote storages using methods like `DataChain.from_storage` and `DataChain.to_storage`. The supported storages includes: local file system, AWS S3 storage, Google Cloud Storage, Azure Blob Storage, Hugging Face and more.
3
+ DataChain supports reading and writing data from different remote storages using methods like `dc.from_storage` and `DataChain.to_storage`. Supported storages include: local file system, AWS S3 storage, Google Cloud Storage, Azure Blob Storage, Hugging Face and more.
4
4
 
5
5
  Example implementation for reading and writing data from/to different remote storages:
6
6
 
7
7
  ```python
8
- from datachain import DataChain
8
+ import datachain as dc
9
9
 
10
- dc = DataChain.from_storage("s3://bucket-name/path/to/data")
10
+ dc = dc.from_storage("s3://bucket-name/path/to/data")
11
11
  dc.to_storage("gs://bucket-name/path/to/data")
12
12
  ```
13
13
 
@@ -135,7 +135,7 @@ DataChain uses [s3fs](https://s3fs.readthedocs.io/en/latest/) to interact with A
135
135
 
136
136
  Example:
137
137
  ```python
138
- chain = DataChain.from_storage(
138
+ chain = dc.from_storage(
139
139
  "s3://my-bucket/my-dir",
140
140
  client_config = {
141
141
  "endpoint_url": "<minio-endpoint-url>",
@@ -13,7 +13,7 @@ from PIL import (
13
13
  TiffImagePlugin,
14
14
  )
15
15
 
16
- from datachain import C, DataChain
16
+ import datachain as dc
17
17
 
18
18
  source = "gs://datachain-demo/open-images-v6/"
19
19
 
@@ -67,9 +67,9 @@ def image_description(file):
67
67
 
68
68
  if __name__ == "__main__":
69
69
  (
70
- DataChain.from_storage(source, type="image")
70
+ dc.from_storage(source, type="image")
71
71
  .settings(parallel=-1)
72
- .filter(C("file.path").glob("*.jpg"))
72
+ .filter(dc.C("file.path").glob("*.jpg"))
73
73
  .limit(5000)
74
74
  .map(
75
75
  image_description,
@@ -77,6 +77,6 @@ if __name__ == "__main__":
77
77
  output={"xmp": dict, "exif": dict, "iptc": dict, "error": str},
78
78
  )
79
79
  .select("file.path", "xmp", "exif", "iptc", "error")
80
- .filter((C("xmp") != "{}") | (C("exif") != "{}") | (C("iptc") != "{}"))
80
+ .filter((dc.C("xmp") != "{}") | (dc.C("exif") != "{}") | (dc.C("iptc") != "{}"))
81
81
  .show()
82
82
  )
@@ -11,7 +11,7 @@ from transformers import (
11
11
  LlavaForConditionalGeneration,
12
12
  )
13
13
 
14
- from datachain import C, DataChain, Mapper
14
+ import datachain as dc
15
15
 
16
16
  model = "llava-hf/llava-1.5-7b-hf"
17
17
 
@@ -41,7 +41,7 @@ def infer_dtype(device):
41
41
  return torch.float16
42
42
 
43
43
 
44
- class LLaVADescribe(Mapper):
44
+ class LLaVADescribe(dc.Mapper):
45
45
  def __init__(self, device="cpu", model="llava-hf/llava-1.5-7b-hf", max_tokens=300):
46
46
  self.device = device
47
47
  self.model_name = model
@@ -71,8 +71,8 @@ class LLaVADescribe(Mapper):
71
71
 
72
72
  if __name__ == "__main__":
73
73
  (
74
- DataChain.from_storage(source, type="image")
75
- .filter(C("file.path").glob("*/cat*.jpg"))
74
+ dc.from_storage(source, type="image")
75
+ .filter(dc.C("file.path").glob("*/cat*.jpg"))
76
76
  .map(
77
77
  desc=LLaVADescribe(
78
78
  device=device,
@@ -2,7 +2,8 @@ import json
2
2
 
3
3
  from PIL import Image
4
4
 
5
- from datachain import C, DataChain, File, model
5
+ import datachain as dc
6
+ from datachain import File, model
6
7
  from datachain.func import path
7
8
 
8
9
 
@@ -40,8 +41,8 @@ def openimage_detect(args):
40
41
  source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
41
42
 
42
43
  (
43
- DataChain.from_storage(source)
44
- .filter(C("file.path").glob("*.jpg") | C("file.path").glob("*.json"))
44
+ dc.from_storage(source)
45
+ .filter(dc.C("file.path").glob("*.jpg") | dc.C("file.path").glob("*.json"))
45
46
  .agg(
46
47
  openimage_detect,
47
48
  partition_by=path.file_stem("file.path"),
@@ -1,17 +1,17 @@
1
1
  from ultralytics import YOLO
2
2
 
3
- from datachain import C, DataChain, File
3
+ import datachain as dc
4
4
  from datachain.model.ultralytics import YoloBBoxes
5
5
 
6
6
 
7
- def process_bboxes(yolo: YOLO, file: File) -> YoloBBoxes:
7
+ def process_bboxes(yolo: YOLO, file: dc.File) -> YoloBBoxes:
8
8
  results = yolo(file.as_image_file().read(), verbose=False)
9
9
  return YoloBBoxes.from_results(results)
10
10
 
11
11
 
12
12
  (
13
- DataChain.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
14
- .filter(C("file.path").glob("*.jpg"))
13
+ dc.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
14
+ .filter(dc.C("file.path").glob("*.jpg"))
15
15
  .limit(20)
16
16
  .setup(yolo=lambda: YOLO("yolo11n.pt"))
17
17
  .map(boxes=process_bboxes)
@@ -1,17 +1,17 @@
1
1
  from ultralytics import YOLO
2
2
 
3
- from datachain import C, DataChain, File
3
+ import datachain as dc
4
4
  from datachain.model.ultralytics import YoloPoses
5
5
 
6
6
 
7
- def process_poses(yolo: YOLO, file: File) -> YoloPoses:
7
+ def process_poses(yolo: YOLO, file: dc.File) -> YoloPoses:
8
8
  results = yolo(file.as_image_file().read(), verbose=False)
9
9
  return YoloPoses.from_results(results)
10
10
 
11
11
 
12
12
  (
13
- DataChain.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
14
- .filter(C("file.path").glob("*.jpg"))
13
+ dc.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
14
+ .filter(dc.C("file.path").glob("*.jpg"))
15
15
  .limit(20)
16
16
  .setup(yolo=lambda: YOLO("yolo11n-pose.pt"))
17
17
  .map(poses=process_poses)
@@ -1,17 +1,17 @@
1
1
  from ultralytics import YOLO
2
2
 
3
- from datachain import C, DataChain, File
3
+ import datachain as dc
4
4
  from datachain.model.ultralytics import YoloSegments
5
5
 
6
6
 
7
- def process_segments(yolo: YOLO, file: File) -> YoloSegments:
7
+ def process_segments(yolo: YOLO, file: dc.File) -> YoloSegments:
8
8
  results = yolo(file.as_image_file().read(), verbose=False)
9
9
  return YoloSegments.from_results(results)
10
10
 
11
11
 
12
12
  (
13
- DataChain.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
14
- .filter(C("file.path").glob("*.jpg"))
13
+ dc.from_storage("gs://datachain-demo/openimages-v6-test-jsonpairs/")
14
+ .filter(dc.C("file.path").glob("*.jpg"))
15
15
  .limit(20)
16
16
  .setup(yolo=lambda: YOLO("yolo11n-seg.pt"))
17
17
  .map(segments=process_segments)