datachain 0.6.8__tar.gz → 0.6.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (262) hide show
  1. {datachain-0.6.8 → datachain-0.6.9}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.6.8/src/datachain.egg-info → datachain-0.6.9}/PKG-INFO +41 -21
  3. {datachain-0.6.8 → datachain-0.6.9}/README.rst +40 -20
  4. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/catalog/catalog.py +5 -0
  5. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/client/fsspec.py +1 -1
  6. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/data_storage/metastore.py +4 -0
  7. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/dataset.py +5 -0
  8. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/dataset_info.py +3 -0
  9. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/dc.py +26 -6
  10. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/meta_formats.py +1 -0
  11. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/signal_schema.py +1 -1
  12. {datachain-0.6.8 → datachain-0.6.9/src/datachain.egg-info}/PKG-INFO +41 -21
  13. {datachain-0.6.8 → datachain-0.6.9}/src/datachain.egg-info/SOURCES.txt +0 -1
  14. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_datasets.py +4 -0
  15. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_pull.py +4 -0
  16. datachain-0.6.8/docs/assets/flowchart.png +0 -0
  17. {datachain-0.6.8 → datachain-0.6.9}/.cruft.json +0 -0
  18. {datachain-0.6.8 → datachain-0.6.9}/.gitattributes +0 -0
  19. {datachain-0.6.8 → datachain-0.6.9}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  20. {datachain-0.6.8 → datachain-0.6.9}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  21. {datachain-0.6.8 → datachain-0.6.9}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  22. {datachain-0.6.8 → datachain-0.6.9}/.github/codecov.yaml +0 -0
  23. {datachain-0.6.8 → datachain-0.6.9}/.github/dependabot.yml +0 -0
  24. {datachain-0.6.8 → datachain-0.6.9}/.github/workflows/benchmarks.yml +0 -0
  25. {datachain-0.6.8 → datachain-0.6.9}/.github/workflows/release.yml +0 -0
  26. {datachain-0.6.8 → datachain-0.6.9}/.github/workflows/tests-studio.yml +0 -0
  27. {datachain-0.6.8 → datachain-0.6.9}/.github/workflows/tests.yml +0 -0
  28. {datachain-0.6.8 → datachain-0.6.9}/.github/workflows/update-template.yaml +0 -0
  29. {datachain-0.6.8 → datachain-0.6.9}/.gitignore +0 -0
  30. {datachain-0.6.8 → datachain-0.6.9}/CODE_OF_CONDUCT.rst +0 -0
  31. {datachain-0.6.8 → datachain-0.6.9}/CONTRIBUTING.rst +0 -0
  32. {datachain-0.6.8 → datachain-0.6.9}/LICENSE +0 -0
  33. {datachain-0.6.8 → datachain-0.6.9}/docs/assets/captioned_cartoons.png +0 -0
  34. {datachain-0.6.8 → datachain-0.6.9}/docs/assets/datachain-white.svg +0 -0
  35. {datachain-0.6.8 → datachain-0.6.9}/docs/assets/datachain.svg +0 -0
  36. {datachain-0.6.8 → datachain-0.6.9}/docs/index.md +0 -0
  37. {datachain-0.6.8 → datachain-0.6.9}/docs/references/datachain.md +0 -0
  38. {datachain-0.6.8 → datachain-0.6.9}/docs/references/datatype.md +0 -0
  39. {datachain-0.6.8 → datachain-0.6.9}/docs/references/file.md +0 -0
  40. {datachain-0.6.8 → datachain-0.6.9}/docs/references/index.md +0 -0
  41. {datachain-0.6.8 → datachain-0.6.9}/docs/references/sql.md +0 -0
  42. {datachain-0.6.8 → datachain-0.6.9}/docs/references/torch.md +0 -0
  43. {datachain-0.6.8 → datachain-0.6.9}/docs/references/udf.md +0 -0
  44. {datachain-0.6.8 → datachain-0.6.9}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  45. {datachain-0.6.8 → datachain-0.6.9}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  46. {datachain-0.6.8 → datachain-0.6.9}/examples/computer_vision/openimage-detect.py +0 -0
  47. {datachain-0.6.8 → datachain-0.6.9}/examples/get_started/common_sql_functions.py +0 -0
  48. {datachain-0.6.8 → datachain-0.6.9}/examples/get_started/json-csv-reader.py +0 -0
  49. {datachain-0.6.8 → datachain-0.6.9}/examples/get_started/torch-loader.py +0 -0
  50. {datachain-0.6.8 → datachain-0.6.9}/examples/get_started/udfs/parallel.py +0 -0
  51. {datachain-0.6.8 → datachain-0.6.9}/examples/get_started/udfs/simple.py +0 -0
  52. {datachain-0.6.8 → datachain-0.6.9}/examples/get_started/udfs/stateful.py +0 -0
  53. {datachain-0.6.8 → datachain-0.6.9}/examples/llm_and_nlp/claude-query.py +0 -0
  54. {datachain-0.6.8 → datachain-0.6.9}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  55. {datachain-0.6.8 → datachain-0.6.9}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  56. {datachain-0.6.8 → datachain-0.6.9}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  57. {datachain-0.6.8 → datachain-0.6.9}/examples/multimodal/clip_inference.py +0 -0
  58. {datachain-0.6.8 → datachain-0.6.9}/examples/multimodal/hf_pipeline.py +0 -0
  59. {datachain-0.6.8 → datachain-0.6.9}/examples/multimodal/openai_image_desc_lib.py +0 -0
  60. {datachain-0.6.8 → datachain-0.6.9}/examples/multimodal/wds.py +0 -0
  61. {datachain-0.6.8 → datachain-0.6.9}/examples/multimodal/wds_filtered.py +0 -0
  62. {datachain-0.6.8 → datachain-0.6.9}/mkdocs.yml +0 -0
  63. {datachain-0.6.8 → datachain-0.6.9}/noxfile.py +0 -0
  64. {datachain-0.6.8 → datachain-0.6.9}/overrides/main.html +0 -0
  65. {datachain-0.6.8 → datachain-0.6.9}/pyproject.toml +0 -0
  66. {datachain-0.6.8 → datachain-0.6.9}/setup.cfg +0 -0
  67. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/__init__.py +0 -0
  68. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/__main__.py +0 -0
  69. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/asyn.py +0 -0
  70. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/cache.py +0 -0
  71. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/catalog/__init__.py +0 -0
  72. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/catalog/datasource.py +0 -0
  73. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/catalog/loader.py +0 -0
  74. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/cli.py +0 -0
  75. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/cli_utils.py +0 -0
  76. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/client/__init__.py +0 -0
  77. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/client/azure.py +0 -0
  78. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/client/fileslice.py +0 -0
  79. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/client/gcs.py +0 -0
  80. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/client/hf.py +0 -0
  81. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/client/local.py +0 -0
  82. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/client/s3.py +0 -0
  83. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/config.py +0 -0
  84. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/data_storage/__init__.py +0 -0
  85. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/data_storage/db_engine.py +0 -0
  86. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/data_storage/id_generator.py +0 -0
  87. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/data_storage/job.py +0 -0
  88. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/data_storage/schema.py +0 -0
  89. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/data_storage/serializer.py +0 -0
  90. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/data_storage/sqlite.py +0 -0
  91. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/data_storage/warehouse.py +0 -0
  92. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/error.py +0 -0
  93. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/job.py +0 -0
  94. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/__init__.py +0 -0
  95. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/arrow.py +0 -0
  96. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/clip.py +0 -0
  97. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/convert/__init__.py +0 -0
  98. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/convert/flatten.py +0 -0
  99. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/convert/python_to_sql.py +0 -0
  100. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/convert/sql_to_python.py +0 -0
  101. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/convert/unflatten.py +0 -0
  102. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  103. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/data_model.py +0 -0
  104. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/file.py +0 -0
  105. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/func/__init__.py +0 -0
  106. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/func/aggregate.py +0 -0
  107. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/func/func.py +0 -0
  108. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/hf.py +0 -0
  109. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/image.py +0 -0
  110. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/listing.py +0 -0
  111. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/listing_info.py +0 -0
  112. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/model_store.py +0 -0
  113. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/models/__init__.py +0 -0
  114. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/models/bbox.py +0 -0
  115. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/models/pose.py +0 -0
  116. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/models/yolo.py +0 -0
  117. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/pytorch.py +0 -0
  118. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/settings.py +0 -0
  119. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/tar.py +0 -0
  120. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/text.py +0 -0
  121. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/udf.py +0 -0
  122. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/udf_signature.py +0 -0
  123. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/utils.py +0 -0
  124. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/vfile.py +0 -0
  125. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/webdataset.py +0 -0
  126. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/lib/webdataset_laion.py +0 -0
  127. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/listing.py +0 -0
  128. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/node.py +0 -0
  129. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/nodes_fetcher.py +0 -0
  130. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/nodes_thread_pool.py +0 -0
  131. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/progress.py +0 -0
  132. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/py.typed +0 -0
  133. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/query/__init__.py +0 -0
  134. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/query/batch.py +0 -0
  135. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/query/dataset.py +0 -0
  136. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/query/dispatch.py +0 -0
  137. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/query/metrics.py +0 -0
  138. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/query/params.py +0 -0
  139. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/query/queue.py +0 -0
  140. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/query/schema.py +0 -0
  141. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/query/session.py +0 -0
  142. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/remote/__init__.py +0 -0
  143. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/remote/studio.py +0 -0
  144. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/__init__.py +0 -0
  145. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/default/__init__.py +0 -0
  146. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/default/base.py +0 -0
  147. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/functions/__init__.py +0 -0
  148. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/functions/aggregate.py +0 -0
  149. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/functions/array.py +0 -0
  150. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/functions/conditional.py +0 -0
  151. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/functions/path.py +0 -0
  152. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/functions/random.py +0 -0
  153. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/functions/string.py +0 -0
  154. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/selectable.py +0 -0
  155. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/sqlite/__init__.py +0 -0
  156. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/sqlite/base.py +0 -0
  157. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/sqlite/types.py +0 -0
  158. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/sqlite/vector.py +0 -0
  159. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/types.py +0 -0
  160. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/sql/utils.py +0 -0
  161. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/studio.py +0 -0
  162. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/telemetry.py +0 -0
  163. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/torch/__init__.py +0 -0
  164. {datachain-0.6.8 → datachain-0.6.9}/src/datachain/utils.py +0 -0
  165. {datachain-0.6.8 → datachain-0.6.9}/src/datachain.egg-info/dependency_links.txt +0 -0
  166. {datachain-0.6.8 → datachain-0.6.9}/src/datachain.egg-info/entry_points.txt +0 -0
  167. {datachain-0.6.8 → datachain-0.6.9}/src/datachain.egg-info/requires.txt +0 -0
  168. {datachain-0.6.8 → datachain-0.6.9}/src/datachain.egg-info/top_level.txt +0 -0
  169. {datachain-0.6.8 → datachain-0.6.9}/tests/__init__.py +0 -0
  170. {datachain-0.6.8 → datachain-0.6.9}/tests/benchmarks/__init__.py +0 -0
  171. {datachain-0.6.8 → datachain-0.6.9}/tests/benchmarks/conftest.py +0 -0
  172. {datachain-0.6.8 → datachain-0.6.9}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  173. {datachain-0.6.8 → datachain-0.6.9}/tests/benchmarks/datasets/.dvc/config +0 -0
  174. {datachain-0.6.8 → datachain-0.6.9}/tests/benchmarks/datasets/.gitignore +0 -0
  175. {datachain-0.6.8 → datachain-0.6.9}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  176. {datachain-0.6.8 → datachain-0.6.9}/tests/benchmarks/test_datachain.py +0 -0
  177. {datachain-0.6.8 → datachain-0.6.9}/tests/benchmarks/test_ls.py +0 -0
  178. {datachain-0.6.8 → datachain-0.6.9}/tests/benchmarks/test_version.py +0 -0
  179. {datachain-0.6.8 → datachain-0.6.9}/tests/conftest.py +0 -0
  180. {datachain-0.6.8 → datachain-0.6.9}/tests/data.py +0 -0
  181. {datachain-0.6.8 → datachain-0.6.9}/tests/examples/__init__.py +0 -0
  182. {datachain-0.6.8 → datachain-0.6.9}/tests/examples/test_examples.py +0 -0
  183. {datachain-0.6.8 → datachain-0.6.9}/tests/examples/test_wds_e2e.py +0 -0
  184. {datachain-0.6.8 → datachain-0.6.9}/tests/examples/wds_data.py +0 -0
  185. {datachain-0.6.8 → datachain-0.6.9}/tests/func/__init__.py +0 -0
  186. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_catalog.py +0 -0
  187. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_client.py +0 -0
  188. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_datachain.py +0 -0
  189. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_dataset_query.py +0 -0
  190. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_feature_pickling.py +0 -0
  191. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_listing.py +0 -0
  192. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_ls.py +0 -0
  193. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_meta_formats.py +0 -0
  194. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_metrics.py +0 -0
  195. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_pytorch.py +0 -0
  196. {datachain-0.6.8 → datachain-0.6.9}/tests/func/test_query.py +0 -0
  197. {datachain-0.6.8 → datachain-0.6.9}/tests/scripts/feature_class.py +0 -0
  198. {datachain-0.6.8 → datachain-0.6.9}/tests/scripts/feature_class_exception.py +0 -0
  199. {datachain-0.6.8 → datachain-0.6.9}/tests/scripts/feature_class_parallel.py +0 -0
  200. {datachain-0.6.8 → datachain-0.6.9}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  201. {datachain-0.6.8 → datachain-0.6.9}/tests/scripts/name_len_slow.py +0 -0
  202. {datachain-0.6.8 → datachain-0.6.9}/tests/test_atomicity.py +0 -0
  203. {datachain-0.6.8 → datachain-0.6.9}/tests/test_cli_e2e.py +0 -0
  204. {datachain-0.6.8 → datachain-0.6.9}/tests/test_cli_studio.py +0 -0
  205. {datachain-0.6.8 → datachain-0.6.9}/tests/test_query_e2e.py +0 -0
  206. {datachain-0.6.8 → datachain-0.6.9}/tests/test_telemetry.py +0 -0
  207. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/__init__.py +0 -0
  208. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/__init__.py +0 -0
  209. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/conftest.py +0 -0
  210. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_arrow.py +0 -0
  211. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_clip.py +0 -0
  212. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_datachain.py +0 -0
  213. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  214. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_datachain_merge.py +0 -0
  215. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_feature.py +0 -0
  216. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_feature_utils.py +0 -0
  217. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_file.py +0 -0
  218. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_hf.py +0 -0
  219. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_image.py +0 -0
  220. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_listing_info.py +0 -0
  221. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_models.py +0 -0
  222. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_schema.py +0 -0
  223. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_signal_schema.py +0 -0
  224. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_sql_to_python.py +0 -0
  225. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_text.py +0 -0
  226. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_udf_signature.py +0 -0
  227. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_utils.py +0 -0
  228. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/lib/test_webdataset.py +0 -0
  229. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/sql/__init__.py +0 -0
  230. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/sql/sqlite/__init__.py +0 -0
  231. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/sql/sqlite/test_utils.py +0 -0
  232. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/sql/test_array.py +0 -0
  233. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/sql/test_conditional.py +0 -0
  234. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/sql/test_path.py +0 -0
  235. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/sql/test_random.py +0 -0
  236. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/sql/test_selectable.py +0 -0
  237. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/sql/test_string.py +0 -0
  238. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_asyn.py +0 -0
  239. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_cache.py +0 -0
  240. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_catalog.py +0 -0
  241. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_catalog_loader.py +0 -0
  242. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_cli_parsing.py +0 -0
  243. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_client.py +0 -0
  244. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_client_s3.py +0 -0
  245. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_config.py +0 -0
  246. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_data_storage.py +0 -0
  247. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_database_engine.py +0 -0
  248. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_dataset.py +0 -0
  249. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_dispatch.py +0 -0
  250. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_fileslice.py +0 -0
  251. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_id_generator.py +0 -0
  252. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_listing.py +0 -0
  253. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_metastore.py +0 -0
  254. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_module_exports.py +0 -0
  255. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_query.py +0 -0
  256. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_query_metrics.py +0 -0
  257. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_query_params.py +0 -0
  258. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_serializer.py +0 -0
  259. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_session.py +0 -0
  260. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_utils.py +0 -0
  261. {datachain-0.6.8 → datachain-0.6.9}/tests/unit/test_warehouse.py +0 -0
  262. {datachain-0.6.8 → datachain-0.6.9}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.7.2'
27
+ rev: 'v0.7.3'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.8
3
+ Version: 0.6.9
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -120,33 +120,41 @@ Requires-Dist: onnx==1.16.1; extra == "examples"
120
120
  :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
121
121
  :alt: Tests
122
122
 
123
- DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
124
- It is made to organize your unstructured data into datasets and wrangle it at scale on
125
- your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
123
+ DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
124
+ data like images, audio, videos, text and PDFs. It integrates with external storage
125
+ (e.g., S3) to process data efficiently without data duplication and manages metadata
126
+ in an internal database for easy and efficient querying.
127
+
128
+
129
+ Use Cases
130
+ =========
131
+
132
+ 1. **Multimodal Dataset Preparation and Curation**: ideal for organizing and
133
+ refining data in pre-training, finetuning or LLM evaluating stages.
134
+ 2. **GenAI Data Analytics**: Enables advanced analytics for multimodal data and
135
+ ad-hoc analytics using LLMs.
126
136
 
127
137
  Key Features
128
138
  ============
129
139
 
130
- 📂 **Storage as a Source of Truth.**
131
- - Process unstructured data without redundant copies from S3, GCP, Azure, and local
132
- file systems.
133
- - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
140
+ 📂 **Multimodal Dataset Versioning.**
141
+ - Version unstructured data without redundant data copies, by supporting
142
+ references to S3, GCP, Azure, and local file systems.
143
+ - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
134
144
  - Unite files and metadata together into persistent, versioned, columnar datasets.
135
145
 
136
- 🐍 **Python-friendly data pipelines.**
137
- - Operate on Python objects and object fields.
138
- - Built-in parallelization and out-of-memory compute without SQL or Spark.
146
+ 🐍 **Python-friendly.**
147
+ - Operate on Python objects and object fields: float scores, strings, matrixes,
148
+ LLM response objects.
149
+ - Run Python code on high-scale, terabyte-size datasets, with built-in
150
+ parallelization and memory-efficient computing — no SQL or Spark required.
139
151
 
140
152
  🧠 **Data Enrichment and Processing.**
141
153
  - Generate metadata using local AI models and LLM APIs.
142
- - Filter, join, and group by metadata. Search by vector embeddings.
154
+ - Filter, join, and group datasets by metadata. Search by vector embeddings.
155
+ - High-performance vectorized operations on Python objects: sum, count, avg, etc.
143
156
  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
144
157
 
145
- 🚀 **Efficiency.**
146
- - Parallelization, out-of-memory workloads and data caching.
147
- - Vectorized operations on Python object fields: sum, count, avg, etc.
148
- - Optimized vector search.
149
-
150
158
 
151
159
  Quick Start
152
160
  -----------
@@ -196,7 +204,7 @@ Batch inference with a simple sentiment model using the `transformers` library:
196
204
 
197
205
  pip install transformers
198
206
 
199
- The code below downloads files the cloud, and applies a user-defined function
207
+ The code below downloads files from the cloud, and applies a user-defined function
200
208
  to each one of them. All files with a positive sentiment
201
209
  detected are then copied to the local directory.
202
210
 
@@ -429,6 +437,19 @@ name suffix, the following code will do it:
429
437
  loader = DataLoader(chain, batch_size=1)
430
438
 
431
439
 
440
+ DataChain Studio Platform
441
+ -------------------------
442
+
443
+ `DataChain Studio`_ is a proprietary solution for teams that offers:
444
+
445
+ - **Centralized dataset registry** to manage data, code and
446
+ dependencies in one place.
447
+ - **Data Lineage** for data sources as well as derivative datasets.
448
+ - **UI for Multimodal Data** like images, videos, and PDFs.
449
+ - **Scalable Compute** to handle large datasets (100M+ files) and in-house
450
+ AI model inference.
451
+ - **Access control** including SSO and team based collaboration.
452
+
432
453
  Tutorials
433
454
  ---------
434
455
 
@@ -462,6 +483,5 @@ Community and Support
462
483
  .. _Pydantic: https://github.com/pydantic/pydantic
463
484
  .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
464
485
  .. _SQLite: https://www.sqlite.org/
465
- .. _Getting Started: https://datachain.dvc.ai/
466
- .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
467
- :alt: DataChain FlowChart
486
+ .. _Getting Started: https://docs.datachain.ai/
487
+ .. _DataChain Studio: https://studio.datachain.ai/
@@ -19,33 +19,41 @@
19
19
  :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
20
20
  :alt: Tests
21
21
 
22
- DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
23
- It is made to organize your unstructured data into datasets and wrangle it at scale on
24
- your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
22
+ DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
23
+ data like images, audio, videos, text and PDFs. It integrates with external storage
24
+ (e.g., S3) to process data efficiently without data duplication and manages metadata
25
+ in an internal database for easy and efficient querying.
26
+
27
+
28
+ Use Cases
29
+ =========
30
+
31
+ 1. **Multimodal Dataset Preparation and Curation**: ideal for organizing and
32
+ refining data in pre-training, finetuning or LLM evaluating stages.
33
+ 2. **GenAI Data Analytics**: Enables advanced analytics for multimodal data and
34
+ ad-hoc analytics using LLMs.
25
35
 
26
36
  Key Features
27
37
  ============
28
38
 
29
- 📂 **Storage as a Source of Truth.**
30
- - Process unstructured data without redundant copies from S3, GCP, Azure, and local
31
- file systems.
32
- - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
39
+ 📂 **Multimodal Dataset Versioning.**
40
+ - Version unstructured data without redundant data copies, by supporting
41
+ references to S3, GCP, Azure, and local file systems.
42
+ - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
33
43
  - Unite files and metadata together into persistent, versioned, columnar datasets.
34
44
 
35
- 🐍 **Python-friendly data pipelines.**
36
- - Operate on Python objects and object fields.
37
- - Built-in parallelization and out-of-memory compute without SQL or Spark.
45
+ 🐍 **Python-friendly.**
46
+ - Operate on Python objects and object fields: float scores, strings, matrixes,
47
+ LLM response objects.
48
+ - Run Python code on high-scale, terabyte-size datasets, with built-in
49
+ parallelization and memory-efficient computing — no SQL or Spark required.
38
50
 
39
51
  🧠 **Data Enrichment and Processing.**
40
52
  - Generate metadata using local AI models and LLM APIs.
41
- - Filter, join, and group by metadata. Search by vector embeddings.
53
+ - Filter, join, and group datasets by metadata. Search by vector embeddings.
54
+ - High-performance vectorized operations on Python objects: sum, count, avg, etc.
42
55
  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
43
56
 
44
- 🚀 **Efficiency.**
45
- - Parallelization, out-of-memory workloads and data caching.
46
- - Vectorized operations on Python object fields: sum, count, avg, etc.
47
- - Optimized vector search.
48
-
49
57
 
50
58
  Quick Start
51
59
  -----------
@@ -95,7 +103,7 @@ Batch inference with a simple sentiment model using the `transformers` library:
95
103
 
96
104
  pip install transformers
97
105
 
98
- The code below downloads files the cloud, and applies a user-defined function
106
+ The code below downloads files from the cloud, and applies a user-defined function
99
107
  to each one of them. All files with a positive sentiment
100
108
  detected are then copied to the local directory.
101
109
 
@@ -328,6 +336,19 @@ name suffix, the following code will do it:
328
336
  loader = DataLoader(chain, batch_size=1)
329
337
 
330
338
 
339
+ DataChain Studio Platform
340
+ -------------------------
341
+
342
+ `DataChain Studio`_ is a proprietary solution for teams that offers:
343
+
344
+ - **Centralized dataset registry** to manage data, code and
345
+ dependencies in one place.
346
+ - **Data Lineage** for data sources as well as derivative datasets.
347
+ - **UI for Multimodal Data** like images, videos, and PDFs.
348
+ - **Scalable Compute** to handle large datasets (100M+ files) and in-house
349
+ AI model inference.
350
+ - **Access control** including SSO and team based collaboration.
351
+
331
352
  Tutorials
332
353
  ---------
333
354
 
@@ -361,6 +382,5 @@ Community and Support
361
382
  .. _Pydantic: https://github.com/pydantic/pydantic
362
383
  .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
363
384
  .. _SQLite: https://www.sqlite.org/
364
- .. _Getting Started: https://datachain.dvc.ai/
365
- .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
366
- :alt: DataChain FlowChart
385
+ .. _Getting Started: https://docs.datachain.ai/
386
+ .. _DataChain Studio: https://studio.datachain.ai/
@@ -769,6 +769,7 @@ class Catalog:
769
769
  create_rows: Optional[bool] = True,
770
770
  validate_version: Optional[bool] = True,
771
771
  listing: Optional[bool] = False,
772
+ uuid: Optional[str] = None,
772
773
  ) -> "DatasetRecord":
773
774
  """
774
775
  Creates new dataset of a specific version.
@@ -816,6 +817,7 @@ class Catalog:
816
817
  query_script=query_script,
817
818
  create_rows_table=create_rows,
818
819
  columns=columns,
820
+ uuid=uuid,
819
821
  )
820
822
 
821
823
  def create_new_dataset_version(
@@ -832,6 +834,7 @@ class Catalog:
832
834
  script_output="",
833
835
  create_rows_table=True,
834
836
  job_id: Optional[str] = None,
837
+ uuid: Optional[str] = None,
835
838
  ) -> DatasetRecord:
836
839
  """
837
840
  Creates dataset version if it doesn't exist.
@@ -855,6 +858,7 @@ class Catalog:
855
858
  schema=schema,
856
859
  job_id=job_id,
857
860
  ignore_if_exists=True,
861
+ uuid=uuid,
858
862
  )
859
863
 
860
864
  if create_rows_table:
@@ -1400,6 +1404,7 @@ class Catalog:
1400
1404
  columns=columns,
1401
1405
  feature_schema=remote_dataset_version.feature_schema,
1402
1406
  validate_version=False,
1407
+ uuid=remote_dataset_version.uuid,
1403
1408
  )
1404
1409
 
1405
1410
  # asking remote to export dataset rows table to s3 and to return signed
@@ -358,7 +358,7 @@ class Client(ABC):
358
358
  ) -> BinaryIO:
359
359
  """Open a file, including files in tar archives."""
360
360
  if use_cache and (cache_path := self.cache.get_path(file)):
361
- return open(cache_path, mode="rb") # noqa: SIM115
361
+ return open(cache_path, mode="rb")
362
362
  assert not file.location
363
363
  return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb) # type: ignore[return-value]
364
364
 
@@ -138,6 +138,7 @@ class AbstractMetastore(ABC, Serializable):
138
138
  size: Optional[int] = None,
139
139
  preview: Optional[list[dict]] = None,
140
140
  job_id: Optional[str] = None,
141
+ uuid: Optional[str] = None,
141
142
  ) -> DatasetRecord:
142
143
  """Creates new dataset version."""
143
144
 
@@ -352,6 +353,7 @@ class AbstractDBMetastore(AbstractMetastore):
352
353
  """Datasets versions table columns."""
353
354
  return [
354
355
  Column("id", Integer, primary_key=True),
356
+ Column("uuid", Text, nullable=False, default=uuid4()),
355
357
  Column(
356
358
  "dataset_id",
357
359
  Integer,
@@ -545,6 +547,7 @@ class AbstractDBMetastore(AbstractMetastore):
545
547
  size: Optional[int] = None,
546
548
  preview: Optional[list[dict]] = None,
547
549
  job_id: Optional[str] = None,
550
+ uuid: Optional[str] = None,
548
551
  conn=None,
549
552
  ) -> DatasetRecord:
550
553
  """Creates new dataset version."""
@@ -555,6 +558,7 @@ class AbstractDBMetastore(AbstractMetastore):
555
558
 
556
559
  query = self._datasets_versions_insert().values(
557
560
  dataset_id=dataset.id,
561
+ uuid=uuid or str(uuid4()),
558
562
  version=version,
559
563
  status=status,
560
564
  feature_schema=json.dumps(feature_schema or {}),
@@ -163,6 +163,7 @@ class DatasetStatus:
163
163
  @dataclass
164
164
  class DatasetVersion:
165
165
  id: int
166
+ uuid: str
166
167
  dataset_id: int
167
168
  version: int
168
169
  status: int
@@ -184,6 +185,7 @@ class DatasetVersion:
184
185
  def parse( # noqa: PLR0913
185
186
  cls: type[V],
186
187
  id: int,
188
+ uuid: str,
187
189
  dataset_id: int,
188
190
  version: int,
189
191
  status: int,
@@ -203,6 +205,7 @@ class DatasetVersion:
203
205
  ):
204
206
  return cls(
205
207
  id,
208
+ uuid,
206
209
  dataset_id,
207
210
  version,
208
211
  status,
@@ -306,6 +309,7 @@ class DatasetRecord:
306
309
  query_script: str,
307
310
  schema: str,
308
311
  version_id: int,
312
+ version_uuid: str,
309
313
  version_dataset_id: int,
310
314
  version: int,
311
315
  version_status: int,
@@ -331,6 +335,7 @@ class DatasetRecord:
331
335
 
332
336
  dataset_version = DatasetVersion.parse(
333
337
  version_id,
338
+ version_uuid,
334
339
  version_dataset_id,
335
340
  version,
336
341
  version_status,
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  from datetime import datetime
3
3
  from typing import TYPE_CHECKING, Any, Optional, Union
4
+ from uuid import uuid4
4
5
 
5
6
  from pydantic import Field, field_validator
6
7
 
@@ -15,6 +16,7 @@ if TYPE_CHECKING:
15
16
 
16
17
  class DatasetInfo(DataModel):
17
18
  name: str
19
+ uuid: str = Field(default=str(uuid4()))
18
20
  version: int = Field(default=1)
19
21
  status: int = Field(default=DatasetStatus.CREATED)
20
22
  created_at: datetime = Field(default=TIME_ZERO)
@@ -60,6 +62,7 @@ class DatasetInfo(DataModel):
60
62
  job: Optional[Job],
61
63
  ) -> "Self":
62
64
  return cls(
65
+ uuid=version.uuid,
63
66
  name=dataset.name,
64
67
  version=version.version,
65
68
  status=version.status,
@@ -30,7 +30,7 @@ from datachain.client.local import FileClient
30
30
  from datachain.dataset import DatasetRecord
31
31
  from datachain.lib.convert.python_to_sql import python_to_sql
32
32
  from datachain.lib.convert.values_to_tuples import values_to_tuples
33
- from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
33
+ from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
34
34
  from datachain.lib.dataset_info import DatasetInfo
35
35
  from datachain.lib.file import ArrowRow, File, get_file_type
36
36
  from datachain.lib.file import ExportPlacement as FileExportPlacement
@@ -895,7 +895,7 @@ class DataChain:
895
895
  2. Group-based UDF function input: Instead of individual rows, the function
896
896
  receives a list of all rows within each group defined by `partition_by`.
897
897
 
898
- Example:
898
+ Examples:
899
899
  ```py
900
900
  chain = chain.agg(
901
901
  total=lambda category, amount: [sum(amount)],
@@ -904,6 +904,26 @@ class DataChain:
904
904
  )
905
905
  chain.save("new_dataset")
906
906
  ```
907
+
908
+ An alternative syntax, when you need to specify a more complex function:
909
+
910
+ ```py
911
+ # It automatically resolves which columns to pass to the function
912
+ # by looking at the function signature.
913
+ def agg_sum(
914
+ file: list[File], amount: list[float]
915
+ ) -> Iterator[tuple[File, float]]:
916
+ yield file[0], sum(amount)
917
+
918
+ chain = chain.agg(
919
+ agg_sum,
920
+ output={"file": File, "total": float},
921
+ # Alternative syntax is to use `C` (short for Column) to specify
922
+ # a column name or a nested column, e.g. C("file.path").
923
+ partition_by=C("category"),
924
+ )
925
+ chain.save("new_dataset")
926
+ ```
907
927
  """
908
928
  udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
909
929
  return self._evolve(
@@ -1242,15 +1262,15 @@ class DataChain:
1242
1262
  return self.results(row_factory=to_dict)
1243
1263
 
1244
1264
  @overload
1245
- def collect(self) -> Iterator[tuple[DataType, ...]]: ...
1265
+ def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
1246
1266
 
1247
1267
  @overload
1248
- def collect(self, col: str) -> Iterator[DataType]: ... # type: ignore[overload-overlap]
1268
+ def collect(self, col: str) -> Iterator[DataValue]: ...
1249
1269
 
1250
1270
  @overload
1251
- def collect(self, *cols: str) -> Iterator[tuple[DataType, ...]]: ...
1271
+ def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
1252
1272
 
1253
- def collect(self, *cols: str) -> Iterator[Union[DataType, tuple[DataType, ...]]]: # type: ignore[overload-overlap,misc]
1273
+ def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]: # type: ignore[overload-overlap,misc]
1254
1274
  """Yields rows of values, optionally limited to the specified columns.
1255
1275
 
1256
1276
  Args:
@@ -114,6 +114,7 @@ def read_meta( # noqa: C901
114
114
  )
115
115
  )
116
116
  (model_output,) = chain.collect("meta_schema")
117
+ assert isinstance(model_output, str)
117
118
  if print_schema:
118
119
  print(f"{model_output}")
119
120
  # Below 'spec' should be a dynamically converted DataModel from Pydantic
@@ -378,7 +378,7 @@ class SignalSchema:
378
378
 
379
379
  def row_to_features(
380
380
  self, row: Sequence, catalog: "Catalog", cache: bool = False
381
- ) -> list[DataType]:
381
+ ) -> list[DataValue]:
382
382
  res = []
383
383
  pos = 0
384
384
  for fr_cls in self.values.values():
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.8
3
+ Version: 0.6.9
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -120,33 +120,41 @@ Requires-Dist: onnx==1.16.1; extra == "examples"
120
120
  :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
121
121
  :alt: Tests
122
122
 
123
- DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
124
- It is made to organize your unstructured data into datasets and wrangle it at scale on
125
- your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
123
+ DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
124
+ data like images, audio, videos, text and PDFs. It integrates with external storage
125
+ (e.g., S3) to process data efficiently without data duplication and manages metadata
126
+ in an internal database for easy and efficient querying.
127
+
128
+
129
+ Use Cases
130
+ =========
131
+
132
+ 1. **Multimodal Dataset Preparation and Curation**: ideal for organizing and
133
+ refining data in pre-training, finetuning or LLM evaluating stages.
134
+ 2. **GenAI Data Analytics**: Enables advanced analytics for multimodal data and
135
+ ad-hoc analytics using LLMs.
126
136
 
127
137
  Key Features
128
138
  ============
129
139
 
130
- 📂 **Storage as a Source of Truth.**
131
- - Process unstructured data without redundant copies from S3, GCP, Azure, and local
132
- file systems.
133
- - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
140
+ 📂 **Multimodal Dataset Versioning.**
141
+ - Version unstructured data without redundant data copies, by supporting
142
+ references to S3, GCP, Azure, and local file systems.
143
+ - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
134
144
  - Unite files and metadata together into persistent, versioned, columnar datasets.
135
145
 
136
- 🐍 **Python-friendly data pipelines.**
137
- - Operate on Python objects and object fields.
138
- - Built-in parallelization and out-of-memory compute without SQL or Spark.
146
+ 🐍 **Python-friendly.**
147
+ - Operate on Python objects and object fields: float scores, strings, matrixes,
148
+ LLM response objects.
149
+ - Run Python code on high-scale, terabyte-size datasets, with built-in
150
+ parallelization and memory-efficient computing — no SQL or Spark required.
139
151
 
140
152
  🧠 **Data Enrichment and Processing.**
141
153
  - Generate metadata using local AI models and LLM APIs.
142
- - Filter, join, and group by metadata. Search by vector embeddings.
154
+ - Filter, join, and group datasets by metadata. Search by vector embeddings.
155
+ - High-performance vectorized operations on Python objects: sum, count, avg, etc.
143
156
  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
144
157
 
145
- 🚀 **Efficiency.**
146
- - Parallelization, out-of-memory workloads and data caching.
147
- - Vectorized operations on Python object fields: sum, count, avg, etc.
148
- - Optimized vector search.
149
-
150
158
 
151
159
  Quick Start
152
160
  -----------
@@ -196,7 +204,7 @@ Batch inference with a simple sentiment model using the `transformers` library:
196
204
 
197
205
  pip install transformers
198
206
 
199
- The code below downloads files the cloud, and applies a user-defined function
207
+ The code below downloads files from the cloud, and applies a user-defined function
200
208
  to each one of them. All files with a positive sentiment
201
209
  detected are then copied to the local directory.
202
210
 
@@ -429,6 +437,19 @@ name suffix, the following code will do it:
429
437
  loader = DataLoader(chain, batch_size=1)
430
438
 
431
439
 
440
+ DataChain Studio Platform
441
+ -------------------------
442
+
443
+ `DataChain Studio`_ is a proprietary solution for teams that offers:
444
+
445
+ - **Centralized dataset registry** to manage data, code and
446
+ dependencies in one place.
447
+ - **Data Lineage** for data sources as well as derivative datasets.
448
+ - **UI for Multimodal Data** like images, videos, and PDFs.
449
+ - **Scalable Compute** to handle large datasets (100M+ files) and in-house
450
+ AI model inference.
451
+ - **Access control** including SSO and team-based collaboration.
452
+
432
453
  Tutorials
433
454
  ---------
434
455
 
@@ -462,6 +483,5 @@ Community and Support
462
483
  .. _Pydantic: https://github.com/pydantic/pydantic
463
484
  .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
464
485
  .. _SQLite: https://www.sqlite.org/
465
- .. _Getting Started: https://datachain.dvc.ai/
466
- .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
467
- :alt: DataChain FlowChart
486
+ .. _Getting Started: https://docs.datachain.ai/
487
+ .. _DataChain Studio: https://studio.datachain.ai/
@@ -23,7 +23,6 @@ docs/index.md
23
23
  docs/assets/captioned_cartoons.png
24
24
  docs/assets/datachain-white.svg
25
25
  docs/assets/datachain.svg
26
- docs/assets/flowchart.png
27
26
  docs/references/datachain.md
28
27
  docs/references/datatype.md
29
28
  docs/references/file.md
@@ -56,6 +56,7 @@ def test_create_dataset_no_version_specified(cloud_test_catalog, create_rows):
56
56
  assert dataset.schema["similarity"] == Float32
57
57
  assert dataset_version.schema["similarity"] == Float32
58
58
  assert dataset_version.status == DatasetStatus.PENDING
59
+ assert dataset_version.uuid
59
60
  assert dataset.status == DatasetStatus.CREATED # dataset status is deprecated
60
61
  if create_rows:
61
62
  assert dataset_version.num_objects == 0
@@ -85,6 +86,7 @@ def test_create_dataset_with_explicit_version(cloud_test_catalog, create_rows):
85
86
  assert dataset.schema["similarity"] == Float32
86
87
  assert dataset_version.schema["similarity"] == Float32
87
88
  assert dataset_version.status == DatasetStatus.PENDING
89
+ assert dataset_version.uuid
88
90
  assert dataset.status == DatasetStatus.CREATED
89
91
  if create_rows:
90
92
  assert dataset_version.num_objects == 0
@@ -178,6 +180,7 @@ def test_create_dataset_from_sources(listed_bucket, cloud_test_catalog):
178
180
  assert dataset_version.error_stack == ""
179
181
  assert dataset_version.script_output == ""
180
182
  assert dataset_version.sources == f"{src_uri}/dogs/*"
183
+ assert dataset_version.uuid
181
184
 
182
185
  dr = catalog.warehouse.schema.dataset_row_cls
183
186
  sys_schema = {c.name: type(c.type) for c in dr.sys_columns()}
@@ -214,6 +217,7 @@ def test_create_dataset_from_sources_dataset(cloud_test_catalog, dogs_dataset):
214
217
  assert dataset_version.error_stack == ""
215
218
  assert dataset_version.script_output == ""
216
219
  assert dataset_version.sources == f"ds://{dogs_dataset.name}"
220
+ assert dataset_version.uuid
217
221
 
218
222
  dr = catalog.warehouse.schema.dataset_row_cls
219
223
  sys_schema = {c.name: type(c.type) for c in dr.sys_columns()}
@@ -13,6 +13,8 @@ from datachain.utils import STUDIO_URL, JSONSerialize
13
13
  from tests.data import ENTRIES
14
14
  from tests.utils import assert_row_names, skip_if_not_sqlite
15
15
 
16
+ DATASET_UUID = "20f5a2f1-fc9a-4e36-8b91-5a530f289451"
17
+
16
18
 
17
19
  @pytest.fixture(autouse=True)
18
20
  def studio_config():
@@ -90,6 +92,7 @@ def schema():
90
92
  def remote_dataset_version(schema, dataset_rows):
91
93
  return {
92
94
  "id": 1,
95
+ "uuid": DATASET_UUID,
93
96
  "dataset_id": 1,
94
97
  "version": 1,
95
98
  "status": 4,
@@ -179,6 +182,7 @@ def test_pull_dataset_success(
179
182
  assert dataset_version.schema
180
183
  assert dataset_version.num_objects == 4
181
184
  assert dataset_version.size == 15
185
+ assert dataset_version.uuid == DATASET_UUID
182
186
 
183
187
  assert_row_names(
184
188
  catalog,
Binary file
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes