datachain 0.7.9__tar.gz → 0.7.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (291) hide show
  1. {datachain-0.7.9 → datachain-0.7.11}/.github/workflows/tests.yml +33 -5
  2. {datachain-0.7.9 → datachain-0.7.11}/.pre-commit-config.yaml +1 -1
  3. datachain-0.7.11/PKG-INFO +206 -0
  4. datachain-0.7.11/README.rst +104 -0
  5. datachain-0.7.11/docs/contributing.md +115 -0
  6. datachain-0.7.11/docs/css/github-permalink-style.css +39 -0
  7. datachain-0.7.9/docs/index.md → datachain-0.7.11/docs/examples.md +51 -61
  8. datachain-0.7.11/docs/index.md +106 -0
  9. datachain-0.7.11/docs/quick-start.md +290 -0
  10. datachain-0.7.11/docs/references/index.md +14 -0
  11. datachain-0.7.11/docs/tutorials.md +9 -0
  12. {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/torch-loader.py +25 -20
  13. {datachain-0.7.9 → datachain-0.7.11}/examples/llm_and_nlp/unstructured-embeddings-gen.py +7 -5
  14. {datachain-0.7.9 → datachain-0.7.11}/mkdocs.yml +26 -17
  15. {datachain-0.7.9 → datachain-0.7.11}/noxfile.py +2 -0
  16. {datachain-0.7.9 → datachain-0.7.11}/pyproject.toml +3 -3
  17. datachain-0.7.11/src/datachain/client/__init__.py +3 -0
  18. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/fsspec.py +4 -2
  19. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/local.py +9 -4
  20. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/__init__.py +4 -1
  21. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/numeric.py +46 -0
  22. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/string.py +46 -0
  23. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/flatten.py +7 -5
  24. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/unflatten.py +2 -2
  25. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/values_to_tuples.py +1 -1
  26. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/dc.py +5 -1
  27. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/file.py +2 -1
  28. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/meta_formats.py +2 -1
  29. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/pytorch.py +1 -5
  30. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/signal_schema.py +28 -6
  31. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/utils.py +1 -1
  32. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/dataset.py +5 -2
  33. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/numeric.py +12 -0
  34. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/string.py +12 -0
  35. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/sqlite/base.py +40 -0
  36. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/toolkit/split.py +19 -6
  37. datachain-0.7.11/src/datachain.egg-info/PKG-INFO +206 -0
  38. {datachain-0.7.9 → datachain-0.7.11}/src/datachain.egg-info/SOURCES.txt +5 -1
  39. {datachain-0.7.9 → datachain-0.7.11}/src/datachain.egg-info/requires.txt +3 -3
  40. {datachain-0.7.9 → datachain-0.7.11}/tests/conftest.py +12 -10
  41. {datachain-0.7.9 → datachain-0.7.11}/tests/examples/test_examples.py +14 -29
  42. datachain-0.7.11/tests/func/test_toolkit.py +51 -0
  43. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_signal_schema.py +5 -0
  44. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_func.py +60 -2
  45. datachain-0.7.9/CONTRIBUTING.rst +0 -129
  46. datachain-0.7.9/PKG-INFO +0 -488
  47. datachain-0.7.9/README.rst +0 -386
  48. datachain-0.7.9/docs/references/index.md +0 -8
  49. datachain-0.7.9/src/datachain/client/__init__.py +0 -4
  50. datachain-0.7.9/src/datachain.egg-info/PKG-INFO +0 -488
  51. datachain-0.7.9/tests/func/test_toolkit.py +0 -42
  52. {datachain-0.7.9 → datachain-0.7.11}/.cruft.json +0 -0
  53. {datachain-0.7.9 → datachain-0.7.11}/.gitattributes +0 -0
  54. {datachain-0.7.9 → datachain-0.7.11}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  55. {datachain-0.7.9 → datachain-0.7.11}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  56. {datachain-0.7.9 → datachain-0.7.11}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  57. {datachain-0.7.9 → datachain-0.7.11}/.github/codecov.yaml +0 -0
  58. {datachain-0.7.9 → datachain-0.7.11}/.github/dependabot.yml +0 -0
  59. {datachain-0.7.9 → datachain-0.7.11}/.github/workflows/benchmarks.yml +0 -0
  60. {datachain-0.7.9 → datachain-0.7.11}/.github/workflows/release.yml +0 -0
  61. {datachain-0.7.9 → datachain-0.7.11}/.github/workflows/tests-studio.yml +0 -0
  62. {datachain-0.7.9 → datachain-0.7.11}/.github/workflows/update-template.yaml +0 -0
  63. {datachain-0.7.9 → datachain-0.7.11}/.gitignore +0 -0
  64. {datachain-0.7.9 → datachain-0.7.11}/CODE_OF_CONDUCT.rst +0 -0
  65. {datachain-0.7.9 → datachain-0.7.11}/LICENSE +0 -0
  66. {datachain-0.7.9 → datachain-0.7.11}/docs/assets/captioned_cartoons.png +0 -0
  67. {datachain-0.7.9 → datachain-0.7.11}/docs/assets/datachain-white.svg +0 -0
  68. {datachain-0.7.9 → datachain-0.7.11}/docs/assets/datachain.svg +0 -0
  69. {datachain-0.7.9 → datachain-0.7.11}/docs/overrides/main.html +0 -0
  70. {datachain-0.7.9 → datachain-0.7.11}/docs/references/datachain.md +0 -0
  71. {datachain-0.7.9 → datachain-0.7.11}/docs/references/datatype.md +0 -0
  72. {datachain-0.7.9 → datachain-0.7.11}/docs/references/file.md +0 -0
  73. {datachain-0.7.9 → datachain-0.7.11}/docs/references/sql.md +0 -0
  74. {datachain-0.7.9 → datachain-0.7.11}/docs/references/torch.md +0 -0
  75. {datachain-0.7.9 → datachain-0.7.11}/docs/references/udf.md +0 -0
  76. {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  77. {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  78. {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/openimage-detect.py +0 -0
  79. {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/ultralytics-bbox.py +0 -0
  80. {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/ultralytics-pose.py +0 -0
  81. {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/ultralytics-segment.py +0 -0
  82. {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/common_sql_functions.py +0 -0
  83. {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/json-csv-reader.py +0 -0
  84. {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/udfs/parallel.py +0 -0
  85. {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/udfs/simple.py +0 -0
  86. {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/udfs/stateful.py +0 -0
  87. {datachain-0.7.9 → datachain-0.7.11}/examples/llm_and_nlp/claude-query.py +0 -0
  88. {datachain-0.7.9 → datachain-0.7.11}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  89. {datachain-0.7.9 → datachain-0.7.11}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  90. {datachain-0.7.9 → datachain-0.7.11}/examples/multimodal/clip_inference.py +0 -0
  91. {datachain-0.7.9 → datachain-0.7.11}/examples/multimodal/hf_pipeline.py +0 -0
  92. {datachain-0.7.9 → datachain-0.7.11}/examples/multimodal/openai_image_desc_lib.py +0 -0
  93. {datachain-0.7.9 → datachain-0.7.11}/examples/multimodal/wds.py +0 -0
  94. {datachain-0.7.9 → datachain-0.7.11}/examples/multimodal/wds_filtered.py +0 -0
  95. {datachain-0.7.9 → datachain-0.7.11}/setup.cfg +0 -0
  96. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/__init__.py +0 -0
  97. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/__main__.py +0 -0
  98. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/asyn.py +0 -0
  99. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/cache.py +0 -0
  100. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/catalog/__init__.py +0 -0
  101. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/catalog/catalog.py +0 -0
  102. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/catalog/datasource.py +0 -0
  103. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/catalog/loader.py +0 -0
  104. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/cli.py +0 -0
  105. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/cli_utils.py +0 -0
  106. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/azure.py +0 -0
  107. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/fileslice.py +0 -0
  108. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/gcs.py +0 -0
  109. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/hf.py +0 -0
  110. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/s3.py +0 -0
  111. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/config.py +0 -0
  112. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/__init__.py +0 -0
  113. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/db_engine.py +0 -0
  114. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/job.py +0 -0
  115. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/metastore.py +0 -0
  116. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/schema.py +0 -0
  117. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/serializer.py +0 -0
  118. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/sqlite.py +0 -0
  119. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/warehouse.py +0 -0
  120. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/dataset.py +0 -0
  121. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/error.py +0 -0
  122. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/aggregate.py +0 -0
  123. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/array.py +0 -0
  124. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/base.py +0 -0
  125. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/conditional.py +0 -0
  126. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/func.py +0 -0
  127. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/path.py +0 -0
  128. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/random.py +0 -0
  129. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/window.py +0 -0
  130. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/job.py +0 -0
  131. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/__init__.py +0 -0
  132. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/arrow.py +0 -0
  133. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/clip.py +0 -0
  134. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/__init__.py +0 -0
  135. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/python_to_sql.py +0 -0
  136. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/sql_to_python.py +0 -0
  137. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/data_model.py +0 -0
  138. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/dataset_info.py +0 -0
  139. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/hf.py +0 -0
  140. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/image.py +0 -0
  141. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/listing.py +0 -0
  142. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/listing_info.py +0 -0
  143. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/model_store.py +0 -0
  144. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/settings.py +0 -0
  145. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/tar.py +0 -0
  146. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/text.py +0 -0
  147. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/udf.py +0 -0
  148. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/udf_signature.py +0 -0
  149. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/vfile.py +0 -0
  150. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/webdataset.py +0 -0
  151. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/webdataset_laion.py +0 -0
  152. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/listing.py +0 -0
  153. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/__init__.py +0 -0
  154. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/bbox.py +0 -0
  155. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/pose.py +0 -0
  156. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/segment.py +0 -0
  157. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/ultralytics/__init__.py +0 -0
  158. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/ultralytics/bbox.py +0 -0
  159. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/ultralytics/pose.py +0 -0
  160. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/ultralytics/segment.py +0 -0
  161. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/node.py +0 -0
  162. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/nodes_fetcher.py +0 -0
  163. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/nodes_thread_pool.py +0 -0
  164. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/progress.py +0 -0
  165. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/py.typed +0 -0
  166. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/__init__.py +0 -0
  167. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/batch.py +0 -0
  168. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/dispatch.py +0 -0
  169. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/metrics.py +0 -0
  170. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/params.py +0 -0
  171. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/queue.py +0 -0
  172. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/schema.py +0 -0
  173. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/session.py +0 -0
  174. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/remote/__init__.py +0 -0
  175. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/remote/studio.py +0 -0
  176. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/__init__.py +0 -0
  177. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/default/__init__.py +0 -0
  178. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/default/base.py +0 -0
  179. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/__init__.py +0 -0
  180. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/aggregate.py +0 -0
  181. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/array.py +0 -0
  182. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/conditional.py +0 -0
  183. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/path.py +0 -0
  184. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/random.py +0 -0
  185. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/selectable.py +0 -0
  186. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/sqlite/__init__.py +0 -0
  187. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/sqlite/types.py +0 -0
  188. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/sqlite/vector.py +0 -0
  189. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/types.py +0 -0
  190. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/utils.py +0 -0
  191. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/studio.py +0 -0
  192. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/telemetry.py +0 -0
  193. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/toolkit/__init__.py +0 -0
  194. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/torch/__init__.py +0 -0
  195. {datachain-0.7.9 → datachain-0.7.11}/src/datachain/utils.py +0 -0
  196. {datachain-0.7.9 → datachain-0.7.11}/src/datachain.egg-info/dependency_links.txt +0 -0
  197. {datachain-0.7.9 → datachain-0.7.11}/src/datachain.egg-info/entry_points.txt +0 -0
  198. {datachain-0.7.9 → datachain-0.7.11}/src/datachain.egg-info/top_level.txt +0 -0
  199. {datachain-0.7.9 → datachain-0.7.11}/tests/__init__.py +0 -0
  200. {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/__init__.py +0 -0
  201. {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/conftest.py +0 -0
  202. {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  203. {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/datasets/.dvc/config +0 -0
  204. {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/datasets/.gitignore +0 -0
  205. {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  206. {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/test_datachain.py +0 -0
  207. {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/test_ls.py +0 -0
  208. {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/test_version.py +0 -0
  209. {datachain-0.7.9 → datachain-0.7.11}/tests/data.py +0 -0
  210. {datachain-0.7.9 → datachain-0.7.11}/tests/examples/__init__.py +0 -0
  211. {datachain-0.7.9 → datachain-0.7.11}/tests/examples/test_wds_e2e.py +0 -0
  212. {datachain-0.7.9 → datachain-0.7.11}/tests/examples/wds_data.py +0 -0
  213. {datachain-0.7.9 → datachain-0.7.11}/tests/func/__init__.py +0 -0
  214. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_catalog.py +0 -0
  215. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_client.py +0 -0
  216. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_datachain.py +0 -0
  217. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_dataset_query.py +0 -0
  218. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_datasets.py +0 -0
  219. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_feature_pickling.py +0 -0
  220. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_listing.py +0 -0
  221. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_ls.py +0 -0
  222. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_meta_formats.py +0 -0
  223. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_metrics.py +0 -0
  224. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_pull.py +0 -0
  225. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_pytorch.py +0 -0
  226. {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_query.py +0 -0
  227. {datachain-0.7.9 → datachain-0.7.11}/tests/scripts/feature_class.py +0 -0
  228. {datachain-0.7.9 → datachain-0.7.11}/tests/scripts/feature_class_exception.py +0 -0
  229. {datachain-0.7.9 → datachain-0.7.11}/tests/scripts/feature_class_parallel.py +0 -0
  230. {datachain-0.7.9 → datachain-0.7.11}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  231. {datachain-0.7.9 → datachain-0.7.11}/tests/scripts/name_len_slow.py +0 -0
  232. {datachain-0.7.9 → datachain-0.7.11}/tests/test_atomicity.py +0 -0
  233. {datachain-0.7.9 → datachain-0.7.11}/tests/test_cli_e2e.py +0 -0
  234. {datachain-0.7.9 → datachain-0.7.11}/tests/test_cli_studio.py +0 -0
  235. {datachain-0.7.9 → datachain-0.7.11}/tests/test_query_e2e.py +0 -0
  236. {datachain-0.7.9 → datachain-0.7.11}/tests/test_telemetry.py +0 -0
  237. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/__init__.py +0 -0
  238. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/__init__.py +0 -0
  239. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/conftest.py +0 -0
  240. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_arrow.py +0 -0
  241. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_clip.py +0 -0
  242. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_datachain.py +0 -0
  243. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  244. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_datachain_merge.py +0 -0
  245. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_feature.py +0 -0
  246. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_feature_utils.py +0 -0
  247. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_file.py +0 -0
  248. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_hf.py +0 -0
  249. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_image.py +0 -0
  250. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_listing_info.py +0 -0
  251. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_models.py +0 -0
  252. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_schema.py +0 -0
  253. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_sql_to_python.py +0 -0
  254. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_text.py +0 -0
  255. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_udf_signature.py +0 -0
  256. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_utils.py +0 -0
  257. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_webdataset.py +0 -0
  258. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/__init__.py +0 -0
  259. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/sqlite/__init__.py +0 -0
  260. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/sqlite/test_types.py +0 -0
  261. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/sqlite/test_utils.py +0 -0
  262. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_array.py +0 -0
  263. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_conditional.py +0 -0
  264. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_path.py +0 -0
  265. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_random.py +0 -0
  266. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_selectable.py +0 -0
  267. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_string.py +0 -0
  268. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_asyn.py +0 -0
  269. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_cache.py +0 -0
  270. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_catalog.py +0 -0
  271. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_catalog_loader.py +0 -0
  272. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_cli_parsing.py +0 -0
  273. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_client.py +0 -0
  274. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_client_s3.py +0 -0
  275. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_config.py +0 -0
  276. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_data_storage.py +0 -0
  277. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_database_engine.py +0 -0
  278. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_dataset.py +0 -0
  279. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_dispatch.py +0 -0
  280. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_fileslice.py +0 -0
  281. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_listing.py +0 -0
  282. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_metastore.py +0 -0
  283. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_module_exports.py +0 -0
  284. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_query.py +0 -0
  285. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_query_metrics.py +0 -0
  286. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_query_params.py +0 -0
  287. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_serializer.py +0 -0
  288. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_session.py +0 -0
  289. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_utils.py +0 -0
  290. {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_warehouse.py +0 -0
  291. {datachain-0.7.9 → datachain-0.7.11}/tests/utils.py +0 -0
@@ -3,7 +3,7 @@ name: Tests
3
3
  on:
4
4
  push:
5
5
  branches: [main]
6
- pull_request:
6
+ pull_request_target:
7
7
  workflow_dispatch:
8
8
 
9
9
  env:
@@ -14,13 +14,22 @@ concurrency:
14
14
  cancel-in-progress: true
15
15
 
16
16
  jobs:
17
+ authorize:
18
+ environment: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository && 'external' || 'internal' }}
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ - run: true
22
+
17
23
  lint:
24
+ needs: authorize
25
+
18
26
  runs-on: ubuntu-latest
19
27
  steps:
20
28
  - name: Check out the repository
21
29
  uses: actions/checkout@v4
22
30
  with:
23
31
  fetch-depth: 0
32
+ ref: ${{ github.event.pull_request.head.sha || github.ref }}
24
33
 
25
34
  - name: Set up Python 3.9
26
35
  uses: actions/setup-python@v5
@@ -53,6 +62,8 @@ jobs:
53
62
  run: nox -s lint
54
63
 
55
64
  datachain:
65
+ needs: authorize
66
+
56
67
  timeout-minutes: 40
57
68
  runs-on: ${{ matrix.os }}
58
69
  strategy:
@@ -75,6 +86,7 @@ jobs:
75
86
  uses: actions/checkout@v4
76
87
  with:
77
88
  fetch-depth: 0
89
+ ref: ${{ github.event.pull_request.head.sha || github.ref }}
78
90
 
79
91
  - name: Set up Python ${{ matrix.pyv }}
80
92
  uses: actions/setup-python@v5
@@ -117,12 +129,14 @@ jobs:
117
129
  run: nox -s docs
118
130
 
119
131
  examples:
132
+ needs: authorize
133
+
120
134
  runs-on: ${{ matrix.os }}
121
135
  timeout-minutes: 60
122
136
  strategy:
123
137
  fail-fast: false
124
138
  matrix:
125
- os: [ubuntu-latest, macos-latest, windows-latest]
139
+ os: [ubuntu-latest, windows-latest]
126
140
  pyv: ['3.9', '3.12']
127
141
  group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
128
142
  exclude:
@@ -132,9 +146,10 @@ jobs:
132
146
  - {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
133
147
  - {os: ubuntu-latest-4-cores, pyv: "3.12", group: multimodal}
134
148
 
135
-
136
149
  steps:
137
150
  - uses: actions/checkout@v4
151
+ with:
152
+ ref: ${{ github.event.pull_request.head.sha || github.ref }}
138
153
 
139
154
  - name: Set up Python ${{ matrix.pyv }}
140
155
  uses: actions/setup-python@v5
@@ -151,7 +166,20 @@ jobs:
151
166
  - name: Install nox
152
167
  run: uv pip install nox --system
153
168
 
169
+ # HF runs against actual API - thus run it only once
170
+ - name: Set hf token
171
+ if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.12'
172
+ run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
173
+
154
174
  - name: Run examples
155
- env:
156
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
157
175
  run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
176
+
177
+ check:
178
+ if: always()
179
+ needs: [lint, datachain, examples]
180
+ runs-on: ubuntu-latest
181
+ steps:
182
+ - uses: re-actors/alls-green@release/v1
183
+ with:
184
+ allowed-failures: examples
185
+ jobs: ${{ toJSON(needs) }}
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.8.1'
27
+ rev: 'v0.8.2'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -0,0 +1,206 @@
1
+ Metadata-Version: 2.1
2
+ Name: datachain
3
+ Version: 0.7.11
4
+ Summary: Wrangle unstructured AI data at scale
5
+ Author-email: Dmitry Petrov <support@dvc.org>
6
+ License: Apache-2.0
7
+ Project-URL: Documentation, https://datachain.dvc.ai
8
+ Project-URL: Issues, https://github.com/iterative/datachain/issues
9
+ Project-URL: Source, https://github.com/iterative/datachain
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Development Status :: 2 - Pre-Alpha
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/x-rst
18
+ License-File: LICENSE
19
+ Requires-Dist: pyyaml
20
+ Requires-Dist: tomlkit
21
+ Requires-Dist: tqdm
22
+ Requires-Dist: numpy<3,>=1
23
+ Requires-Dist: pandas>=2.0.0
24
+ Requires-Dist: pyarrow
25
+ Requires-Dist: typing-extensions
26
+ Requires-Dist: python-dateutil>=2
27
+ Requires-Dist: attrs>=21.3.0
28
+ Requires-Dist: s3fs>=2024.2.0
29
+ Requires-Dist: gcsfs>=2024.2.0
30
+ Requires-Dist: adlfs>=2024.2.0
31
+ Requires-Dist: dvc-data<4,>=3.10
32
+ Requires-Dist: dvc-objects<6,>=4
33
+ Requires-Dist: shtab<2,>=1.3.4
34
+ Requires-Dist: sqlalchemy>=2
35
+ Requires-Dist: multiprocess==0.70.16
36
+ Requires-Dist: cloudpickle
37
+ Requires-Dist: orjson>=3.10.5
38
+ Requires-Dist: pydantic<3,>=2
39
+ Requires-Dist: jmespath>=1.0
40
+ Requires-Dist: datamodel-code-generator>=0.25
41
+ Requires-Dist: Pillow<12,>=10.0.0
42
+ Requires-Dist: msgpack<2,>=1.0.4
43
+ Requires-Dist: psutil
44
+ Requires-Dist: huggingface_hub
45
+ Requires-Dist: iterative-telemetry>=0.0.9
46
+ Requires-Dist: platformdirs
47
+ Requires-Dist: dvc-studio-client<1,>=0.21
48
+ Requires-Dist: tabulate
49
+ Provides-Extra: docs
50
+ Requires-Dist: mkdocs>=1.5.2; extra == "docs"
51
+ Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
52
+ Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
53
+ Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
54
+ Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
55
+ Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
56
+ Provides-Extra: torch
57
+ Requires-Dist: torch>=2.1.0; extra == "torch"
58
+ Requires-Dist: torchvision; extra == "torch"
59
+ Requires-Dist: transformers>=4.36.0; extra == "torch"
60
+ Provides-Extra: remote
61
+ Requires-Dist: lz4; extra == "remote"
62
+ Requires-Dist: requests>=2.22.0; extra == "remote"
63
+ Provides-Extra: vector
64
+ Requires-Dist: usearch; extra == "vector"
65
+ Provides-Extra: hf
66
+ Requires-Dist: numba>=0.60.0; extra == "hf"
67
+ Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
68
+ Provides-Extra: tests
69
+ Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
70
+ Requires-Dist: pytest<9,>=8; extra == "tests"
71
+ Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
72
+ Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
73
+ Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
74
+ Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
75
+ Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
76
+ Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
77
+ Requires-Dist: virtualenv; extra == "tests"
78
+ Requires-Dist: dulwich; extra == "tests"
79
+ Requires-Dist: hypothesis; extra == "tests"
80
+ Requires-Dist: open_clip_torch; extra == "tests"
81
+ Requires-Dist: aiotools>=1.7.0; extra == "tests"
82
+ Requires-Dist: requests-mock; extra == "tests"
83
+ Requires-Dist: scipy; extra == "tests"
84
+ Provides-Extra: dev
85
+ Requires-Dist: datachain[docs,tests]; extra == "dev"
86
+ Requires-Dist: mypy==1.13.0; extra == "dev"
87
+ Requires-Dist: types-python-dateutil; extra == "dev"
88
+ Requires-Dist: types-pytz; extra == "dev"
89
+ Requires-Dist: types-PyYAML; extra == "dev"
90
+ Requires-Dist: types-requests; extra == "dev"
91
+ Requires-Dist: types-tabulate; extra == "dev"
92
+ Provides-Extra: examples
93
+ Requires-Dist: datachain[tests]; extra == "examples"
94
+ Requires-Dist: defusedxml; extra == "examples"
95
+ Requires-Dist: accelerate; extra == "examples"
96
+ Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
97
+ Requires-Dist: unstructured[pdf]; extra == "examples"
98
+ Requires-Dist: pdfplumber==0.11.4; extra == "examples"
99
+ Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
100
+ Requires-Dist: onnx==1.16.1; extra == "examples"
101
+ Requires-Dist: ultralytics==8.3.48; extra == "examples"
102
+
103
+ ================
104
+ |logo| DataChain
105
+ ================
106
+
107
+ |PyPI| |Python Version| |Codecov| |Tests|
108
+
109
+ .. |logo| image:: docs/assets/datachain.svg
110
+ :height: 24
111
+ .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
112
+ :target: https://pypi.org/project/datachain/
113
+ :alt: PyPI
114
+ .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
115
+ :target: https://pypi.org/project/datachain
116
+ :alt: Python Version
117
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
118
+ :target: https://codecov.io/gh/iterative/datachain
119
+ :alt: Codecov
120
+ .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
121
+ :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
122
+ :alt: Tests
123
+
124
+ DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
125
+ data like images, audio, videos, text and PDFs. It integrates with external storage
126
+ (e.g. S3) to process data efficiently without data duplication and manages metadata
127
+ in an internal database for easy and efficient querying.
128
+
129
+
130
+ Use Cases
131
+ =========
132
+
133
+ 1. **ETL.** Pythonic framework for describing and running unstructured data transformations
134
+ and enrichments, applying models to data, including LLMs.
135
+ 2. **Analytics.** DataChain dataset is a table that combines all the information about data
136
+ objects in one place + it provides dataframe-like API and vectorized engine to do analytics
137
+ on these tables at scale.
138
+ 3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
139
+ Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
140
+
141
+ Getting Started
142
+ ===============
143
+
144
+ Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
145
+ to get started with `DataChain` and learn more.
146
+
147
+ Key Features
148
+ ============
149
+
150
+ 📂 **Multimodal Dataset Versioning.**
151
+ - Version unstructured data without moving or creating data copies, by supporting
152
+ references to S3, GCP, Azure, and local file systems.
153
+ - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
154
+ - Unite files and metadata together into persistent, versioned, columnar datasets.
155
+
156
+ 🐍 **Python-friendly.**
157
+ - Operate on Python objects and object fields: float scores, strings, matrixes,
158
+ LLM response objects.
159
+ - Run Python code on high-scale, terabyte-size datasets, with built-in
160
+ parallelization and memory-efficient computing — no SQL or Spark required.
161
+
162
+ 🧠 **Data Enrichment and Processing.**
163
+ - Generate metadata using local AI models and LLM APIs.
164
+ - Filter, join, and group datasets by metadata. Search by vector embeddings.
165
+ - High-performance vectorized operations on Python objects: sum, count, avg, etc.
166
+ - Pass datasets to PyTorch and TensorFlow, or export them back into storage.
167
+
168
+
169
+ Contributing
170
+ ============
171
+
172
+ Contributions are very welcome. To learn more, see the `Contributor Guide`_.
173
+
174
+
175
+ Community and Support
176
+ =====================
177
+
178
+ * `Docs <https://docs.datachain.ai/>`_
179
+ * `File an issue`_ if you encounter any problems
180
+ * `Discord Chat <https://dvc.org/chat>`_
181
+ * `Email <mailto:support@dvc.org>`_
182
+ * `Twitter <https://twitter.com/DVCorg>`_
183
+
184
+
185
+ DataChain Studio Platform
186
+ =========================
187
+
188
+ `DataChain Studio`_ is a proprietary solution for teams that offers:
189
+
190
+ - **Centralized dataset registry** to manage data, code and
190
+ dependencies in one place.
192
+ - **Data Lineage** for data sources as well as derivative datasets.
193
+ - **UI for Multimodal Data** like images, videos, and PDFs.
194
+ - **Scalable Compute** to handle large datasets (100M+ files) and in-house
195
+ AI model inference.
196
+ - **Access control** including SSO and team-based collaboration.
197
+
198
+ .. _PyPI: https://pypi.org/
199
+ .. _file an issue: https://github.com/iterative/datachain/issues
200
+ .. github-only
201
+ .. _Contributor Guide: https://docs.datachain.ai/contributing
202
+ .. _Pydantic: https://github.com/pydantic/pydantic
203
+ .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
204
+ .. _SQLite: https://www.sqlite.org/
205
+ .. _Getting Started: https://docs.datachain.ai/
206
+ .. _DataChain Studio: https://studio.datachain.ai/
@@ -0,0 +1,104 @@
1
+ ================
2
+ |logo| DataChain
3
+ ================
4
+
5
+ |PyPI| |Python Version| |Codecov| |Tests|
6
+
7
+ .. |logo| image:: docs/assets/datachain.svg
8
+ :height: 24
9
+ .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
10
+ :target: https://pypi.org/project/datachain/
11
+ :alt: PyPI
12
+ .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
13
+ :target: https://pypi.org/project/datachain
14
+ :alt: Python Version
15
+ .. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
16
+ :target: https://codecov.io/gh/iterative/datachain
17
+ :alt: Codecov
18
+ .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
19
+ :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
20
+ :alt: Tests
21
+
22
+ DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
23
+ data like images, audio, videos, text and PDFs. It integrates with external storage
24
+ (e.g. S3) to process data efficiently without data duplication and manages metadata
25
+ in an internal database for easy and efficient querying.
26
+
27
+
28
+ Use Cases
29
+ =========
30
+
31
+ 1. **ETL.** Pythonic framework for describing and running unstructured data transformations
32
+ and enrichments, applying models to data, including LLMs.
33
+ 2. **Analytics.** DataChain dataset is a table that combines all the information about data
34
+ objects in one place + it provides dataframe-like API and vectorized engine to do analytics
35
+ on these tables at scale.
36
+ 3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
37
+ Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
38
+
39
+ Getting Started
40
+ ===============
41
+
42
+ Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
43
+ to get started with `DataChain` and learn more.
44
+
45
+ Key Features
46
+ ============
47
+
48
+ 📂 **Multimodal Dataset Versioning.**
49
+ - Version unstructured data without moving or creating data copies, by supporting
50
+ references to S3, GCP, Azure, and local file systems.
51
+ - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
52
+ - Unite files and metadata together into persistent, versioned, columnar datasets.
53
+
54
+ 🐍 **Python-friendly.**
55
+ - Operate on Python objects and object fields: float scores, strings, matrixes,
56
+ LLM response objects.
57
+ - Run Python code on high-scale, terabyte-size datasets, with built-in
58
+ parallelization and memory-efficient computing — no SQL or Spark required.
59
+
60
+ 🧠 **Data Enrichment and Processing.**
61
+ - Generate metadata using local AI models and LLM APIs.
62
+ - Filter, join, and group datasets by metadata. Search by vector embeddings.
63
+ - High-performance vectorized operations on Python objects: sum, count, avg, etc.
64
+ - Pass datasets to PyTorch and TensorFlow, or export them back into storage.
65
+
66
+
67
+ Contributing
68
+ ============
69
+
70
+ Contributions are very welcome. To learn more, see the `Contributor Guide`_.
71
+
72
+
73
+ Community and Support
74
+ =====================
75
+
76
+ * `Docs <https://docs.datachain.ai/>`_
77
+ * `File an issue`_ if you encounter any problems
78
+ * `Discord Chat <https://dvc.org/chat>`_
79
+ * `Email <mailto:support@dvc.org>`_
80
+ * `Twitter <https://twitter.com/DVCorg>`_
81
+
82
+
83
+ DataChain Studio Platform
84
+ =========================
85
+
86
+ `DataChain Studio`_ is a proprietary solution for teams that offers:
87
+
88
+ - **Centralized dataset registry** to manage data, code and
89
+ dependencies in one place.
90
+ - **Data Lineage** for data sources as well as derivative datasets.
91
+ - **UI for Multimodal Data** like images, videos, and PDFs.
92
+ - **Scalable Compute** to handle large datasets (100M+ files) and in-house
93
+ AI model inference.
94
+ - **Access control** including SSO and team-based collaboration.
95
+
96
+ .. _PyPI: https://pypi.org/
97
+ .. _file an issue: https://github.com/iterative/datachain/issues
98
+ .. github-only
99
+ .. _Contributor Guide: https://docs.datachain.ai/contributing
100
+ .. _Pydantic: https://github.com/pydantic/pydantic
101
+ .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
102
+ .. _SQLite: https://www.sqlite.org/
103
+ .. _Getting Started: https://docs.datachain.ai/
104
+ .. _DataChain Studio: https://studio.datachain.ai/
@@ -0,0 +1,115 @@
1
+ ---
2
+ title: Contributing
3
+ ---
4
+
5
+ # Contributor Guide
6
+
7
+ Thank you for your interest in improving this project. This project is
8
+ open-source under the [Apache 2.0
9
+ license](https://opensource.org/licenses/Apache-2.0) and welcomes
10
+ contributions in the form of bug reports, feature requests, and pull
11
+ requests.
12
+
13
+ Here is a list of important resources for contributors:
14
+
15
+ - [Source Code](https://github.com/iterative/datachain)
16
+ - [Documentation](https://docs.datachain.ai/)
17
+ - [Issue Tracker](https://github.com/iterative/datachain/issues)
18
+ - [Code of Conduct](https://github.com/iterative/datachain?tab=coc-ov-file)
19
+
20
+ ## How to report a bug
21
+
22
+ Report bugs on the [Issue
23
+ Tracker](https://github.com/iterative/datachain/issues).
24
+
25
+ When filing an issue, make sure to answer these questions:
26
+
27
+ - Which operating system and Python version are you using?
28
+ - Which version of this project are you using?
29
+ - What did you do?
30
+ - What did you expect to see?
31
+ - What did you see instead?
32
+
33
+ The best way to get your bug fixed is to provide a test case, and/or
34
+ steps to reproduce the issue.
35
+
36
+ ## How to request a feature
37
+
38
+ Request features on the [Issue
39
+ Tracker](https://github.com/iterative/datachain/issues).
40
+
41
+ ## How to set up your development environment
42
+
43
+ You need Python 3.8+ and the following tools:
44
+
45
+ - [Nox](https://nox.thea.codes/)
46
+
47
+ Install the package with development requirements:
48
+
49
+ ``` console
50
+ $ pip install nox
51
+ ```
52
+
53
+ ## How to test the project
54
+
55
+ Run the full test suite:
56
+
57
+ ``` console
58
+ $ nox
59
+ ```
60
+
61
+ List the available Nox sessions:
62
+
63
+ ``` console
64
+ $ nox --list-sessions
65
+ ```
66
+
67
+ You can also run a specific Nox session. For example, invoke the unit
68
+ test suite like this:
69
+
70
+ ``` console
71
+ $ nox --session=tests
72
+ ```
73
+
74
+ Unit tests are located in the `tests` directory, and are written using
75
+ the [pytest](https://pytest.readthedocs.io/) testing framework.
76
+
77
+ ## Build documentation
78
+
79
+ If you've made any changes to the documentation (including changes to
80
+ function signatures, class definitions, or docstrings that will appear
81
+ in the API documentation), make sure it builds successfully.
82
+
83
+ ``` console
84
+ $ nox -s docs
85
+ ```
86
+
87
+ In order to run this locally with hot reload on changes:
88
+
89
+ ``` console
90
+ $ mkdocs serve
91
+ ```
92
+
93
+ ## How to submit changes
94
+
95
+ Open a [pull request](https://github.com/iterative/datachain/pulls) to
96
+ submit changes to this project.
97
+
98
+ Your pull request needs to meet the following guidelines for acceptance:
99
+
100
+ - The Nox test suite must pass without errors or warnings.
101
+ - Include unit tests. This project maintains 100% code coverage.
102
+ - If your changes add functionality, update the documentation
103
+ accordingly.
104
+
105
+ Feel free to submit early, though---we can always iterate on this.
106
+
107
+ To run linting and code formatting checks, you can invoke a `lint` session in nox:
108
+
109
+ ``` console
110
+ $ nox -s lint
111
+ ```
112
+
113
+ It is recommended to open an issue before starting work on anything.
114
+ This will allow a chance to talk it over with the owners and validate
115
+ your approach.
@@ -0,0 +1,39 @@
1
+ .headerlink {
2
+ --permalink-size: 16px; /* for font-relative sizes, 0.6em is a good choice */
3
+ --permalink-spacing: 4px;
4
+
5
+ width: calc(var(--permalink-size) + var(--permalink-spacing));
6
+ height: var(--permalink-size);
7
+ vertical-align: middle;
8
+ background-color: var(--md-default-fg-color--lighter);
9
+ background-size: var(--permalink-size);
10
+ mask-size: var(--permalink-size);
11
+ -webkit-mask-size: var(--permalink-size);
12
+ mask-repeat: no-repeat;
13
+ -webkit-mask-repeat: no-repeat;
14
+ visibility: visible;
15
+ mask-image: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="16" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg>');
16
+ -webkit-mask-image: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="16" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg>');
17
+ }
18
+
19
+ [id]:target .headerlink {
20
+ background-color: var(--md-typeset-a-color);
21
+ }
22
+
23
+ .headerlink:hover {
24
+ background-color: var(--md-accent-fg-color) !important;
25
+ }
26
+
27
+ @media screen and (min-width: 76.25em) {
28
+ h1, h2, h3, h4, h5, h6 {
29
+ display: flex;
30
+ align-items: center;
31
+ flex-direction: row;
32
+ column-gap: 0.2em; /* fixes spaces in titles */
33
+ }
34
+
35
+ .headerlink {
36
+ order: -1;
37
+ margin-left: calc(var(--permalink-size) * -1 - var(--permalink-spacing)) !important;
38
+ }
39
+ }