datachain 0.7.10__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (289) hide show
  1. {datachain-0.7.10 → datachain-0.8.0}/.github/workflows/tests.yml +16 -3
  2. {datachain-0.7.10 → datachain-0.8.0}/.pre-commit-config.yaml +1 -1
  3. {datachain-0.7.10/src/datachain.egg-info → datachain-0.8.0}/PKG-INFO +10 -10
  4. {datachain-0.7.10 → datachain-0.8.0}/README.rst +5 -6
  5. {datachain-0.7.10 → datachain-0.8.0}/docs/contributing.md +4 -0
  6. datachain-0.8.0/docs/css/github-permalink-style.css +39 -0
  7. {datachain-0.7.10 → datachain-0.8.0}/docs/examples.md +4 -1
  8. {datachain-0.7.10 → datachain-0.8.0}/docs/index.md +4 -1
  9. {datachain-0.7.10 → datachain-0.8.0}/docs/quick-start.md +4 -0
  10. {datachain-0.7.10 → datachain-0.8.0}/docs/references/index.md +4 -0
  11. {datachain-0.7.10 → datachain-0.8.0}/docs/tutorials.md +4 -0
  12. datachain-0.8.0/examples/get_started/json-csv-reader.py +82 -0
  13. {datachain-0.7.10 → datachain-0.8.0}/examples/get_started/torch-loader.py +25 -20
  14. {datachain-0.7.10 → datachain-0.8.0}/examples/llm_and_nlp/unstructured-embeddings-gen.py +7 -5
  15. {datachain-0.7.10 → datachain-0.8.0}/mkdocs.yml +18 -16
  16. {datachain-0.7.10 → datachain-0.8.0}/noxfile.py +2 -0
  17. {datachain-0.7.10 → datachain-0.8.0}/pyproject.toml +5 -4
  18. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/catalog/catalog.py +53 -41
  19. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/cli.py +25 -3
  20. datachain-0.8.0/src/datachain/client/__init__.py +3 -0
  21. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/sqlite.py +20 -6
  22. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/dc.py +160 -110
  23. datachain-0.8.0/src/datachain/lib/diff.py +197 -0
  24. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/file.py +2 -1
  25. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/meta_formats.py +40 -43
  26. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/pytorch.py +1 -5
  27. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/signal_schema.py +28 -6
  28. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/dataset.py +5 -1
  29. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/remote/studio.py +53 -1
  30. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/studio.py +47 -2
  31. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/toolkit/split.py +19 -6
  32. {datachain-0.7.10 → datachain-0.8.0/src/datachain.egg-info}/PKG-INFO +10 -10
  33. {datachain-0.7.10 → datachain-0.8.0}/src/datachain.egg-info/SOURCES.txt +3 -0
  34. {datachain-0.7.10 → datachain-0.8.0}/src/datachain.egg-info/requires.txt +4 -3
  35. {datachain-0.7.10 → datachain-0.8.0}/tests/conftest.py +12 -10
  36. {datachain-0.7.10 → datachain-0.8.0}/tests/examples/test_examples.py +14 -29
  37. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_datachain.py +1 -1
  38. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_meta_formats.py +4 -4
  39. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_pull.py +18 -12
  40. datachain-0.8.0/tests/func/test_toolkit.py +51 -0
  41. {datachain-0.7.10 → datachain-0.8.0}/tests/test_cli_studio.py +52 -1
  42. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_datachain.py +3 -3
  43. datachain-0.8.0/tests/unit/lib/test_diff.py +498 -0
  44. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_signal_schema.py +5 -0
  45. datachain-0.7.10/examples/get_started/json-csv-reader.py +0 -101
  46. datachain-0.7.10/src/datachain/client/__init__.py +0 -4
  47. datachain-0.7.10/tests/func/test_toolkit.py +0 -42
  48. {datachain-0.7.10 → datachain-0.8.0}/.cruft.json +0 -0
  49. {datachain-0.7.10 → datachain-0.8.0}/.gitattributes +0 -0
  50. {datachain-0.7.10 → datachain-0.8.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  51. {datachain-0.7.10 → datachain-0.8.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  52. {datachain-0.7.10 → datachain-0.8.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  53. {datachain-0.7.10 → datachain-0.8.0}/.github/codecov.yaml +0 -0
  54. {datachain-0.7.10 → datachain-0.8.0}/.github/dependabot.yml +0 -0
  55. {datachain-0.7.10 → datachain-0.8.0}/.github/workflows/benchmarks.yml +0 -0
  56. {datachain-0.7.10 → datachain-0.8.0}/.github/workflows/release.yml +0 -0
  57. {datachain-0.7.10 → datachain-0.8.0}/.github/workflows/tests-studio.yml +0 -0
  58. {datachain-0.7.10 → datachain-0.8.0}/.github/workflows/update-template.yaml +0 -0
  59. {datachain-0.7.10 → datachain-0.8.0}/.gitignore +0 -0
  60. {datachain-0.7.10 → datachain-0.8.0}/CODE_OF_CONDUCT.rst +0 -0
  61. {datachain-0.7.10 → datachain-0.8.0}/LICENSE +0 -0
  62. {datachain-0.7.10 → datachain-0.8.0}/docs/assets/captioned_cartoons.png +0 -0
  63. {datachain-0.7.10 → datachain-0.8.0}/docs/assets/datachain-white.svg +0 -0
  64. {datachain-0.7.10 → datachain-0.8.0}/docs/assets/datachain.svg +0 -0
  65. {datachain-0.7.10 → datachain-0.8.0}/docs/overrides/main.html +0 -0
  66. {datachain-0.7.10 → datachain-0.8.0}/docs/references/datachain.md +0 -0
  67. {datachain-0.7.10 → datachain-0.8.0}/docs/references/datatype.md +0 -0
  68. {datachain-0.7.10 → datachain-0.8.0}/docs/references/file.md +0 -0
  69. {datachain-0.7.10 → datachain-0.8.0}/docs/references/sql.md +0 -0
  70. {datachain-0.7.10 → datachain-0.8.0}/docs/references/torch.md +0 -0
  71. {datachain-0.7.10 → datachain-0.8.0}/docs/references/udf.md +0 -0
  72. {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  73. {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  74. {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/openimage-detect.py +0 -0
  75. {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  76. {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  77. {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  78. {datachain-0.7.10 → datachain-0.8.0}/examples/get_started/common_sql_functions.py +0 -0
  79. {datachain-0.7.10 → datachain-0.8.0}/examples/get_started/udfs/parallel.py +0 -0
  80. {datachain-0.7.10 → datachain-0.8.0}/examples/get_started/udfs/simple.py +0 -0
  81. {datachain-0.7.10 → datachain-0.8.0}/examples/get_started/udfs/stateful.py +0 -0
  82. {datachain-0.7.10 → datachain-0.8.0}/examples/llm_and_nlp/claude-query.py +0 -0
  83. {datachain-0.7.10 → datachain-0.8.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  84. {datachain-0.7.10 → datachain-0.8.0}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  85. {datachain-0.7.10 → datachain-0.8.0}/examples/multimodal/clip_inference.py +0 -0
  86. {datachain-0.7.10 → datachain-0.8.0}/examples/multimodal/hf_pipeline.py +0 -0
  87. {datachain-0.7.10 → datachain-0.8.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  88. {datachain-0.7.10 → datachain-0.8.0}/examples/multimodal/wds.py +0 -0
  89. {datachain-0.7.10 → datachain-0.8.0}/examples/multimodal/wds_filtered.py +0 -0
  90. {datachain-0.7.10 → datachain-0.8.0}/setup.cfg +0 -0
  91. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/__init__.py +0 -0
  92. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/__main__.py +0 -0
  93. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/asyn.py +0 -0
  94. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/cache.py +0 -0
  95. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/catalog/__init__.py +0 -0
  96. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/catalog/datasource.py +0 -0
  97. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/catalog/loader.py +0 -0
  98. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/cli_utils.py +0 -0
  99. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/azure.py +0 -0
  100. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/fileslice.py +0 -0
  101. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/fsspec.py +0 -0
  102. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/gcs.py +0 -0
  103. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/hf.py +0 -0
  104. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/local.py +0 -0
  105. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/s3.py +0 -0
  106. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/config.py +0 -0
  107. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/__init__.py +0 -0
  108. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/db_engine.py +0 -0
  109. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/job.py +0 -0
  110. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/metastore.py +0 -0
  111. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/schema.py +0 -0
  112. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/serializer.py +0 -0
  113. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/warehouse.py +0 -0
  114. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/dataset.py +0 -0
  115. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/error.py +0 -0
  116. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/__init__.py +0 -0
  117. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/aggregate.py +0 -0
  118. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/array.py +0 -0
  119. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/base.py +0 -0
  120. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/conditional.py +0 -0
  121. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/func.py +0 -0
  122. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/numeric.py +0 -0
  123. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/path.py +0 -0
  124. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/random.py +0 -0
  125. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/string.py +0 -0
  126. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/window.py +0 -0
  127. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/job.py +0 -0
  128. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/__init__.py +0 -0
  129. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/arrow.py +0 -0
  130. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/clip.py +0 -0
  131. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/__init__.py +0 -0
  132. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/flatten.py +0 -0
  133. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  134. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  135. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/unflatten.py +0 -0
  136. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  137. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/data_model.py +0 -0
  138. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/dataset_info.py +0 -0
  139. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/hf.py +0 -0
  140. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/image.py +0 -0
  141. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/listing.py +0 -0
  142. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/listing_info.py +0 -0
  143. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/model_store.py +0 -0
  144. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/settings.py +0 -0
  145. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/tar.py +0 -0
  146. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/text.py +0 -0
  147. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/udf.py +0 -0
  148. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/udf_signature.py +0 -0
  149. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/utils.py +0 -0
  150. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/vfile.py +0 -0
  151. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/webdataset.py +0 -0
  152. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/webdataset_laion.py +0 -0
  153. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/listing.py +0 -0
  154. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/__init__.py +0 -0
  155. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/bbox.py +0 -0
  156. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/pose.py +0 -0
  157. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/segment.py +0 -0
  158. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  159. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  160. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/ultralytics/pose.py +0 -0
  161. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/ultralytics/segment.py +0 -0
  162. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/node.py +0 -0
  163. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/nodes_fetcher.py +0 -0
  164. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/nodes_thread_pool.py +0 -0
  165. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/progress.py +0 -0
  166. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/py.typed +0 -0
  167. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/__init__.py +0 -0
  168. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/batch.py +0 -0
  169. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/dispatch.py +0 -0
  170. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/metrics.py +0 -0
  171. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/params.py +0 -0
  172. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/queue.py +0 -0
  173. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/schema.py +0 -0
  174. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/session.py +0 -0
  175. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/remote/__init__.py +0 -0
  176. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/__init__.py +0 -0
  177. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/default/__init__.py +0 -0
  178. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/default/base.py +0 -0
  179. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/__init__.py +0 -0
  180. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/aggregate.py +0 -0
  181. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/array.py +0 -0
  182. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/conditional.py +0 -0
  183. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/numeric.py +0 -0
  184. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/path.py +0 -0
  185. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/random.py +0 -0
  186. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/string.py +0 -0
  187. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/selectable.py +0 -0
  188. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  189. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/sqlite/base.py +0 -0
  190. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/sqlite/types.py +0 -0
  191. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/sqlite/vector.py +0 -0
  192. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/types.py +0 -0
  193. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/utils.py +0 -0
  194. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/telemetry.py +0 -0
  195. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/toolkit/__init__.py +0 -0
  196. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/torch/__init__.py +0 -0
  197. {datachain-0.7.10 → datachain-0.8.0}/src/datachain/utils.py +0 -0
  198. {datachain-0.7.10 → datachain-0.8.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  199. {datachain-0.7.10 → datachain-0.8.0}/src/datachain.egg-info/entry_points.txt +0 -0
  200. {datachain-0.7.10 → datachain-0.8.0}/src/datachain.egg-info/top_level.txt +0 -0
  201. {datachain-0.7.10 → datachain-0.8.0}/tests/__init__.py +0 -0
  202. {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/__init__.py +0 -0
  203. {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/conftest.py +0 -0
  204. {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  205. {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  206. {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/datasets/.gitignore +0 -0
  207. {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  208. {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/test_datachain.py +0 -0
  209. {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/test_ls.py +0 -0
  210. {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/test_version.py +0 -0
  211. {datachain-0.7.10 → datachain-0.8.0}/tests/data.py +0 -0
  212. {datachain-0.7.10 → datachain-0.8.0}/tests/examples/__init__.py +0 -0
  213. {datachain-0.7.10 → datachain-0.8.0}/tests/examples/test_wds_e2e.py +0 -0
  214. {datachain-0.7.10 → datachain-0.8.0}/tests/examples/wds_data.py +0 -0
  215. {datachain-0.7.10 → datachain-0.8.0}/tests/func/__init__.py +0 -0
  216. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_catalog.py +0 -0
  217. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_client.py +0 -0
  218. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_dataset_query.py +0 -0
  219. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_datasets.py +0 -0
  220. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_feature_pickling.py +0 -0
  221. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_listing.py +0 -0
  222. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_ls.py +0 -0
  223. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_metrics.py +0 -0
  224. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_pytorch.py +0 -0
  225. {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_query.py +0 -0
  226. {datachain-0.7.10 → datachain-0.8.0}/tests/scripts/feature_class.py +0 -0
  227. {datachain-0.7.10 → datachain-0.8.0}/tests/scripts/feature_class_exception.py +0 -0
  228. {datachain-0.7.10 → datachain-0.8.0}/tests/scripts/feature_class_parallel.py +0 -0
  229. {datachain-0.7.10 → datachain-0.8.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  230. {datachain-0.7.10 → datachain-0.8.0}/tests/scripts/name_len_slow.py +0 -0
  231. {datachain-0.7.10 → datachain-0.8.0}/tests/test_atomicity.py +0 -0
  232. {datachain-0.7.10 → datachain-0.8.0}/tests/test_cli_e2e.py +0 -0
  233. {datachain-0.7.10 → datachain-0.8.0}/tests/test_query_e2e.py +0 -0
  234. {datachain-0.7.10 → datachain-0.8.0}/tests/test_telemetry.py +0 -0
  235. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/__init__.py +0 -0
  236. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/__init__.py +0 -0
  237. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/conftest.py +0 -0
  238. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_arrow.py +0 -0
  239. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_clip.py +0 -0
  240. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  241. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  242. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_feature.py +0 -0
  243. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_feature_utils.py +0 -0
  244. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_file.py +0 -0
  245. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_hf.py +0 -0
  246. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_image.py +0 -0
  247. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_listing_info.py +0 -0
  248. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_models.py +0 -0
  249. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_schema.py +0 -0
  250. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  251. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_text.py +0 -0
  252. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_udf_signature.py +0 -0
  253. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_utils.py +0 -0
  254. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_webdataset.py +0 -0
  255. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/__init__.py +0 -0
  256. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  257. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  258. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  259. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_array.py +0 -0
  260. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_conditional.py +0 -0
  261. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_path.py +0 -0
  262. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_random.py +0 -0
  263. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_selectable.py +0 -0
  264. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_string.py +0 -0
  265. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_asyn.py +0 -0
  266. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_cache.py +0 -0
  267. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_catalog.py +0 -0
  268. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_catalog_loader.py +0 -0
  269. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_cli_parsing.py +0 -0
  270. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_client.py +0 -0
  271. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_client_s3.py +0 -0
  272. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_config.py +0 -0
  273. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_data_storage.py +0 -0
  274. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_database_engine.py +0 -0
  275. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_dataset.py +0 -0
  276. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_dispatch.py +0 -0
  277. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_fileslice.py +0 -0
  278. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_func.py +0 -0
  279. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_listing.py +0 -0
  280. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_metastore.py +0 -0
  281. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_module_exports.py +0 -0
  282. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_query.py +0 -0
  283. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_query_metrics.py +0 -0
  284. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_query_params.py +0 -0
  285. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_serializer.py +0 -0
  286. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_session.py +0 -0
  287. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_utils.py +0 -0
  288. {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_warehouse.py +0 -0
  289. {datachain-0.7.10 → datachain-0.8.0}/tests/utils.py +0 -0
@@ -136,7 +136,7 @@ jobs:
136
136
  strategy:
137
137
  fail-fast: false
138
138
  matrix:
139
- os: [ubuntu-latest, macos-latest, windows-latest]
139
+ os: [ubuntu-latest, windows-latest]
140
140
  pyv: ['3.9', '3.12']
141
141
  group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
142
142
  exclude:
@@ -166,7 +166,20 @@ jobs:
166
166
  - name: Install nox
167
167
  run: uv pip install nox --system
168
168
 
169
+ # HF runs against actual API - thus run it only once
170
+ - name: Set hf token
171
+ if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.12'
172
+ run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
173
+
169
174
  - name: Run examples
170
- env:
171
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
172
175
  run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
176
+
177
+ check:
178
+ if: always()
179
+ needs: [lint, datachain, examples]
180
+ runs-on: ubuntu-latest
181
+ steps:
182
+ - uses: re-actors/alls-green@release/v1
183
+ with:
184
+ allowed-failures: examples
185
+ jobs: ${{ toJSON(needs) }}
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.8.1'
27
+ rev: 'v0.8.3'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.7.10
3
+ Version: 0.8.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -46,6 +46,7 @@ Requires-Dist: iterative-telemetry>=0.0.9
46
46
  Requires-Dist: platformdirs
47
47
  Requires-Dist: dvc-studio-client<1,>=0.21
48
48
  Requires-Dist: tabulate
49
+ Requires-Dist: websockets
49
50
  Provides-Extra: docs
50
51
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
51
52
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -91,14 +92,14 @@ Requires-Dist: types-requests; extra == "dev"
91
92
  Requires-Dist: types-tabulate; extra == "dev"
92
93
  Provides-Extra: examples
93
94
  Requires-Dist: datachain[tests]; extra == "examples"
94
- Requires-Dist: numpy<2,>=1; extra == "examples"
95
95
  Requires-Dist: defusedxml; extra == "examples"
96
96
  Requires-Dist: accelerate; extra == "examples"
97
- Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
97
+ Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
98
+ Requires-Dist: unstructured[pdf]; extra == "examples"
98
99
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
99
100
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
100
101
  Requires-Dist: onnx==1.16.1; extra == "examples"
101
- Requires-Dist: ultralytics==8.3.37; extra == "examples"
102
+ Requires-Dist: ultralytics==8.3.50; extra == "examples"
102
103
 
103
104
  ================
104
105
  |logo| DataChain
@@ -138,6 +139,11 @@ Use Cases
138
139
  3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
139
140
  Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
140
141
 
142
+ Getting Started
143
+ ===============
144
+
145
+ Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
146
+ to get started with `DataChain` and learn more.
141
147
 
142
148
  Key Features
143
149
  ============
@@ -161,12 +167,6 @@ Key Features
161
167
  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
162
168
 
163
169
 
164
- Getting Started
165
- ===============
166
-
167
- Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
168
-
169
-
170
170
  Contributing
171
171
  ============
172
172
 
@@ -36,6 +36,11 @@ Use Cases
36
36
  3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
37
37
  Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
38
38
 
39
+ Getting Started
40
+ ===============
41
+
42
+ Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
43
+ to get started with `DataChain` and learn more.
39
44
 
40
45
  Key Features
41
46
  ============
@@ -59,12 +64,6 @@ Key Features
59
64
  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
60
65
 
61
66
 
62
- Getting Started
63
- ===============
64
-
65
- Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
66
-
67
-
68
67
  Contributing
69
68
  ============
70
69
 
@@ -1,3 +1,7 @@
1
+ ---
2
+ title: Contributing
3
+ ---
4
+
1
5
  # Contributor Guide
2
6
 
3
7
  Thank you for your interest in improving this project. This project is
@@ -0,0 +1,39 @@
1
+ .headerlink {
2
+ --permalink-size: 16px; /* for font-relative sizes, 0.6em is a good choice */
3
+ --permalink-spacing: 4px;
4
+
5
+ width: calc(var(--permalink-size) + var(--permalink-spacing));
6
+ height: var(--permalink-size);
7
+ vertical-align: middle;
8
+ background-color: var(--md-default-fg-color--lighter);
9
+ background-size: var(--permalink-size);
10
+ mask-size: var(--permalink-size);
11
+ -webkit-mask-size: var(--permalink-size);
12
+ mask-repeat: no-repeat;
13
+ -webkit-mask-repeat: no-repeat;
14
+ visibility: visible;
15
+ mask-image: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="16" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg>');
16
+ -webkit-mask-image: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="16" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg>');
17
+ }
18
+
19
+ [id]:target .headerlink {
20
+ background-color: var(--md-typeset-a-color);
21
+ }
22
+
23
+ .headerlink:hover {
24
+ background-color: var(--md-accent-fg-color) !important;
25
+ }
26
+
27
+ @media screen and (min-width: 76.25em) {
28
+ h1, h2, h3, h4, h5, h6 {
29
+ display: flex;
30
+ align-items: center;
31
+ flex-direction: row;
32
+ column-gap: 0.2em; /* fixes spaces in titles */
33
+ }
34
+
35
+ .headerlink {
36
+ order: -1;
37
+ margin-left: calc(var(--permalink-size) * -1 - var(--permalink-spacing)) !important;
38
+ }
39
+ }
@@ -1,3 +1,6 @@
1
+ ---
2
+ title: Examples
3
+ ---
1
4
 
2
5
  # Examples
3
6
 
@@ -225,7 +228,7 @@ Here is an example from MS COCO “captions” JSON which employs separate secti
225
228
  }
226
229
  ```
227
230
 
228
- Note how complicated the setup is. Every image is references by the name, and the metadata for this file is keyed by the “id” field. This same field is references later in the “annotations array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
231
+ Note how complicated the setup is. Every image is references by the name, and the metadata for this file is keyed by the “id” field. This same field is references later in the “annotations array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
229
232
 
230
233
  However, Datachain can easily parse the entire COCO structure via several reading and merging operators:
231
234
 
@@ -1,3 +1,6 @@
1
+ ---
2
+ title: Welcome to DataChain
3
+ ---
1
4
  # <a class="main-header-link" href="/" ><img style="display: inline-block;" src="/assets/datachain.svg" alt="DataChain"> <span style="display: inline-block;"> DataChain</span></a>
2
5
 
3
6
  <style>
@@ -83,7 +86,7 @@ The following pages provide detailed documentation on DataChain's features, arch
83
86
  - [🏃🏼‍♂️ Quick Start](quick-start.md): Get up and running with DataChain in no time.
84
87
  - [🎯 Examples](examples.md): Explore practical examples and use cases.
85
88
  - [📚 Tutorials](tutorials.md): Learn how to use DataChain for specific tasks.
86
- - [📚 API Reference](references/index.md): Dive into the technical details and API reference.
89
+ - [🐍 API Reference](references/index.md): Dive into the technical details and API reference.
87
90
  - [🤝 Contributing](contributing.md): Learn how to contribute to DataChain.
88
91
 
89
92
 
@@ -1,3 +1,7 @@
1
+ ---
2
+ title: Quick Start
3
+ ---
4
+
1
5
  # Quick Start
2
6
 
3
7
  ## Installation
@@ -1,3 +1,7 @@
1
+ ---
2
+ title: API Reference
3
+ ---
4
+
1
5
  # API Reference
2
6
 
3
7
  DataChain's API is organized into several modules:
@@ -1,3 +1,7 @@
1
+ ---
2
+ title: Tutorials
3
+ ---
4
+
1
5
  # Tutorials
2
6
 
3
7
  * Multimodal: [GitHub](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb)
@@ -0,0 +1,82 @@
1
+ from typing import Optional
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from datachain import C, DataChain
6
+ from datachain.lib.data_model import ModelStore
7
+ from datachain.lib.meta_formats import gen_datamodel_code
8
+
9
+
10
+ # Sample model for static JSON model
11
+ class LicenseModel(BaseModel):
12
+ url: str
13
+ id: int
14
+ name: str
15
+
16
+
17
+ LicenseFeature = ModelStore.register(LicenseModel)
18
+
19
+
20
+ # Sample model for static CSV model
21
+ class ChatDialog(BaseModel):
22
+ id: Optional[int] = None
23
+ count: Optional[int] = None
24
+ sender: Optional[str] = None
25
+ text: Optional[str] = None
26
+
27
+
28
+ ChatFeature = ModelStore.register(ChatDialog)
29
+
30
+
31
+ def main():
32
+ # Dynamic JSONl schema from 2 objects
33
+ uri = "gs://datachain-demo/jsonl/object.jsonl"
34
+ jsonl_ds = DataChain.from_json(uri, format="jsonl", anon="True")
35
+ jsonl_ds.show()
36
+
37
+ # Dynamic JSON schema from 200 OpenImage json-pairs with validation errors
38
+ uri = "gs://datachain-demo/openimages-v6-test-jsonpairs/*json"
39
+ schema_uri = (
40
+ "gs://datachain-demo/openimages-v6-test-jsonpairs/08392c290ecc9d2a.json"
41
+ )
42
+ json_pairs_ds = DataChain.from_json(
43
+ uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage", anon="True"
44
+ )
45
+ json_pairs_ds.show()
46
+
47
+ uri = "gs://datachain-demo/coco2017/annotations_captions/"
48
+
49
+ # Print JSON schema in Pydantic format from main COCO annotation
50
+ chain = DataChain.from_storage(uri, anon="True").filter(
51
+ C("file.path").glob("*.json")
52
+ )
53
+ file = next(chain.limit(1).collect("file"))
54
+ print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))
55
+
56
+ # Static JSON schema test parsing 3/7 objects
57
+ static_json_ds = DataChain.from_json(
58
+ uri, jmespath="licenses", spec=LicenseFeature, nrows=3, anon="True"
59
+ )
60
+ static_json_ds.show()
61
+
62
+ # Dynamic JSON schema test parsing 5K objects
63
+ dynamic_json_ds = DataChain.from_json(uri, jmespath="images", anon="True")
64
+ print(dynamic_json_ds.to_pandas())
65
+
66
+ # Static CSV with header schema test parsing 3.5K objects
67
+ uri = "gs://datachain-demo/chatbot-csv/"
68
+ static_csv_ds = DataChain.from_csv(
69
+ uri, output=ChatDialog, object_name="chat", anon="True"
70
+ )
71
+ static_csv_ds.print_schema()
72
+ static_csv_ds.show()
73
+
74
+ # Dynamic CSV with header schema test parsing 3/3M objects
75
+ uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
76
+ dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3, anon="True")
77
+ dynamic_csv_ds.print_schema()
78
+ dynamic_csv_ds.show()
79
+
80
+
81
+ if __name__ == "__main__":
82
+ main()
@@ -5,6 +5,7 @@ To install the required dependencies:
5
5
 
6
6
  """
7
7
 
8
+ import multiprocessing
8
9
  import os
9
10
  from posixpath import basename
10
11
 
@@ -12,17 +13,18 @@ import torch
12
13
  from torch import nn, optim
13
14
  from torch.utils.data import DataLoader
14
15
  from torchvision.transforms import v2
16
+ from tqdm import tqdm
15
17
 
16
18
  from datachain import C, DataChain
17
19
  from datachain.torch import label_to_int
18
20
 
19
21
  STORAGE = "gs://datachain-demo/dogs-and-cats/"
20
- NUM_EPOCHS = os.getenv("NUM_EPOCHS", "3")
22
+ NUM_EPOCHS = int(os.getenv("NUM_EPOCHS", "3"))
21
23
 
22
24
  # Define transformation for data preprocessing
23
25
  transform = v2.Compose(
24
26
  [
25
- v2.ToTensor(),
27
+ v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)]),
26
28
  v2.Resize((64, 64)),
27
29
  v2.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
28
30
  ]
@@ -54,6 +56,7 @@ class CNN(nn.Module):
54
56
  if __name__ == "__main__":
55
57
  ds = (
56
58
  DataChain.from_storage(STORAGE, type="image")
59
+ .settings(cache=True, prefetch=25)
57
60
  .filter(C("file.path").glob("*.jpg"))
58
61
  .map(
59
62
  label=lambda path: label_to_int(basename(path)[:3], CLASSES),
@@ -64,8 +67,10 @@ if __name__ == "__main__":
64
67
 
65
68
  train_loader = DataLoader(
66
69
  ds.to_pytorch(transform=transform),
67
- batch_size=16,
68
- num_workers=2,
70
+ batch_size=25,
71
+ num_workers=max(4, os.cpu_count() or 2),
72
+ persistent_workers=True,
73
+ multiprocessing_context=multiprocessing.get_context("spawn"),
69
74
  )
70
75
 
71
76
  model = CNN()
@@ -73,19 +78,19 @@ if __name__ == "__main__":
73
78
  optimizer = optim.Adam(model.parameters(), lr=0.001)
74
79
 
75
80
  # Train the model
76
- for epoch in range(int(NUM_EPOCHS)):
77
- for i, data in enumerate(train_loader):
78
- inputs, labels = data
79
- optimizer.zero_grad()
80
-
81
- # Forward pass
82
- outputs = model(inputs)
83
- loss = criterion(outputs, labels)
84
-
85
- # Backward pass and optimize
86
- loss.backward()
87
- optimizer.step()
88
-
89
- print(f"[{epoch + 1}, {i + 1:5d}] loss: {loss.item():.3f}")
90
-
91
- print("Finished Training")
81
+ for epoch in range(NUM_EPOCHS):
82
+ with tqdm(
83
+ train_loader, desc=f"epoch {epoch + 1}/{NUM_EPOCHS}", unit="batch"
84
+ ) as loader:
85
+ for data in loader:
86
+ inputs, labels = data
87
+ optimizer.zero_grad()
88
+
89
+ # Forward pass
90
+ outputs = model(inputs)
91
+ loss = criterion(outputs, labels)
92
+
93
+ # Backward pass and optimize
94
+ loss.backward()
95
+ optimizer.step()
96
+ loader.set_postfix(loss=loss.item())
@@ -12,11 +12,11 @@ from unstructured.cleaners.core import (
12
12
  group_broken_paragraphs,
13
13
  replace_unicode_quotes,
14
14
  )
15
- from unstructured.embed.huggingface import (
15
+ from unstructured.partition.pdf import partition_pdf
16
+ from unstructured_ingest.embed.huggingface import (
16
17
  HuggingFaceEmbeddingConfig,
17
18
  HuggingFaceEmbeddingEncoder,
18
19
  )
19
- from unstructured.partition.pdf import partition_pdf
20
20
 
21
21
  from datachain import C, DataChain, DataModel, File
22
22
 
@@ -43,6 +43,7 @@ def process_pdf(file: File) -> Iterator[Chunk]:
43
43
  chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
44
44
 
45
45
  # Clean the chunks and add new columns
46
+ text_chunks = []
46
47
  for chunk in chunks:
47
48
  chunk.apply(
48
49
  lambda text: clean(
@@ -51,16 +52,17 @@ def process_pdf(file: File) -> Iterator[Chunk]:
51
52
  )
52
53
  chunk.apply(replace_unicode_quotes)
53
54
  chunk.apply(group_broken_paragraphs)
55
+ text_chunks.append({"text": str(chunk)})
54
56
 
55
57
  # create embeddings
56
- chunks_embedded = embedding_encoder.embed_documents(chunks)
58
+ chunks_embedded = embedding_encoder.embed_documents(text_chunks)
57
59
 
58
60
  # Add new rows to DataChain
59
61
  for chunk in chunks_embedded:
60
62
  yield Chunk(
61
63
  key=file.path,
62
- text=chunk.text,
63
- embeddings=chunk.embeddings,
64
+ text=chunk.get("text"),
65
+ embeddings=chunk.get("embeddings"),
64
66
  )
65
67
 
66
68
 
@@ -27,7 +27,6 @@ theme:
27
27
  - navigation.tabs
28
28
  - navigation.path
29
29
  - navigation.top
30
- - navigation.prune
31
30
  - navigation.footer
32
31
  - toc.follow
33
32
  - content.action.edit
@@ -37,7 +36,6 @@ theme:
37
36
  - content.tooltips
38
37
  - search.highlight
39
38
  - search.suggest
40
- - navigation.sections
41
39
 
42
40
  palette:
43
41
  # Palette toggle for automatic mode
@@ -56,8 +54,8 @@ theme:
56
54
  # Palette toggle for dark mode
57
55
  - media: "(prefers-color-scheme: dark)"
58
56
  scheme: slate
59
- primary: black
60
- accent: lime
57
+ primary: teal
58
+ accent: teal
61
59
  toggle:
62
60
  icon: material/weather-night
63
61
  name: Switch to system preference
@@ -68,18 +66,18 @@ nav:
68
66
  - 🏃🏼‍♂️ Quick Start: quick-start.md
69
67
  - 🎯 Examples: examples.md
70
68
  - 📚 Tutorials: tutorials.md
71
- - 🐍 API Reference: references/index.md
69
+ - 🐍 API Reference:
70
+ - Overview: references/index.md
71
+ - DataChain: references/datachain.md
72
+ - DataType: references/datatype.md
73
+ - File: references/file.md
74
+ - UDF: references/udf.md
75
+ - Torch: references/torch.md
76
+ - SQL: references/sql.md
72
77
  - 🤝 Contributing: contributing.md
73
- - API Reference:
74
- - references/index.md
75
- - references/datachain.md
76
- - references/datatype.md
77
- - references/file.md
78
- - references/udf.md
79
- - references/torch.md
80
- - references/sql.md
81
- - DataChain Website: https://datachain.ai" target="_blank"
82
- - Studio: https://studio.datachain.ai" target="_blank"
78
+
79
+ - DataChain Website ↗: https://datachain.ai" target="_blank"
80
+ - Studio ↗: https://studio.datachain.ai" target="_blank"
83
81
 
84
82
  markdown_extensions:
85
83
  - abbr
@@ -105,7 +103,11 @@ markdown_extensions:
105
103
  - pymdownx.tilde
106
104
  - tables
107
105
  - toc:
108
- permalink: true
106
+ permalink: ''
107
+
108
+ # Custom permalink style: https://github.com/squidfunk/mkdocs-material/discussions/3535
109
+ extra_css:
110
+ - css/github-permalink-style.css
109
111
 
110
112
  extra:
111
113
  social:
@@ -81,6 +81,8 @@ def examples(session: nox.Session) -> None:
81
81
  session.install(".[examples]")
82
82
  session.run(
83
83
  "pytest",
84
+ "--durations=0",
85
+ "tests/examples",
84
86
  "-m",
85
87
  "examples",
86
88
  *session.posargs,
@@ -48,7 +48,8 @@ dependencies = [
48
48
  "iterative-telemetry>=0.0.9",
49
49
  "platformdirs",
50
50
  "dvc-studio-client>=0.21,<1",
51
- "tabulate"
51
+ "tabulate",
52
+ "websockets"
52
53
  ]
53
54
 
54
55
  [project.optional-dependencies]
@@ -104,14 +105,14 @@ dev = [
104
105
  ]
105
106
  examples = [
106
107
  "datachain[tests]",
107
- "numpy>=1,<2",
108
108
  "defusedxml",
109
109
  "accelerate",
110
- "unstructured[pdf,embed-huggingface]<0.16.0",
110
+ "unstructured_ingest[embed-huggingface]",
111
+ "unstructured[pdf]",
111
112
  "pdfplumber==0.11.4",
112
113
  "huggingface_hub[hf_transfer]",
113
114
  "onnx==1.16.1",
114
- "ultralytics==8.3.37"
115
+ "ultralytics==8.3.50"
115
116
  ]
116
117
 
117
118
  [project.urls]