datachain 0.6.4__tar.gz → 0.6.6__tar.gz

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (257)
  1. {datachain-0.6.4 → datachain-0.6.6}/.github/workflows/tests.yml +2 -0
  2. {datachain-0.6.4 → datachain-0.6.6}/.pre-commit-config.yaml +1 -1
  3. {datachain-0.6.4/src/datachain.egg-info → datachain-0.6.6}/PKG-INFO +1 -1
  4. datachain-0.6.6/examples/llm_and_nlp/hf-dataset-llm-eval.py +59 -0
  5. {datachain-0.6.4 → datachain-0.6.6}/overrides/main.html +1 -0
  6. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/client/hf.py +1 -0
  7. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/arrow.py +1 -1
  8. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/dc.py +17 -4
  9. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/hf.py +4 -6
  10. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/query/dataset.py +30 -1
  11. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/types.py +29 -0
  12. {datachain-0.6.4 → datachain-0.6.6/src/datachain.egg-info}/PKG-INFO +1 -1
  13. {datachain-0.6.4 → datachain-0.6.6}/src/datachain.egg-info/SOURCES.txt +1 -0
  14. {datachain-0.6.4 → datachain-0.6.6}/tests/examples/test_examples.py +5 -1
  15. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_datachain.py +24 -0
  16. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_listing.py +1 -1
  17. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_datachain.py +28 -2
  18. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_signal_schema.py +2 -0
  19. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_data_storage.py +2 -0
  20. {datachain-0.6.4 → datachain-0.6.6}/.cruft.json +0 -0
  21. {datachain-0.6.4 → datachain-0.6.6}/.gitattributes +0 -0
  22. {datachain-0.6.4 → datachain-0.6.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  23. {datachain-0.6.4 → datachain-0.6.6}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  24. {datachain-0.6.4 → datachain-0.6.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  25. {datachain-0.6.4 → datachain-0.6.6}/.github/codecov.yaml +0 -0
  26. {datachain-0.6.4 → datachain-0.6.6}/.github/dependabot.yml +0 -0
  27. {datachain-0.6.4 → datachain-0.6.6}/.github/workflows/benchmarks.yml +0 -0
  28. {datachain-0.6.4 → datachain-0.6.6}/.github/workflows/release.yml +0 -0
  29. {datachain-0.6.4 → datachain-0.6.6}/.github/workflows/tests-studio.yml +0 -0
  30. {datachain-0.6.4 → datachain-0.6.6}/.github/workflows/update-template.yaml +0 -0
  31. {datachain-0.6.4 → datachain-0.6.6}/.gitignore +0 -0
  32. {datachain-0.6.4 → datachain-0.6.6}/CODE_OF_CONDUCT.rst +0 -0
  33. {datachain-0.6.4 → datachain-0.6.6}/CONTRIBUTING.rst +0 -0
  34. {datachain-0.6.4 → datachain-0.6.6}/LICENSE +0 -0
  35. {datachain-0.6.4 → datachain-0.6.6}/README.rst +0 -0
  36. {datachain-0.6.4 → datachain-0.6.6}/docs/assets/captioned_cartoons.png +0 -0
  37. {datachain-0.6.4 → datachain-0.6.6}/docs/assets/datachain-white.svg +0 -0
  38. {datachain-0.6.4 → datachain-0.6.6}/docs/assets/datachain.svg +0 -0
  39. {datachain-0.6.4 → datachain-0.6.6}/docs/assets/flowchart.png +0 -0
  40. {datachain-0.6.4 → datachain-0.6.6}/docs/index.md +0 -0
  41. {datachain-0.6.4 → datachain-0.6.6}/docs/references/datachain.md +0 -0
  42. {datachain-0.6.4 → datachain-0.6.6}/docs/references/datatype.md +0 -0
  43. {datachain-0.6.4 → datachain-0.6.6}/docs/references/file.md +0 -0
  44. {datachain-0.6.4 → datachain-0.6.6}/docs/references/index.md +0 -0
  45. {datachain-0.6.4 → datachain-0.6.6}/docs/references/sql.md +0 -0
  46. {datachain-0.6.4 → datachain-0.6.6}/docs/references/torch.md +0 -0
  47. {datachain-0.6.4 → datachain-0.6.6}/docs/references/udf.md +0 -0
  48. {datachain-0.6.4 → datachain-0.6.6}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  49. {datachain-0.6.4 → datachain-0.6.6}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  50. {datachain-0.6.4 → datachain-0.6.6}/examples/computer_vision/openimage-detect.py +0 -0
  51. {datachain-0.6.4 → datachain-0.6.6}/examples/get_started/common_sql_functions.py +0 -0
  52. {datachain-0.6.4 → datachain-0.6.6}/examples/get_started/json-csv-reader.py +0 -0
  53. {datachain-0.6.4 → datachain-0.6.6}/examples/get_started/torch-loader.py +0 -0
  54. {datachain-0.6.4 → datachain-0.6.6}/examples/get_started/udfs/parallel.py +0 -0
  55. {datachain-0.6.4 → datachain-0.6.6}/examples/get_started/udfs/simple.py +0 -0
  56. {datachain-0.6.4 → datachain-0.6.6}/examples/get_started/udfs/stateful.py +0 -0
  57. {datachain-0.6.4 → datachain-0.6.6}/examples/llm_and_nlp/claude-query.py +0 -0
  58. {datachain-0.6.4 → datachain-0.6.6}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  59. {datachain-0.6.4 → datachain-0.6.6}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  60. {datachain-0.6.4 → datachain-0.6.6}/examples/multimodal/clip_inference.py +0 -0
  61. {datachain-0.6.4 → datachain-0.6.6}/examples/multimodal/hf_pipeline.py +0 -0
  62. {datachain-0.6.4 → datachain-0.6.6}/examples/multimodal/openai_image_desc_lib.py +0 -0
  63. {datachain-0.6.4 → datachain-0.6.6}/examples/multimodal/wds.py +0 -0
  64. {datachain-0.6.4 → datachain-0.6.6}/examples/multimodal/wds_filtered.py +0 -0
  65. {datachain-0.6.4 → datachain-0.6.6}/mkdocs.yml +0 -0
  66. {datachain-0.6.4 → datachain-0.6.6}/noxfile.py +0 -0
  67. {datachain-0.6.4 → datachain-0.6.6}/pyproject.toml +0 -0
  68. {datachain-0.6.4 → datachain-0.6.6}/setup.cfg +0 -0
  69. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/__init__.py +0 -0
  70. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/__main__.py +0 -0
  71. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/asyn.py +0 -0
  72. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/cache.py +0 -0
  73. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/catalog/__init__.py +0 -0
  74. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/catalog/catalog.py +0 -0
  75. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/catalog/datasource.py +0 -0
  76. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/catalog/loader.py +0 -0
  77. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/cli.py +0 -0
  78. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/cli_utils.py +0 -0
  79. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/client/__init__.py +0 -0
  80. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/client/azure.py +0 -0
  81. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/client/fileslice.py +0 -0
  82. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/client/fsspec.py +0 -0
  83. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/client/gcs.py +0 -0
  84. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/client/local.py +0 -0
  85. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/client/s3.py +0 -0
  86. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/config.py +0 -0
  87. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/data_storage/__init__.py +0 -0
  88. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/data_storage/db_engine.py +0 -0
  89. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/data_storage/id_generator.py +0 -0
  90. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/data_storage/job.py +0 -0
  91. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/data_storage/metastore.py +0 -0
  92. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/data_storage/schema.py +0 -0
  93. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/data_storage/serializer.py +0 -0
  94. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/data_storage/sqlite.py +0 -0
  95. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/data_storage/warehouse.py +0 -0
  96. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/dataset.py +0 -0
  97. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/error.py +0 -0
  98. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/job.py +0 -0
  99. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/__init__.py +0 -0
  100. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/clip.py +0 -0
  101. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/convert/__init__.py +0 -0
  102. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/convert/flatten.py +0 -0
  103. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/convert/python_to_sql.py +0 -0
  104. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/convert/sql_to_python.py +0 -0
  105. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/convert/unflatten.py +0 -0
  106. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  107. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/data_model.py +0 -0
  108. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/dataset_info.py +0 -0
  109. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/file.py +0 -0
  110. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/func/__init__.py +0 -0
  111. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/func/aggregate.py +0 -0
  112. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/func/func.py +0 -0
  113. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/image.py +0 -0
  114. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/listing.py +0 -0
  115. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/listing_info.py +0 -0
  116. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/meta_formats.py +0 -0
  117. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/model_store.py +0 -0
  118. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/pytorch.py +0 -0
  119. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/settings.py +0 -0
  120. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/signal_schema.py +0 -0
  121. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/tar.py +0 -0
  122. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/text.py +0 -0
  123. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/udf.py +0 -0
  124. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/udf_signature.py +0 -0
  125. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/utils.py +0 -0
  126. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/vfile.py +0 -0
  127. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/webdataset.py +0 -0
  128. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/lib/webdataset_laion.py +0 -0
  129. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/listing.py +0 -0
  130. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/node.py +0 -0
  131. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/nodes_fetcher.py +0 -0
  132. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/nodes_thread_pool.py +0 -0
  133. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/progress.py +0 -0
  134. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/py.typed +0 -0
  135. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/query/__init__.py +0 -0
  136. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/query/batch.py +0 -0
  137. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/query/dispatch.py +0 -0
  138. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/query/metrics.py +0 -0
  139. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/query/params.py +0 -0
  140. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/query/queue.py +0 -0
  141. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/query/schema.py +0 -0
  142. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/query/session.py +0 -0
  143. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/remote/__init__.py +0 -0
  144. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/remote/studio.py +0 -0
  145. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/__init__.py +0 -0
  146. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/default/__init__.py +0 -0
  147. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/default/base.py +0 -0
  148. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/functions/__init__.py +0 -0
  149. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/functions/aggregate.py +0 -0
  150. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/functions/array.py +0 -0
  151. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/functions/conditional.py +0 -0
  152. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/functions/path.py +0 -0
  153. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/functions/random.py +0 -0
  154. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/functions/string.py +0 -0
  155. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/selectable.py +0 -0
  156. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/sqlite/__init__.py +0 -0
  157. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/sqlite/base.py +0 -0
  158. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/sqlite/types.py +0 -0
  159. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/sqlite/vector.py +0 -0
  160. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/sql/utils.py +0 -0
  161. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/studio.py +0 -0
  162. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/telemetry.py +0 -0
  163. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/torch/__init__.py +0 -0
  164. {datachain-0.6.4 → datachain-0.6.6}/src/datachain/utils.py +0 -0
  165. {datachain-0.6.4 → datachain-0.6.6}/src/datachain.egg-info/dependency_links.txt +0 -0
  166. {datachain-0.6.4 → datachain-0.6.6}/src/datachain.egg-info/entry_points.txt +0 -0
  167. {datachain-0.6.4 → datachain-0.6.6}/src/datachain.egg-info/requires.txt +0 -0
  168. {datachain-0.6.4 → datachain-0.6.6}/src/datachain.egg-info/top_level.txt +0 -0
  169. {datachain-0.6.4 → datachain-0.6.6}/tests/__init__.py +0 -0
  170. {datachain-0.6.4 → datachain-0.6.6}/tests/benchmarks/__init__.py +0 -0
  171. {datachain-0.6.4 → datachain-0.6.6}/tests/benchmarks/conftest.py +0 -0
  172. {datachain-0.6.4 → datachain-0.6.6}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  173. {datachain-0.6.4 → datachain-0.6.6}/tests/benchmarks/datasets/.dvc/config +0 -0
  174. {datachain-0.6.4 → datachain-0.6.6}/tests/benchmarks/datasets/.gitignore +0 -0
  175. {datachain-0.6.4 → datachain-0.6.6}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  176. {datachain-0.6.4 → datachain-0.6.6}/tests/benchmarks/test_datachain.py +0 -0
  177. {datachain-0.6.4 → datachain-0.6.6}/tests/benchmarks/test_ls.py +0 -0
  178. {datachain-0.6.4 → datachain-0.6.6}/tests/benchmarks/test_version.py +0 -0
  179. {datachain-0.6.4 → datachain-0.6.6}/tests/conftest.py +0 -0
  180. {datachain-0.6.4 → datachain-0.6.6}/tests/data.py +0 -0
  181. {datachain-0.6.4 → datachain-0.6.6}/tests/examples/__init__.py +0 -0
  182. {datachain-0.6.4 → datachain-0.6.6}/tests/examples/test_wds_e2e.py +0 -0
  183. {datachain-0.6.4 → datachain-0.6.6}/tests/examples/wds_data.py +0 -0
  184. {datachain-0.6.4 → datachain-0.6.6}/tests/func/__init__.py +0 -0
  185. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_catalog.py +0 -0
  186. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_client.py +0 -0
  187. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_dataset_query.py +0 -0
  188. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_datasets.py +0 -0
  189. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_feature_pickling.py +0 -0
  190. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_ls.py +0 -0
  191. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_meta_formats.py +0 -0
  192. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_metrics.py +0 -0
  193. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_pull.py +0 -0
  194. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_pytorch.py +0 -0
  195. {datachain-0.6.4 → datachain-0.6.6}/tests/func/test_query.py +0 -0
  196. {datachain-0.6.4 → datachain-0.6.6}/tests/scripts/feature_class.py +0 -0
  197. {datachain-0.6.4 → datachain-0.6.6}/tests/scripts/feature_class_exception.py +0 -0
  198. {datachain-0.6.4 → datachain-0.6.6}/tests/scripts/feature_class_parallel.py +0 -0
  199. {datachain-0.6.4 → datachain-0.6.6}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  200. {datachain-0.6.4 → datachain-0.6.6}/tests/scripts/name_len_slow.py +0 -0
  201. {datachain-0.6.4 → datachain-0.6.6}/tests/test_atomicity.py +0 -0
  202. {datachain-0.6.4 → datachain-0.6.6}/tests/test_cli_e2e.py +0 -0
  203. {datachain-0.6.4 → datachain-0.6.6}/tests/test_cli_studio.py +0 -0
  204. {datachain-0.6.4 → datachain-0.6.6}/tests/test_query_e2e.py +0 -0
  205. {datachain-0.6.4 → datachain-0.6.6}/tests/test_telemetry.py +0 -0
  206. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/__init__.py +0 -0
  207. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/__init__.py +0 -0
  208. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/conftest.py +0 -0
  209. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_arrow.py +0 -0
  210. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_clip.py +0 -0
  211. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  212. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_datachain_merge.py +0 -0
  213. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_feature.py +0 -0
  214. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_feature_utils.py +0 -0
  215. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_file.py +0 -0
  216. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_hf.py +0 -0
  217. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_image.py +0 -0
  218. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_listing_info.py +0 -0
  219. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_schema.py +0 -0
  220. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_sql_to_python.py +0 -0
  221. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_text.py +0 -0
  222. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_udf_signature.py +0 -0
  223. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_utils.py +0 -0
  224. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/lib/test_webdataset.py +0 -0
  225. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/sql/__init__.py +0 -0
  226. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/sql/sqlite/__init__.py +0 -0
  227. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/sql/sqlite/test_utils.py +0 -0
  228. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/sql/test_array.py +0 -0
  229. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/sql/test_conditional.py +0 -0
  230. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/sql/test_path.py +0 -0
  231. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/sql/test_random.py +0 -0
  232. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/sql/test_selectable.py +0 -0
  233. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/sql/test_string.py +0 -0
  234. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_asyn.py +0 -0
  235. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_cache.py +0 -0
  236. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_catalog.py +0 -0
  237. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_catalog_loader.py +0 -0
  238. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_cli_parsing.py +0 -0
  239. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_client.py +0 -0
  240. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_client_s3.py +0 -0
  241. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_config.py +0 -0
  242. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_database_engine.py +0 -0
  243. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_dataset.py +0 -0
  244. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_dispatch.py +0 -0
  245. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_fileslice.py +0 -0
  246. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_id_generator.py +0 -0
  247. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_listing.py +0 -0
  248. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_metastore.py +0 -0
  249. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_module_exports.py +0 -0
  250. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_query.py +0 -0
  251. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_query_metrics.py +0 -0
  252. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_query_params.py +0 -0
  253. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_serializer.py +0 -0
  254. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_session.py +0 -0
  255. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_utils.py +0 -0
  256. {datachain-0.6.4 → datachain-0.6.6}/tests/unit/test_warehouse.py +0 -0
  257. {datachain-0.6.4 → datachain-0.6.6}/tests/utils.py +0 -0
@@ -152,4 +152,6 @@ jobs:
152
152
  run: uv pip install nox --system
153
153
 
154
154
  - name: Run examples
155
+ env:
156
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
155
157
  run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.7.1'
27
+ rev: 'v0.7.2'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.4
3
+ Version: 0.6.6
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -0,0 +1,59 @@
1
+ from huggingface_hub import InferenceClient
2
+
3
+ from datachain import C, DataChain, DataModel
4
+
5
+ PROMPT = """
6
+ Was this dialog successful? Put result as a single word: Success or Failure.
7
+ Explain the reason in a few words.
8
+ """
9
+
10
+
11
+ class DialogEval(DataModel):
12
+ result: str
13
+ reason: str
14
+
15
+
16
+ # DataChain function to evaluate dialog.
17
+ # DataChain is using types for inputs, results to automatically infer schema.
18
+ def eval_dialog(user_input: str, bot_response: str) -> DialogEval:
19
+ client = InferenceClient("meta-llama/Llama-3.1-70B-Instruct")
20
+
21
+ completion = client.chat_completion(
22
+ messages=[
23
+ {
24
+ "role": "user",
25
+ "content": f"{PROMPT}\n\nUser: {user_input}\nBot: {bot_response}",
26
+ },
27
+ ],
28
+ response_format={"type": "json", "value": DialogEval.model_json_schema()},
29
+ )
30
+
31
+ message = completion.choices[0].message
32
+ try:
33
+ return DialogEval.model_validate_json(message.content)
34
+ except ValueError:
35
+ return DialogEval(result="Error", reason="Failed to parse response.")
36
+
37
+
38
+ # Run HF inference in parallel for each example.
39
+ # Get result as Pydantic model that DataChain can understand and serialize it.
40
+ # Save to HF as Parquet. Dataset can be previewed here:
41
+ # https://huggingface.co/datasets/dvcorg/test-datachain-llm-eval/viewer
42
+ (
43
+ DataChain.from_csv(
44
+ "hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv"
45
+ )
46
+ .settings(parallel=10)
47
+ .map(response=eval_dialog)
48
+ .to_parquet("hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet")
49
+ )
50
+
51
+ # Read it back to filter and show.
52
+ # It restores the Pydantic model from Parquet under the hood.
53
+ (
54
+ DataChain.from_parquet(
55
+ "hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet", source=False
56
+ )
57
+ .filter(C("response.result") == "Failure")
58
+ .show(3)
59
+ )
@@ -4,6 +4,7 @@
4
4
 
5
5
  {{ super() }}
6
6
 
7
+ <script data-domain="docs.datachain.ai" src="https://plausible.io/js/script.outbound-links.js"></script>
7
8
  <script type="text/javascript">
8
9
  !function () { var e, t, n; e = "14ffd92a6cbf5f2", t = function () { Reo.init({ clientID: "14ffd92a6cbf5f2" }) }, (n = document.createElement("script")).src = "https://static.reo.dev/" + e + "/reo.js", n.async = !0, n.onload = t, document.head.appendChild(n) }();
9
10
  </script>
@@ -23,6 +23,7 @@ class HfClient(Client):
23
23
 
24
24
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
25
25
  return File(
26
+ source=self.uri,
26
27
  path=path,
27
28
  size=v["size"],
28
29
  version=v["last_commit"].oid,
@@ -175,7 +175,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
175
175
  return dict
176
176
  if isinstance(col_type, pa.lib.DictionaryType):
177
177
  return arrow_type_mapper(col_type.value_type) # type: ignore[return-value]
178
- raise TypeError(f"{col_type!r} datatypes not supported")
178
+ raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
179
179
 
180
180
 
181
181
  def _nrows_file(file: File, nrows: int) -> str:
@@ -981,10 +981,23 @@ class DataChain:
981
981
 
982
982
  @resolve_columns
983
983
  def order_by(self, *args, descending: bool = False) -> "Self":
984
- """Orders by specified set of signals.
984
+ """Orders by specified set of columns.
985
985
 
986
986
  Parameters:
987
987
  descending (bool): Whether to sort in descending order or not.
988
+
989
+ Example:
990
+ ```py
991
+ dc.order_by("similarity_score", descending=True).limit(10)
992
+ ```
993
+
994
+ Note:
995
+ Order is not guaranteed when steps are added after an `order_by` statement.
996
+ I.e. when using `from_dataset` an `order_by` statement should be used if
997
+ the order of the records in the chain is important.
998
+ Using `order_by` directly before `limit`, `collect` and `collect_flatten`
999
+ will give expected results.
1000
+ See https://github.com/iterative/datachain/issues/477 for further details.
988
1001
  """
989
1002
  if descending:
990
1003
  args = tuple(sqlalchemy.desc(a) for a in args)
@@ -1179,7 +1192,7 @@ class DataChain:
1179
1192
  a tuple of row values.
1180
1193
  """
1181
1194
  db_signals = self._effective_signals_schema.db_signals()
1182
- with self._query.select(*db_signals).as_iterable() as rows:
1195
+ with self._query.ordered_select(*db_signals).as_iterable() as rows:
1183
1196
  if row_factory:
1184
1197
  rows = (row_factory(db_signals, r) for r in rows)
1185
1198
  yield from rows
@@ -1270,7 +1283,7 @@ class DataChain:
1270
1283
  chain = self.select(*cols) if cols else self
1271
1284
  signals_schema = chain._effective_signals_schema
1272
1285
  db_signals = signals_schema.db_signals()
1273
- with self._query.select(*db_signals).as_iterable() as rows:
1286
+ with self._query.ordered_select(*db_signals).as_iterable() as rows:
1274
1287
  for row in rows:
1275
1288
  ret = signals_schema.row_to_features(
1276
1289
  row, catalog=chain.session.catalog, cache=chain._settings.cache
@@ -1678,7 +1691,7 @@ class DataChain:
1678
1691
 
1679
1692
  model_name = model_name or object_name or ""
1680
1693
  hf_features = next(iter(ds_dict.values())).features
1681
- output = output | get_output_schema(hf_features, model_name)
1694
+ output = output | get_output_schema(hf_features)
1682
1695
  model = dict_to_data_model(model_name, output)
1683
1696
  if object_name:
1684
1697
  output = {object_name: model}
@@ -138,17 +138,15 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any: # noqa: PLR0911
138
138
  return HFAudio(**val)
139
139
 
140
140
 
141
- def get_output_schema(
142
- features: Features, model_name: str = "", stream: bool = True
143
- ) -> dict[str, DataType]:
141
+ def get_output_schema(features: Features) -> dict[str, DataType]:
144
142
  """Generate UDF output schema from huggingface datasets features."""
145
143
  fields_dict = {}
146
144
  for name, val in features.items():
147
- fields_dict[name] = _feature_to_chain_type(name, val) # type: ignore[assignment]
148
- return fields_dict # type: ignore[return-value]
145
+ fields_dict[name] = _feature_to_chain_type(name, val)
146
+ return fields_dict
149
147
 
150
148
 
151
- def _feature_to_chain_type(name: str, val: Any) -> type: # noqa: PLR0911
149
+ def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
152
150
  if isinstance(val, Value):
153
151
  return arrow_type_mapper(val.pa_type)
154
152
  if isinstance(val, ClassLabel):
@@ -1276,6 +1276,27 @@ class DatasetQuery:
1276
1276
  query.steps.append(SQLSelect((*args, *named_args)))
1277
1277
  return query
1278
1278
 
1279
+ @detach
1280
+ def ordered_select(self, *args, **kwargs) -> "Self":
1281
+ """
1282
+ Select the given columns or expressions using a subquery whilst
1283
+ maintaining query ordering (only applicable if last step was order_by).
1284
+
1285
+ If used with no arguments, this simply creates a subquery and
1286
+ select all columns from it.
1287
+
1288
+ Example:
1289
+ >>> ds.ordered_select(C.name, C.size * 10)
1290
+ >>> ds.ordered_select(C.name, size10x=C.size * 10)
1291
+ """
1292
+ named_args = [v.label(k) for k, v in kwargs.items()]
1293
+ query = self.clone()
1294
+ order_by = query.last_step if query.is_ordered else None
1295
+ query.steps.append(SQLSelect((*args, *named_args)))
1296
+ if order_by:
1297
+ query.steps.append(order_by)
1298
+ return query
1299
+
1279
1300
  @detach
1280
1301
  def select_except(self, *args) -> "Self":
1281
1302
  """
@@ -1338,7 +1359,7 @@ class DatasetQuery:
1338
1359
  query = self.clone(new_table=False)
1339
1360
  if (
1340
1361
  query.steps
1341
- and (last_step := query.steps[-1])
1362
+ and (last_step := query.last_step)
1342
1363
  and isinstance(last_step, SQLLimit)
1343
1364
  ):
1344
1365
  query.steps[-1] = SQLLimit(min(n, last_step.n))
@@ -1591,3 +1612,11 @@ class DatasetQuery:
1591
1612
  finally:
1592
1613
  self.cleanup()
1593
1614
  return self.__class__(name=name, version=version, catalog=self.catalog)
1615
+
1616
+ @property
1617
+ def is_ordered(self) -> bool:
1618
+ return isinstance(self.last_step, SQLOrderBy)
1619
+
1620
+ @property
1621
+ def last_step(self) -> Optional[Step]:
1622
+ return self.steps[-1] if self.steps else None
@@ -187,6 +187,22 @@ class Int32(Int):
187
187
  return read_converter(dialect).int32(value)
188
188
 
189
189
 
190
+ class UInt32(Int):
191
+ def load_dialect_impl(self, dialect):
192
+ return converter(dialect).uint32()
193
+
194
+ @staticmethod
195
+ def default_value(dialect):
196
+ return type_defaults(dialect).uint32()
197
+
198
+ @staticmethod
199
+ def db_default_value(dialect):
200
+ return db_defaults(dialect).uint32()
201
+
202
+ def on_read_convert(self, value, dialect):
203
+ return read_converter(dialect).uint32(value)
204
+
205
+
190
206
  class Int64(Int):
191
207
  def load_dialect_impl(self, dialect):
192
208
  return converter(dialect).int64()
@@ -395,6 +411,9 @@ class TypeReadConverter:
395
411
  def int32(self, value):
396
412
  return value
397
413
 
414
+ def uint32(self, value):
415
+ return value
416
+
398
417
  def int64(self, value):
399
418
  return value
400
419
 
@@ -446,6 +465,9 @@ class TypeConverter:
446
465
  def int32(self):
447
466
  return self.int()
448
467
 
468
+ def uint32(self):
469
+ return self.int()
470
+
449
471
  def int64(self):
450
472
  return self.int()
451
473
 
@@ -487,6 +509,9 @@ class TypeDefaults:
487
509
  def int32(self):
488
510
  return None
489
511
 
512
+ def uint32(self):
513
+ return None
514
+
490
515
  def int64(self):
491
516
  return None
492
517
 
@@ -528,6 +553,9 @@ class DBDefaults:
528
553
  def int32(self):
529
554
  return self.int()
530
555
 
556
+ def uint32(self):
557
+ return self.int()
558
+
531
559
  def int64(self):
532
560
  return self.int()
533
561
 
@@ -561,6 +589,7 @@ TYPES = [
561
589
  Boolean,
562
590
  Int,
563
591
  Int32,
592
+ UInt32,
564
593
  Int64,
565
594
  UInt64,
566
595
  Float,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.4
3
+ Version: 0.6.6
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -41,6 +41,7 @@ examples/get_started/udfs/parallel.py
41
41
  examples/get_started/udfs/simple.py
42
42
  examples/get_started/udfs/stateful.py
43
43
  examples/llm_and_nlp/claude-query.py
44
+ examples/llm_and_nlp/hf-dataset-llm-eval.py
44
45
  examples/llm_and_nlp/unstructured-embeddings-gen.py
45
46
  examples/llm_and_nlp/unstructured-summary-map.py
46
47
  examples/multimodal/clip_inference.py
@@ -19,8 +19,12 @@ llm_and_nlp_examples = sorted(
19
19
  [
20
20
  filename
21
21
  for filename in glob.glob("examples/llm_and_nlp/**/*.py", recursive=True)
22
- # no anthropic token
22
+ # no anthropic token, HF runs against actual API - thus run it only once
23
23
  if "claude" not in filename
24
+ and (
25
+ "hf-" not in filename
26
+ or (sys.platform == "darwin" and sys.version_info >= (3, 12))
27
+ )
24
28
  ]
25
29
  )
26
30
 
@@ -448,6 +448,30 @@ def test_show_no_truncate(capsys, test_session):
448
448
  assert details[i] in normalized_output
449
449
 
450
450
 
451
+ @pytest.mark.parametrize("ordered_by", ["letter", "number"])
452
+ def test_show_ordered(capsys, test_session, ordered_by):
453
+ numbers = [6, 2, 3, 1, 5, 7, 4]
454
+ letters = ["u", "y", "x", "z", "v", "t", "w"]
455
+
456
+ DataChain.from_values(
457
+ number=numbers, letter=letters, session=test_session
458
+ ).order_by(ordered_by).show()
459
+
460
+ captured = capsys.readouterr()
461
+ normalized_lines = [
462
+ re.sub(r"\s+", " ", line).strip() for line in captured.out.strip().split("\n")
463
+ ]
464
+
465
+ ordered_entries = sorted(
466
+ zip(numbers, letters), key=lambda x: x[0 if ordered_by == "number" else 1]
467
+ )
468
+
469
+ assert normalized_lines[0].strip() == "number letter"
470
+ for i, line in enumerate(normalized_lines[1:]):
471
+ number, letter = ordered_entries[i]
472
+ assert line == f"{i} {number} {letter}"
473
+
474
+
451
475
  def test_from_storage_dataset_stats(tmp_dir, test_session):
452
476
  for i in range(4):
453
477
  (tmp_dir / f"file{i}.txt").write_text(f"file{i}")
@@ -17,7 +17,7 @@ def test_listing_generator(cloud_test_catalog, cloud_type):
17
17
  entries = sorted(
18
18
  [e for e in ENTRIES if e.path.startswith("cats/")], key=lambda e: e.path
19
19
  )
20
- files = sorted(dc.collect("file"), key=lambda f: f.path)
20
+ files = dc.order_by("file.path").collect("file")
21
21
 
22
22
  for cat_file, cat_entry in zip(files, entries):
23
23
  assert cat_file.source == ctc.src_uri
@@ -1824,6 +1824,32 @@ def test_order_by_with_nested_columns(test_session, with_function):
1824
1824
  ]
1825
1825
 
1826
1826
 
1827
+ def test_order_by_collect(test_session):
1828
+ numbers = [6, 2, 3, 1, 5, 7, 4]
1829
+ letters = ["u", "y", "x", "z", "v", "t", "w"]
1830
+
1831
+ dc = DataChain.from_values(number=numbers, letter=letters, session=test_session)
1832
+ assert list(dc.order_by("number").collect()) == [
1833
+ (1, "z"),
1834
+ (2, "y"),
1835
+ (3, "x"),
1836
+ (4, "w"),
1837
+ (5, "v"),
1838
+ (6, "u"),
1839
+ (7, "t"),
1840
+ ]
1841
+
1842
+ assert list(dc.order_by("letter").collect()) == [
1843
+ (7, "t"),
1844
+ (6, "u"),
1845
+ (5, "v"),
1846
+ (4, "w"),
1847
+ (3, "x"),
1848
+ (2, "y"),
1849
+ (1, "z"),
1850
+ ]
1851
+
1852
+
1827
1853
  @pytest.mark.parametrize("with_function", [True, False])
1828
1854
  def test_order_by_descending(test_session, with_function):
1829
1855
  names = ["a.txt", "c.txt", "d.txt", "a.txt", "b.txt"]
@@ -1852,7 +1878,7 @@ def test_union(test_session):
1852
1878
  chain2 = DataChain.from_values(value=[3, 4], session=test_session)
1853
1879
  chain3 = chain1 | chain2
1854
1880
  assert chain3.count() == 4
1855
- assert sorted(chain3.collect("value")) == [1, 2, 3, 4]
1881
+ assert list(chain3.order_by("value").collect("value")) == [1, 2, 3, 4]
1856
1882
 
1857
1883
 
1858
1884
  def test_union_different_columns(test_session):
@@ -1887,7 +1913,7 @@ def test_union_different_column_order(test_session):
1887
1913
  chain2 = DataChain.from_values(
1888
1914
  name=["different", "order"], value=[9, 10], session=test_session
1889
1915
  )
1890
- assert sorted(chain1.union(chain2).collect()) == [
1916
+ assert list(chain1.union(chain2).order_by("value").collect()) == [
1891
1917
  (1, "chain"),
1892
1918
  (2, "more"),
1893
1919
  (9, "different"),
@@ -27,6 +27,7 @@ from datachain.sql.types import (
27
27
  Int32,
28
28
  Int64,
29
29
  String,
30
+ UInt32,
30
31
  UInt64,
31
32
  )
32
33
 
@@ -721,6 +722,7 @@ def test_mutate_change_type():
721
722
  [Boolean, bool],
722
723
  [Int, int],
723
724
  [Int32, int],
725
+ [UInt32, int],
724
726
  [Int64, int],
725
727
  [UInt64, int],
726
728
  [Float, float],
@@ -18,6 +18,7 @@ from datachain.sql.types import (
18
18
  Int32,
19
19
  Int64,
20
20
  String,
21
+ UInt32,
21
22
  UInt64,
22
23
  )
23
24
  from tests.utils import (
@@ -173,6 +174,7 @@ def test_convert_type(cloud_test_catalog):
173
174
  [Boolean(), False],
174
175
  [Int(), 0],
175
176
  [Int32(), 0],
177
+ [UInt32(), 0],
176
178
  [Int64(), 0],
177
179
  [UInt64(), 0],
178
180
  [Float(), lambda val: math.isnan(val)],
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes