datachain 0.3.8.tar.gz → 0.3.10.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

Files changed (245)
  1. {datachain-0.3.8 → datachain-0.3.10}/.github/workflows/tests.yml +1 -1
  2. {datachain-0.3.8 → datachain-0.3.10}/.pre-commit-config.yaml +1 -1
  3. {datachain-0.3.8/src/datachain.egg-info → datachain-0.3.10}/PKG-INFO +14 -14
  4. {datachain-0.3.8 → datachain-0.3.10}/README.rst +11 -12
  5. {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/udfs/parallel.py +1 -1
  6. {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/udfs/simple.py +1 -1
  7. {datachain-0.3.8 → datachain-0.3.10}/examples/llm_and_nlp/unstructured-text.py +1 -1
  8. {datachain-0.3.8 → datachain-0.3.10}/examples/multimodal/wds_filtered.py +1 -3
  9. {datachain-0.3.8 → datachain-0.3.10}/pyproject.toml +3 -2
  10. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/catalog/catalog.py +13 -91
  11. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/cli.py +6 -38
  12. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/fsspec.py +3 -0
  13. datachain-0.3.10/src/datachain/client/hf.py +47 -0
  14. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/metastore.py +2 -29
  15. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/sqlite.py +3 -12
  16. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/warehouse.py +20 -29
  17. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/dataset.py +44 -32
  18. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/arrow.py +22 -6
  19. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/dataset_info.py +4 -0
  20. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/dc.py +149 -35
  21. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/file.py +10 -33
  22. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/hf.py +2 -1
  23. datachain-0.3.10/src/datachain/lib/listing.py +119 -0
  24. datachain-0.3.10/src/datachain/lib/listing_info.py +32 -0
  25. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/meta_formats.py +4 -4
  26. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/signal_schema.py +5 -2
  27. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/webdataset.py +1 -1
  28. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/node.py +13 -0
  29. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/dataset.py +25 -87
  30. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/metrics.py +8 -0
  31. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/utils.py +5 -0
  32. {datachain-0.3.8 → datachain-0.3.10/src/datachain.egg-info}/PKG-INFO +14 -14
  33. {datachain-0.3.8 → datachain-0.3.10}/src/datachain.egg-info/SOURCES.txt +3 -0
  34. {datachain-0.3.8 → datachain-0.3.10}/src/datachain.egg-info/requires.txt +2 -1
  35. {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/test_datachain.py +4 -6
  36. {datachain-0.3.8 → datachain-0.3.10}/tests/conftest.py +4 -0
  37. {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_catalog.py +49 -24
  38. {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_datachain.py +147 -11
  39. {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_dataset_query.py +20 -4
  40. {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_datasets.py +18 -13
  41. {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_feature_pickling.py +21 -16
  42. {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_ls.py +7 -4
  43. datachain-0.3.10/tests/func/test_metrics.py +14 -0
  44. {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_query.py +15 -23
  45. {datachain-0.3.8 → datachain-0.3.10}/tests/scripts/feature_class.py +2 -2
  46. {datachain-0.3.8 → datachain-0.3.10}/tests/scripts/feature_class_parallel.py +1 -1
  47. {datachain-0.3.8 → datachain-0.3.10}/tests/scripts/feature_class_parallel_data_model.py +1 -1
  48. {datachain-0.3.8 → datachain-0.3.10}/tests/test_query_e2e.py +5 -4
  49. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_arrow.py +38 -1
  50. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_datachain.py +95 -4
  51. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_signal_schema.py +20 -3
  52. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_dataset.py +28 -0
  53. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_listing.py +86 -0
  54. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_storage.py +0 -34
  55. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_utils.py +17 -0
  56. datachain-0.3.8/src/datachain/lib/listing.py +0 -111
  57. {datachain-0.3.8 → datachain-0.3.10}/.cruft.json +0 -0
  58. {datachain-0.3.8 → datachain-0.3.10}/.gitattributes +0 -0
  59. {datachain-0.3.8 → datachain-0.3.10}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  60. {datachain-0.3.8 → datachain-0.3.10}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  61. {datachain-0.3.8 → datachain-0.3.10}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  62. {datachain-0.3.8 → datachain-0.3.10}/.github/codecov.yaml +0 -0
  63. {datachain-0.3.8 → datachain-0.3.10}/.github/dependabot.yml +0 -0
  64. {datachain-0.3.8 → datachain-0.3.10}/.github/workflows/benchmarks.yml +0 -0
  65. {datachain-0.3.8 → datachain-0.3.10}/.github/workflows/release.yml +0 -0
  66. {datachain-0.3.8 → datachain-0.3.10}/.github/workflows/tests-studio.yml +0 -0
  67. {datachain-0.3.8 → datachain-0.3.10}/.github/workflows/update-template.yaml +0 -0
  68. {datachain-0.3.8 → datachain-0.3.10}/.gitignore +0 -0
  69. {datachain-0.3.8 → datachain-0.3.10}/CODE_OF_CONDUCT.rst +0 -0
  70. {datachain-0.3.8 → datachain-0.3.10}/CONTRIBUTING.rst +0 -0
  71. {datachain-0.3.8 → datachain-0.3.10}/LICENSE +0 -0
  72. {datachain-0.3.8 → datachain-0.3.10}/docs/assets/captioned_cartoons.png +0 -0
  73. {datachain-0.3.8 → datachain-0.3.10}/docs/assets/datachain.png +0 -0
  74. {datachain-0.3.8 → datachain-0.3.10}/docs/assets/flowchart.png +0 -0
  75. {datachain-0.3.8 → datachain-0.3.10}/docs/index.md +0 -0
  76. {datachain-0.3.8 → datachain-0.3.10}/docs/references/datachain.md +0 -0
  77. {datachain-0.3.8 → datachain-0.3.10}/docs/references/datatype.md +0 -0
  78. {datachain-0.3.8 → datachain-0.3.10}/docs/references/file.md +0 -0
  79. {datachain-0.3.8 → datachain-0.3.10}/docs/references/index.md +0 -0
  80. {datachain-0.3.8 → datachain-0.3.10}/docs/references/sql.md +0 -0
  81. {datachain-0.3.8 → datachain-0.3.10}/docs/references/torch.md +0 -0
  82. {datachain-0.3.8 → datachain-0.3.10}/docs/references/udf.md +0 -0
  83. {datachain-0.3.8 → datachain-0.3.10}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  84. {datachain-0.3.8 → datachain-0.3.10}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  85. {datachain-0.3.8 → datachain-0.3.10}/examples/computer_vision/openimage-detect.py +0 -0
  86. {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/common_sql_functions.py +0 -0
  87. {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/json-csv-reader.py +0 -0
  88. {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/torch-loader.py +0 -0
  89. {datachain-0.3.8 → datachain-0.3.10}/examples/get_started/udfs/stateful.py +0 -0
  90. {datachain-0.3.8 → datachain-0.3.10}/examples/llm_and_nlp/claude-query.py +0 -0
  91. {datachain-0.3.8 → datachain-0.3.10}/examples/multimodal/clip_inference.py +0 -0
  92. {datachain-0.3.8 → datachain-0.3.10}/examples/multimodal/hf_pipeline.py +0 -0
  93. {datachain-0.3.8 → datachain-0.3.10}/examples/multimodal/openai_image_desc_lib.py +0 -0
  94. {datachain-0.3.8 → datachain-0.3.10}/examples/multimodal/wds.py +0 -0
  95. {datachain-0.3.8 → datachain-0.3.10}/mkdocs.yml +0 -0
  96. {datachain-0.3.8 → datachain-0.3.10}/noxfile.py +0 -0
  97. {datachain-0.3.8 → datachain-0.3.10}/setup.cfg +0 -0
  98. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/__init__.py +0 -0
  99. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/__main__.py +0 -0
  100. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/asyn.py +0 -0
  101. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/cache.py +0 -0
  102. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/catalog/__init__.py +0 -0
  103. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/catalog/datasource.py +0 -0
  104. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/catalog/loader.py +0 -0
  105. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/catalog/subclass.py +0 -0
  106. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/cli_utils.py +0 -0
  107. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/__init__.py +0 -0
  108. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/azure.py +0 -0
  109. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/fileslice.py +0 -0
  110. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/gcs.py +0 -0
  111. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/local.py +0 -0
  112. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/s3.py +0 -0
  113. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/config.py +0 -0
  114. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/__init__.py +0 -0
  115. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/db_engine.py +0 -0
  116. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/id_generator.py +0 -0
  117. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/job.py +0 -0
  118. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/schema.py +0 -0
  119. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/data_storage/serializer.py +0 -0
  120. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/error.py +0 -0
  121. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/job.py +0 -0
  122. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/__init__.py +0 -0
  123. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/clip.py +0 -0
  124. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/__init__.py +0 -0
  125. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/flatten.py +0 -0
  126. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/python_to_sql.py +0 -0
  127. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/sql_to_python.py +0 -0
  128. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/unflatten.py +0 -0
  129. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  130. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/data_model.py +0 -0
  131. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/image.py +0 -0
  132. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/model_store.py +0 -0
  133. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/pytorch.py +0 -0
  134. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/settings.py +0 -0
  135. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/text.py +0 -0
  136. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/udf.py +0 -0
  137. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/udf_signature.py +0 -0
  138. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/utils.py +0 -0
  139. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/vfile.py +0 -0
  140. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/lib/webdataset_laion.py +0 -0
  141. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/listing.py +0 -0
  142. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/nodes_fetcher.py +0 -0
  143. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/nodes_thread_pool.py +0 -0
  144. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/progress.py +0 -0
  145. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/py.typed +0 -0
  146. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/__init__.py +0 -0
  147. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/batch.py +0 -0
  148. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/builtins.py +0 -0
  149. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/dispatch.py +0 -0
  150. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/params.py +0 -0
  151. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/queue.py +0 -0
  152. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/schema.py +0 -0
  153. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/session.py +0 -0
  154. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/query/udf.py +0 -0
  155. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/remote/__init__.py +0 -0
  156. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/remote/studio.py +0 -0
  157. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/__init__.py +0 -0
  158. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/default/__init__.py +0 -0
  159. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/default/base.py +0 -0
  160. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/__init__.py +0 -0
  161. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/array.py +0 -0
  162. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/conditional.py +0 -0
  163. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/path.py +0 -0
  164. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/random.py +0 -0
  165. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/functions/string.py +0 -0
  166. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/selectable.py +0 -0
  167. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/sqlite/__init__.py +0 -0
  168. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/sqlite/base.py +0 -0
  169. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/sqlite/types.py +0 -0
  170. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/sqlite/vector.py +0 -0
  171. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/types.py +0 -0
  172. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/sql/utils.py +0 -0
  173. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/storage.py +0 -0
  174. {datachain-0.3.8 → datachain-0.3.10}/src/datachain/torch/__init__.py +0 -0
  175. {datachain-0.3.8 → datachain-0.3.10}/src/datachain.egg-info/dependency_links.txt +0 -0
  176. {datachain-0.3.8 → datachain-0.3.10}/src/datachain.egg-info/entry_points.txt +0 -0
  177. {datachain-0.3.8 → datachain-0.3.10}/src/datachain.egg-info/top_level.txt +0 -0
  178. {datachain-0.3.8 → datachain-0.3.10}/tests/__init__.py +0 -0
  179. {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/__init__.py +0 -0
  180. {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/conftest.py +0 -0
  181. {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  182. {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/datasets/.dvc/config +0 -0
  183. {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/datasets/.gitignore +0 -0
  184. {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  185. {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/test_ls.py +0 -0
  186. {datachain-0.3.8 → datachain-0.3.10}/tests/benchmarks/test_version.py +0 -0
  187. {datachain-0.3.8 → datachain-0.3.10}/tests/data.py +0 -0
  188. {datachain-0.3.8 → datachain-0.3.10}/tests/examples/__init__.py +0 -0
  189. {datachain-0.3.8 → datachain-0.3.10}/tests/examples/test_examples.py +0 -0
  190. {datachain-0.3.8 → datachain-0.3.10}/tests/examples/test_wds_e2e.py +0 -0
  191. {datachain-0.3.8 → datachain-0.3.10}/tests/examples/wds_data.py +0 -0
  192. {datachain-0.3.8 → datachain-0.3.10}/tests/func/__init__.py +0 -0
  193. {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_client.py +0 -0
  194. {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_listing.py +0 -0
  195. {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_pull.py +0 -0
  196. {datachain-0.3.8 → datachain-0.3.10}/tests/func/test_pytorch.py +0 -0
  197. {datachain-0.3.8 → datachain-0.3.10}/tests/scripts/name_len_slow.py +0 -0
  198. {datachain-0.3.8 → datachain-0.3.10}/tests/test_cli_e2e.py +0 -0
  199. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/__init__.py +0 -0
  200. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/__init__.py +0 -0
  201. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/conftest.py +0 -0
  202. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_clip.py +0 -0
  203. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  204. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_datachain_merge.py +0 -0
  205. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_feature.py +0 -0
  206. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_feature_utils.py +0 -0
  207. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_file.py +0 -0
  208. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_hf.py +0 -0
  209. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_image.py +0 -0
  210. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_schema.py +0 -0
  211. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_sql_to_python.py +0 -0
  212. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_text.py +0 -0
  213. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_udf_signature.py +0 -0
  214. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_utils.py +0 -0
  215. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/lib/test_webdataset.py +0 -0
  216. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/__init__.py +0 -0
  217. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/sqlite/__init__.py +0 -0
  218. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/sqlite/test_utils.py +0 -0
  219. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_array.py +0 -0
  220. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_conditional.py +0 -0
  221. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_path.py +0 -0
  222. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_random.py +0 -0
  223. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_selectable.py +0 -0
  224. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/sql/test_string.py +0 -0
  225. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_asyn.py +0 -0
  226. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_cache.py +0 -0
  227. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_catalog.py +0 -0
  228. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_catalog_loader.py +0 -0
  229. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_cli_parsing.py +0 -0
  230. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_client.py +0 -0
  231. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_client_s3.py +0 -0
  232. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_data_storage.py +0 -0
  233. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_database_engine.py +0 -0
  234. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_dispatch.py +0 -0
  235. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_fileslice.py +0 -0
  236. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_id_generator.py +0 -0
  237. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_metastore.py +0 -0
  238. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_module_exports.py +0 -0
  239. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_query_metrics.py +0 -0
  240. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_query_params.py +0 -0
  241. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_serializer.py +0 -0
  242. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_session.py +0 -0
  243. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_udf.py +0 -0
  244. {datachain-0.3.8 → datachain-0.3.10}/tests/unit/test_warehouse.py +0 -0
  245. {datachain-0.3.8 → datachain-0.3.10}/tests/utils.py +0 -0
{datachain-0.3.8 → datachain-0.3.10}/.github/workflows/tests.yml
@@ -50,7 +50,7 @@ jobs:
       run: nox -s lint

   datachain:
-    timeout-minutes: 30
+    timeout-minutes: 40
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
{datachain-0.3.8 → datachain-0.3.10}/.pre-commit-config.yaml
@@ -24,7 +24,7 @@ repos:
       - id: trailing-whitespace
         exclude: '^LICENSES/'
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.6.1'
+    rev: 'v0.6.3'
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
{datachain-0.3.8/src/datachain.egg-info → datachain-0.3.10}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.8
+Version: 0.3.10
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -43,6 +43,7 @@ Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<11,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
 Requires-Dist: psutil
+Requires-Dist: huggingface_hub
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -61,7 +62,7 @@ Provides-Extra: vector
 Requires-Dist: usearch; extra == "vector"
 Provides-Extra: hf
 Requires-Dist: numba>=0.60.0; extra == "hf"
-Requires-Dist: datasets[audio,vision]; extra == "hf"
+Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
 Provides-Extra: tests
 Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
 Requires-Dist: pytest<9,>=8; extra == "tests"
@@ -115,31 +116,30 @@ AI 🔗 DataChain

 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.

 Key Features
 ============

 📂 **Storage as a Source of Truth.**
-  - Process unstructured data without redundant copies: S3, GCP, Azure, and local
+  - Process unstructured data without redundant copies from S3, GCP, Azure, and local
     file systems.
-  - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-  - Join files and metadata together into persistent, versioned, columnar datasets.
+  - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+  - Unite files and metadata together into persistent, versioned, columnar datasets.

 🐍 **Python-friendly data pipelines.**
   - Operate on Python objects and object fields.
-  - Built-in parallelization and out-of-memory compute without a need in SQL or
-    Spark jobs.
+  - Built-in parallelization and out-of-memory compute without SQL or Spark.

 🧠 **Data Enrichment and Processing.**
-  - Generate metadata columns using local AI models and LLM APIs.
-  - Filter, join, and group by AI metadata. Vector similarity search.
-  - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+  - Generate metadata using local AI models and LLM APIs.
+  - Filter, join, and group by metadata. Search by vector embeddings.
+  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.

 🚀 **Efficiency.**
   - Parallelization, out-of-memory workloads and data caching.
   - Vectorized operations on Python object fields: sum, count, avg, etc.
-  - Vector search on embeddings.
+  - Optimized vector search.


 Quick Start
@@ -164,7 +164,7 @@ where each image has a matching JSON file like `cat.1009.json`:
     "inference": {"class": "dog", "confidence": 0.68}
 }

-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:


 .. code:: py
@@ -234,7 +234,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================

-LLMs can work as efficient universal classifiers. In the example below,
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai

{datachain-0.3.8 → datachain-0.3.10}/README.rst
@@ -18,31 +18,30 @@ AI 🔗 DataChain

 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
-your local machine.
+your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.

 Key Features
 ============

 📂 **Storage as a Source of Truth.**
-  - Process unstructured data without redundant copies: S3, GCP, Azure, and local
+  - Process unstructured data without redundant copies from S3, GCP, Azure, and local
     file systems.
-  - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
-  - Join files and metadata together into persistent, versioned, columnar datasets.
+  - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+  - Unite files and metadata together into persistent, versioned, columnar datasets.

 🐍 **Python-friendly data pipelines.**
   - Operate on Python objects and object fields.
-  - Built-in parallelization and out-of-memory compute without a need in SQL or
-    Spark jobs.
+  - Built-in parallelization and out-of-memory compute without SQL or Spark.

 🧠 **Data Enrichment and Processing.**
-  - Generate metadata columns using local AI models and LLM APIs.
-  - Filter, join, and group by AI metadata. Vector similarity search.
-  - Pass datasets to Pytorch and Tensorflow, or export back into storage.
+  - Generate metadata using local AI models and LLM APIs.
+  - Filter, join, and group by metadata. Search by vector embeddings.
+  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.

 🚀 **Efficiency.**
   - Parallelization, out-of-memory workloads and data caching.
   - Vectorized operations on Python object fields: sum, count, avg, etc.
-  - Vector search on embeddings.
+  - Optimized vector search.


 Quick Start
@@ -67,7 +66,7 @@ where each image has a matching JSON file like `cat.1009.json`:
     "inference": {"class": "dog", "confidence": 0.68}
 }

-Example of downloading only high-confidence cat images using JSON metadata:
+Example of downloading only "high-confidence cat" inferred images using JSON metadata:


 .. code:: py
@@ -137,7 +136,7 @@ detected are then copied to the local directory.
 LLM judging chatbots
 =============================

-LLMs can work as efficient universal classifiers. In the example below,
+LLMs can work as universal classifiers. In the example below,
 we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai

{datachain-0.3.8 → datachain-0.3.10}/examples/get_started/udfs/parallel.py
@@ -31,7 +31,7 @@ def path_len_benchmark(path):

 # Run in chain
 DataChain.from_storage(
-    path="gs://datachain-demo/dogs-and-cats/",
+    "gs://datachain-demo/dogs-and-cats/",
 ).settings(parallel=-1).map(
     path_len_benchmark,
     params=["file.path"],
{datachain-0.3.8 → datachain-0.3.10}/examples/get_started/udfs/simple.py
@@ -11,7 +11,7 @@ def path_len(path):
 if __name__ == "__main__":
     # Run in chain
     DataChain.from_storage(
-        path="gs://datachain-demo/dogs-and-cats/",
+        uri="gs://datachain-demo/dogs-and-cats/",
     ).map(
         path_len,
         params=["file.path"],
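
Both example updates above track the same API adjustment: DataChain.from_storage takes the storage location as its first parameter, named uri, so the old path= keyword is now passed either positionally or as uri=. A minimal sketch of the updated call style (the bucket URI follows the examples; the output mapping is an illustrative assumption, not part of the diff):

    from datachain import DataChain

    def path_len(path: str) -> int:
        # Tiny UDF: length of the file path string.
        return len(path)

    chain = DataChain.from_storage(
        "gs://datachain-demo/dogs-and-cats/",  # positional; uri="..." also works
    ).map(
        path_len,
        params=["file.path"],
        output={"path_len": int},  # illustrative output schema
    )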
{datachain-0.3.8 → datachain-0.3.10}/examples/llm_and_nlp/unstructured-text.py
@@ -1,5 +1,5 @@
 #
-# pip install unstructured[pdf] nltk==3.8.1 huggingface_hub[hf_transfer]
+# pip install unstructured[pdf] huggingface_hub[hf_transfer]
 #
 import os

{datachain-0.3.8 → datachain-0.3.10}/examples/multimodal/wds_filtered.py
@@ -1,13 +1,11 @@
 import datachain.error
 from datachain import C, DataChain
-from datachain.lib.model_store import ModelStore
 from datachain.lib.webdataset import process_webdataset
-from datachain.lib.webdataset_laion import LaionMeta, WDSLaion
+from datachain.lib.webdataset_laion import WDSLaion
 from datachain.sql import literal
 from datachain.sql.functions import array, greatest, least, string

 name = "wds"
-ModelStore.register(LaionMeta)
 try:
     wds = DataChain.from_dataset(name=name)
 except datachain.error.DatasetNotFoundError:
{datachain-0.3.8 → datachain-0.3.10}/pyproject.toml
@@ -45,7 +45,8 @@ dependencies = [
     "datamodel-code-generator>=0.25",
     "Pillow>=10.0.0,<11",
     "msgpack>=1.0.4,<2",
-    "psutil"
+    "psutil",
+    "huggingface_hub"
 ]

 [project.optional-dependencies]
@@ -71,7 +72,7 @@ vector = [
 ]
 hf = [
     "numba>=0.60.0",
-    "datasets[audio,vision]"
+    "datasets[audio,vision]>=2.21.0"
 ]
 tests = [
     "datachain[torch,remote,vector,hf]",
{datachain-0.3.8 → datachain-0.3.10}/src/datachain/catalog/catalog.py
@@ -156,8 +156,6 @@ class QueryResult(NamedTuple):
     dataset: Optional[DatasetRecord]
     version: Optional[int]
     output: str
-    preview: Optional[list[dict]]
-    metrics: dict[str, Any]


 class DatasetRowsFetcher(NodesThreadPool):
@@ -1020,20 +1018,6 @@ class Catalog:

         return node_groups

-    def unlist_source(self, uri: StorageURI) -> None:
-        self.metastore.clone(uri=uri).mark_storage_not_indexed(uri)
-
-    def storage_stats(self, uri: StorageURI) -> Optional[DatasetStats]:
-        """
-        Returns tuple with storage stats: total number of rows and total dataset size.
-        """
-        partial_path = self.metastore.get_last_partial_path(uri)
-        if partial_path is None:
-            return None
-        dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
-
-        return self.dataset_stats(dataset.name, dataset.latest_version)
-
     def create_dataset(
         self,
         name: str,
@@ -1297,19 +1281,6 @@ class Catalog:

         return self.get_dataset(name)

-    def register_new_dataset(
-        self,
-        source_dataset: DatasetRecord,
-        source_version: int,
-        target_name: str,
-    ) -> DatasetRecord:
-        target_dataset = self.metastore.create_dataset(
-            target_name,
-            query_script=source_dataset.query_script,
-            schema=source_dataset.serialized_schema,
-        )
-        return self.register_dataset(source_dataset, source_version, target_dataset, 1)
-
     def register_dataset(
         self,
         dataset: DatasetRecord,
@@ -1422,17 +1393,18 @@ class Catalog:

         return direct_dependencies

-    def ls_datasets(self) -> Iterator[DatasetRecord]:
+    def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetRecord]:
         datasets = self.metastore.list_datasets()
         for d in datasets:
-            if not d.is_bucket_listing:
+            if not d.is_bucket_listing or include_listing:
                 yield d

     def list_datasets_versions(
         self,
+        include_listing: bool = False,
     ) -> Iterator[tuple[DatasetRecord, "DatasetVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
-        datasets = list(self.ls_datasets())
+        datasets = list(self.ls_datasets(include_listing=include_listing))

         # preselect dataset versions jobs from db to avoid multiple queries
         jobs_ids: set[str] = {
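
The new include_listing flag lets callers opt in to the internal bucket-listing datasets that ls_datasets otherwise filters out. A brief sketch of how this reads at a call site (assumes an already-constructed Catalog instance named catalog; construction is elided):

    # Default behavior: internal bucket-listing datasets stay hidden.
    user_datasets = list(catalog.ls_datasets())

    # Opt in to see listing datasets as well, e.g. when debugging.
    for ds in catalog.ls_datasets(include_listing=True):
        print(ds.name)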
@@ -1560,17 +1532,8 @@ class Catalog:
         version = self.get_dataset(dataset_name).get_version(dataset_version)

         file_signals_values = {}
-        file_schemas = {}
-        # TODO: To remove after we properly fix deserialization
-        for signal, type_name in version.feature_schema.items():
-            from datachain.lib.model_store import ModelStore
-
-            type_name_parsed, v = ModelStore.parse_name_version(type_name)
-            fr = ModelStore.get(type_name_parsed, v)
-            if fr and issubclass(fr, File):
-                file_schemas[signal] = type_name

-        schema = SignalSchema.deserialize(file_schemas)
+        schema = SignalSchema.deserialize(version.feature_schema)
         for file_signals in schema.get_signals(File):
             prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
             file_signals_values[file_signals] = {
@@ -1641,15 +1604,6 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)

-    def ls_storage_uris(self) -> Iterator[str]:
-        yield from self.metastore.get_all_storage_uris()
-
-    def get_storage(self, uri: StorageURI) -> Storage:
-        return self.metastore.get_storage(uri)
-
-    def ls_storages(self) -> list[Storage]:
-        return self.metastore.list_storages()
-
     def pull_dataset(
         self,
         dataset_uri: str,
@@ -1883,10 +1837,6 @@ class Catalog:
         envs: Optional[Mapping[str, str]] = None,
         python_executable: Optional[str] = None,
         save: bool = False,
-        save_as: Optional[str] = None,
-        preview_limit: int = 10,
-        preview_offset: int = 0,
-        preview_columns: Optional[list[str]] = None,
         capture_output: bool = True,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
@@ -1914,9 +1864,8 @@ class Catalog:
             C.size > 1000
         )
         """
-        from datachain.query.dataset import ExecutionResult

-        feature_file = tempfile.NamedTemporaryFile(
+        feature_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
             dir=os.getcwd(), suffix=".py", delete=False
         )
         _, feature_module = os.path.split(feature_file.name)
@@ -1931,11 +1880,7 @@ class Catalog:
                 feature_module,
                 output_hook,
                 params,
-                preview_columns,
-                preview_limit,
-                preview_offset,
                 save,
-                save_as,
                 job_id,
             )
         finally:
@@ -1964,25 +1909,18 @@ class Catalog:
         )

         try:
-            response = json.loads(response_text)
+            result = json.loads(response_text)
         except ValueError:
-            response = {}
-        exec_result = ExecutionResult(**response)
+            result = None

         dataset: Optional[DatasetRecord] = None
         version: Optional[int] = None
-        if save or save_as:
+        if save:
             dataset, version = self.save_result(
-                query_script, exec_result, output, version, job_id
+                query_script, result, output, version, job_id
             )

-        return QueryResult(
-            dataset=dataset,
-            version=version,
-            output=output,
-            preview=exec_result.preview,
-            metrics=exec_result.metrics,
-        )
+        return QueryResult(dataset=dataset, version=version, output=output)

     def run_query(
         self,
@@ -1994,11 +1932,7 @@ class Catalog:
         feature_module: str,
         output_hook: Callable[[str], None],
         params: Optional[dict[str, str]],
-        preview_columns: Optional[list[str]],
-        preview_limit: int,
-        preview_offset: int,
         save: bool,
-        save_as: Optional[str],
         job_id: Optional[str],
     ) -> tuple[list[str], subprocess.Popen, str]:
         try:
@@ -2013,10 +1947,6 @@ class Catalog:
             raise QueryScriptCompileError(
                 f"Query script failed to compile, reason: {exc}"
             ) from exc
-        if save_as and save_as.startswith(QUERY_DATASET_PREFIX):
-            raise ValueError(
-                f"Cannot use {QUERY_DATASET_PREFIX} prefix for dataset name"
-            )
         r, w = os.pipe()
         if os.name == "nt":
             import msvcrt
@@ -2039,15 +1969,7 @@ class Catalog:
             {
                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                 "PYTHONPATH": os.getcwd(),  # For local imports
-                "DATACHAIN_QUERY_PREVIEW_ARGS": json.dumps(
-                    {
-                        "limit": preview_limit,
-                        "offset": preview_offset,
-                        "columns": preview_columns,
-                    }
-                ),
                 "DATACHAIN_QUERY_SAVE": "1" if save else "",
-                "DATACHAIN_QUERY_SAVE_AS": save_as or "",
                 "PYTHONUNBUFFERED": "1",
                 "DATACHAIN_OUTPUT_FD": str(handle),
                 "DATACHAIN_JOB_ID": job_id or "",
@@ -2077,12 +1999,12 @@ class Catalog:
         return lines, proc, response_text

     def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result.dataset:
+        if not exec_result:
             raise QueryScriptDatasetNotFound(
                 "No dataset found after running Query script",
                 output=output,
             )
-        name, version = exec_result.dataset
+        name, version = exec_result
         # finding returning dataset
         try:
             dataset = self.get_dataset(name)
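
With ExecutionResult gone, save_result now expects the query subprocess to hand back a plain (name, version) pair decoded from JSON, rather than a structured result carrying preview and metrics. A speculative sketch of the slimmed-down contract on the script side (the write-to-fd mechanism is inferred from the DATACHAIN_OUTPUT_FD variable set in run_query; the dataset name and version are hypothetical):

    import json
    import os

    # Hypothetical: report the saved dataset back to the parent process.
    result = ["my_dataset", 1]  # (name, version) pair; decoded via json.loads above
    fd = int(os.environ["DATACHAIN_OUTPUT_FD"])
    with os.fdopen(fd, "w") as out:
        out.write(json.dumps(result))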
{datachain-0.3.8 → datachain-0.3.10}/src/datachain/cli.py
@@ -14,6 +14,7 @@ import shtab

 from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
+from datachain.lib.dc import DataChain
 from datachain.utils import DataChainDir

 if TYPE_CHECKING:
@@ -472,9 +473,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )
-    query_parser.add_argument(
-        "dataset_name", nargs="?", type=str, help="Save result dataset as"
-    )
     query_parser.add_argument(
         "--parallel",
         nargs="?",
@@ -487,7 +485,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
             "N defaults to the CPU count."
         ),
     )
-    add_show_args(query_parser)
     query_parser.add_argument(
         "-p",
         "--param",
@@ -619,18 +616,6 @@ def _ls_urls_flat(
         raise FileNotFoundError(f"No such file or directory: {source}")


-def ls_indexed_storages(catalog: "Catalog", long: bool = False) -> Iterator[str]:
-    from datachain.node import long_line_str
-
-    storage_uris = catalog.ls_storage_uris()
-    if long:
-        for uri in storage_uris:
-            # TODO: add Storage.created so it can be used here
-            yield long_line_str(uri, None, "")
-    else:
-        yield from storage_uris
-
-
 def ls_local(
     sources,
     long: bool = False,
@@ -661,8 +646,9 @@ def ls_local(
         for entry in entries:
             print(format_ls_entry(entry))
     else:
-        for entry in ls_indexed_storages(catalog, long=long):
-            print(format_ls_entry(entry))
+        chain = DataChain.listings()
+        for ls in chain.collect("listing"):
+            print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]


 def format_ls_entry(entry: str) -> str:
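
The ls fallback now walks the new listings API instead of the removed storage tables. Used directly, the same pattern looks roughly like this (the uri and version fields follow the diff; other attributes of the collected listing objects are not assumed):

    from datachain.lib.dc import DataChain

    # Each collected "listing" item describes one indexed storage location.
    for ls in DataChain.listings().collect("listing"):
        print(f"{ls.uri}@v{ls.version}")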
@@ -813,16 +799,10 @@ def show(
 def query(
     catalog: "Catalog",
     script: str,
-    dataset_name: Optional[str] = None,
     parallel: Optional[int] = None,
-    limit: int = 10,
-    offset: int = 0,
-    columns: Optional[list[str]] = None,
-    no_collapse: bool = False,
     params: Optional[dict[str, str]] = None,
 ) -> None:
     from datachain.data_storage import JobQueryType, JobStatus
-    from datachain.utils import show_records

     with open(script, encoding="utf-8") as f:
         script_content = f.read()
@@ -843,13 +823,9 @@
     )

     try:
-        result = catalog.query(
+        catalog.query(
             script_content,
             python_executable=python_executable,
-            save_as=dataset_name,
-            preview_limit=limit,
-            preview_offset=offset,
-            preview_columns=columns,
             capture_output=False,
             params=params,
             job_id=job_id,
@@ -864,10 +840,7 @@
             error_stack=error_stack,
         )
         raise
-
-    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE, metrics=result.metrics)
-
-    show_records(result.preview, collapse_columns=not no_collapse)
+    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)


 def clear_cache(catalog: "Catalog"):
@@ -1042,12 +1015,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
         query(
             catalog,
             args.script,
-            dataset_name=args.dataset_name,
             parallel=args.parallel,
-            limit=args.limit,
-            offset=args.offset,
-            columns=args.columns,
-            no_collapse=args.no_collapse,
             params=args.param,
         )
     elif args.command == "apply-udf":
{datachain-0.3.8 → datachain-0.3.10}/src/datachain/client/fsspec.py
@@ -87,6 +87,7 @@ class Client(ABC):
     def get_implementation(url: str) -> type["Client"]:
         from .azure import AzureClient
         from .gcs import GCSClient
+        from .hf import HfClient
         from .local import FileClient
         from .s3 import ClientS3

@@ -104,6 +105,8 @@ class Client(ABC):
             return AzureClient
         if protocol == FileClient.protocol:
             return FileClient
+        if protocol == HfClient.protocol:
+            return HfClient

         raise NotImplementedError(f"Unsupported protocol: {protocol}")

datachain-0.3.10/src/datachain/client/hf.py
@@ -0,0 +1,47 @@
+import os
+import posixpath
+from typing import Any, cast
+
+from huggingface_hub import HfFileSystem
+
+from datachain.lib.file import File
+from datachain.node import Entry
+
+from .fsspec import Client
+
+
+class HfClient(Client):
+    FS_CLASS = HfFileSystem
+    PREFIX = "hf://"
+    protocol = "hf"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HfFileSystem:
+        if os.environ.get("HF_TOKEN"):
+            kwargs["token"] = os.environ["HF_TOKEN"]
+
+        return cast(HfFileSystem, super().create_fs(**kwargs))
+
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
+        return Entry.from_file(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    async def ls_dir(self, path):
+        return self.fs.ls(path, detail=True)
+
+    def rel_path(self, path):
+        return posixpath.relpath(path, self.name)
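
Together with the fsspec registration above, the new client routes hf:// URIs through the standard ingestion path. A speculative usage sketch (the repo path is illustrative; per create_fs, an HF_TOKEN environment variable is picked up when set, which private repos would require):

    from datachain import DataChain

    # Files from a Hugging Face repo are listed via the new hf:// protocol;
    # resulting File objects carry path, size, version (commit oid), and etag.
    chain = DataChain.from_storage("hf://datasets/some-org/some-dataset/")
    for file in chain.collect("file"):
        print(file.path, file.size)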