datachain 0.8.0__tar.gz → 0.8.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (290)
  1. {datachain-0.8.0 → datachain-0.8.2}/.github/workflows/benchmarks.yml +1 -1
  2. {datachain-0.8.0 → datachain-0.8.2}/.github/workflows/release.yml +1 -1
  3. {datachain-0.8.0 → datachain-0.8.2}/.github/workflows/tests-studio.yml +1 -1
  4. {datachain-0.8.0 → datachain-0.8.2}/.github/workflows/tests.yml +3 -3
  5. {datachain-0.8.0 → datachain-0.8.2}/.pre-commit-config.yaml +1 -1
  6. {datachain-0.8.0/src/datachain.egg-info → datachain-0.8.2}/PKG-INFO +85 -3
  7. {datachain-0.8.0 → datachain-0.8.2}/README.rst +82 -0
  8. {datachain-0.8.0 → datachain-0.8.2}/docs/quick-start.md +10 -8
  9. {datachain-0.8.0 → datachain-0.8.2}/pyproject.toml +2 -2
  10. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/catalog/catalog.py +3 -4
  11. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/gcs.py +10 -0
  12. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/warehouse.py +0 -1
  13. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/arrow.py +82 -58
  14. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/dc.py +12 -57
  15. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/file.py +3 -1
  16. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/listing.py +44 -0
  17. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/udf.py +0 -1
  18. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/batch.py +32 -6
  19. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/dataset.py +17 -17
  20. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/dispatch.py +125 -125
  21. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/session.py +8 -5
  22. datachain-0.8.2/src/datachain/query/udf.py +20 -0
  23. datachain-0.8.2/src/datachain/query/utils.py +42 -0
  24. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/utils.py +1 -1
  25. {datachain-0.8.0 → datachain-0.8.2/src/datachain.egg-info}/PKG-INFO +85 -3
  26. {datachain-0.8.0 → datachain-0.8.2}/src/datachain.egg-info/SOURCES.txt +4 -0
  27. {datachain-0.8.0 → datachain-0.8.2}/src/datachain.egg-info/requires.txt +2 -2
  28. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_catalog.py +6 -2
  29. datachain-0.8.2/tests/func/test_session.py +25 -0
  30. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_arrow.py +26 -0
  31. datachain-0.8.2/tests/unit/test_client_gcs.py +6 -0
  32. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_listing.py +29 -2
  33. {datachain-0.8.0 → datachain-0.8.2}/.cruft.json +0 -0
  34. {datachain-0.8.0 → datachain-0.8.2}/.gitattributes +0 -0
  35. {datachain-0.8.0 → datachain-0.8.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  36. {datachain-0.8.0 → datachain-0.8.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  37. {datachain-0.8.0 → datachain-0.8.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  38. {datachain-0.8.0 → datachain-0.8.2}/.github/codecov.yaml +0 -0
  39. {datachain-0.8.0 → datachain-0.8.2}/.github/dependabot.yml +0 -0
  40. {datachain-0.8.0 → datachain-0.8.2}/.github/workflows/update-template.yaml +0 -0
  41. {datachain-0.8.0 → datachain-0.8.2}/.gitignore +0 -0
  42. {datachain-0.8.0 → datachain-0.8.2}/CODE_OF_CONDUCT.rst +0 -0
  43. {datachain-0.8.0 → datachain-0.8.2}/LICENSE +0 -0
  44. {datachain-0.8.0 → datachain-0.8.2}/docs/assets/captioned_cartoons.png +0 -0
  45. {datachain-0.8.0 → datachain-0.8.2}/docs/assets/datachain-white.svg +0 -0
  46. {datachain-0.8.0 → datachain-0.8.2}/docs/assets/datachain.svg +0 -0
  47. {datachain-0.8.0 → datachain-0.8.2}/docs/contributing.md +0 -0
  48. {datachain-0.8.0 → datachain-0.8.2}/docs/css/github-permalink-style.css +0 -0
  49. {datachain-0.8.0 → datachain-0.8.2}/docs/examples.md +0 -0
  50. {datachain-0.8.0 → datachain-0.8.2}/docs/index.md +0 -0
  51. {datachain-0.8.0 → datachain-0.8.2}/docs/overrides/main.html +0 -0
  52. {datachain-0.8.0 → datachain-0.8.2}/docs/references/datachain.md +0 -0
  53. {datachain-0.8.0 → datachain-0.8.2}/docs/references/datatype.md +0 -0
  54. {datachain-0.8.0 → datachain-0.8.2}/docs/references/file.md +0 -0
  55. {datachain-0.8.0 → datachain-0.8.2}/docs/references/index.md +0 -0
  56. {datachain-0.8.0 → datachain-0.8.2}/docs/references/sql.md +0 -0
  57. {datachain-0.8.0 → datachain-0.8.2}/docs/references/torch.md +0 -0
  58. {datachain-0.8.0 → datachain-0.8.2}/docs/references/udf.md +0 -0
  59. {datachain-0.8.0 → datachain-0.8.2}/docs/tutorials.md +0 -0
  60. {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  61. {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  62. {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/openimage-detect.py +0 -0
  63. {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/ultralytics-bbox.py +0 -0
  64. {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/ultralytics-pose.py +0 -0
  65. {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/ultralytics-segment.py +0 -0
  66. {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/common_sql_functions.py +0 -0
  67. {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/json-csv-reader.py +0 -0
  68. {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/torch-loader.py +0 -0
  69. {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/udfs/parallel.py +0 -0
  70. {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/udfs/simple.py +0 -0
  71. {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/udfs/stateful.py +0 -0
  72. {datachain-0.8.0 → datachain-0.8.2}/examples/llm_and_nlp/claude-query.py +0 -0
  73. {datachain-0.8.0 → datachain-0.8.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  74. {datachain-0.8.0 → datachain-0.8.2}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  75. {datachain-0.8.0 → datachain-0.8.2}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  76. {datachain-0.8.0 → datachain-0.8.2}/examples/multimodal/clip_inference.py +0 -0
  77. {datachain-0.8.0 → datachain-0.8.2}/examples/multimodal/hf_pipeline.py +0 -0
  78. {datachain-0.8.0 → datachain-0.8.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
  79. {datachain-0.8.0 → datachain-0.8.2}/examples/multimodal/wds.py +0 -0
  80. {datachain-0.8.0 → datachain-0.8.2}/examples/multimodal/wds_filtered.py +0 -0
  81. {datachain-0.8.0 → datachain-0.8.2}/mkdocs.yml +0 -0
  82. {datachain-0.8.0 → datachain-0.8.2}/noxfile.py +0 -0
  83. {datachain-0.8.0 → datachain-0.8.2}/setup.cfg +0 -0
  84. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/__init__.py +0 -0
  85. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/__main__.py +0 -0
  86. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/asyn.py +0 -0
  87. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/cache.py +0 -0
  88. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/catalog/__init__.py +0 -0
  89. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/catalog/datasource.py +0 -0
  90. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/catalog/loader.py +0 -0
  91. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/cli.py +0 -0
  92. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/cli_utils.py +0 -0
  93. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/__init__.py +0 -0
  94. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/azure.py +0 -0
  95. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/fileslice.py +0 -0
  96. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/fsspec.py +0 -0
  97. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/hf.py +0 -0
  98. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/local.py +0 -0
  99. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/s3.py +0 -0
  100. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/config.py +0 -0
  101. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/__init__.py +0 -0
  102. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/db_engine.py +0 -0
  103. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/job.py +0 -0
  104. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/metastore.py +0 -0
  105. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/schema.py +0 -0
  106. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/serializer.py +0 -0
  107. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/sqlite.py +0 -0
  108. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/dataset.py +0 -0
  109. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/error.py +0 -0
  110. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/__init__.py +0 -0
  111. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/aggregate.py +0 -0
  112. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/array.py +0 -0
  113. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/base.py +0 -0
  114. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/conditional.py +0 -0
  115. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/func.py +0 -0
  116. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/numeric.py +0 -0
  117. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/path.py +0 -0
  118. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/random.py +0 -0
  119. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/string.py +0 -0
  120. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/window.py +0 -0
  121. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/job.py +0 -0
  122. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/__init__.py +0 -0
  123. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/clip.py +0 -0
  124. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/__init__.py +0 -0
  125. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/flatten.py +0 -0
  126. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
  127. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
  128. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/unflatten.py +0 -0
  129. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  130. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/data_model.py +0 -0
  131. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/dataset_info.py +0 -0
  132. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/diff.py +0 -0
  133. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/hf.py +0 -0
  134. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/image.py +0 -0
  135. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/listing_info.py +0 -0
  136. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/meta_formats.py +0 -0
  137. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/model_store.py +0 -0
  138. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/pytorch.py +0 -0
  139. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/settings.py +0 -0
  140. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/signal_schema.py +0 -0
  141. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/tar.py +0 -0
  142. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/text.py +0 -0
  143. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/udf_signature.py +0 -0
  144. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/utils.py +0 -0
  145. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/vfile.py +0 -0
  146. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/webdataset.py +0 -0
  147. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/webdataset_laion.py +0 -0
  148. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/listing.py +0 -0
  149. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/__init__.py +0 -0
  150. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/bbox.py +0 -0
  151. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/pose.py +0 -0
  152. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/segment.py +0 -0
  153. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/ultralytics/__init__.py +0 -0
  154. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/ultralytics/bbox.py +0 -0
  155. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/ultralytics/pose.py +0 -0
  156. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/ultralytics/segment.py +0 -0
  157. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/node.py +0 -0
  158. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/nodes_fetcher.py +0 -0
  159. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/nodes_thread_pool.py +0 -0
  160. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/progress.py +0 -0
  161. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/py.typed +0 -0
  162. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/__init__.py +0 -0
  163. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/metrics.py +0 -0
  164. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/params.py +0 -0
  165. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/queue.py +0 -0
  166. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/schema.py +0 -0
  167. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/remote/__init__.py +0 -0
  168. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/remote/studio.py +0 -0
  169. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/__init__.py +0 -0
  170. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/default/__init__.py +0 -0
  171. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/default/base.py +0 -0
  172. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/__init__.py +0 -0
  173. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/aggregate.py +0 -0
  174. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/array.py +0 -0
  175. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/conditional.py +0 -0
  176. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/numeric.py +0 -0
  177. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/path.py +0 -0
  178. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/random.py +0 -0
  179. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/string.py +0 -0
  180. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/selectable.py +0 -0
  181. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/sqlite/__init__.py +0 -0
  182. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/sqlite/base.py +0 -0
  183. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/sqlite/types.py +0 -0
  184. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/sqlite/vector.py +0 -0
  185. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/types.py +0 -0
  186. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/utils.py +0 -0
  187. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/studio.py +0 -0
  188. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/telemetry.py +0 -0
  189. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/toolkit/__init__.py +0 -0
  190. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/toolkit/split.py +0 -0
  191. {datachain-0.8.0 → datachain-0.8.2}/src/datachain/torch/__init__.py +0 -0
  192. {datachain-0.8.0 → datachain-0.8.2}/src/datachain.egg-info/dependency_links.txt +0 -0
  193. {datachain-0.8.0 → datachain-0.8.2}/src/datachain.egg-info/entry_points.txt +0 -0
  194. {datachain-0.8.0 → datachain-0.8.2}/src/datachain.egg-info/top_level.txt +0 -0
  195. {datachain-0.8.0 → datachain-0.8.2}/tests/__init__.py +0 -0
  196. {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/__init__.py +0 -0
  197. {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/conftest.py +0 -0
  198. {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  199. {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/datasets/.dvc/config +0 -0
  200. {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/datasets/.gitignore +0 -0
  201. {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  202. {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/test_datachain.py +0 -0
  203. {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/test_ls.py +0 -0
  204. {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/test_version.py +0 -0
  205. {datachain-0.8.0 → datachain-0.8.2}/tests/conftest.py +0 -0
  206. {datachain-0.8.0 → datachain-0.8.2}/tests/data.py +0 -0
  207. {datachain-0.8.0 → datachain-0.8.2}/tests/examples/__init__.py +0 -0
  208. {datachain-0.8.0 → datachain-0.8.2}/tests/examples/test_examples.py +0 -0
  209. {datachain-0.8.0 → datachain-0.8.2}/tests/examples/test_wds_e2e.py +0 -0
  210. {datachain-0.8.0 → datachain-0.8.2}/tests/examples/wds_data.py +0 -0
  211. {datachain-0.8.0 → datachain-0.8.2}/tests/func/__init__.py +0 -0
  212. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_client.py +0 -0
  213. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_datachain.py +0 -0
  214. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_dataset_query.py +0 -0
  215. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_datasets.py +0 -0
  216. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_feature_pickling.py +0 -0
  217. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_listing.py +0 -0
  218. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_ls.py +0 -0
  219. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_meta_formats.py +0 -0
  220. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_metrics.py +0 -0
  221. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_pull.py +0 -0
  222. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_pytorch.py +0 -0
  223. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_query.py +0 -0
  224. {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_toolkit.py +0 -0
  225. {datachain-0.8.0 → datachain-0.8.2}/tests/scripts/feature_class.py +0 -0
  226. {datachain-0.8.0 → datachain-0.8.2}/tests/scripts/feature_class_exception.py +0 -0
  227. {datachain-0.8.0 → datachain-0.8.2}/tests/scripts/feature_class_parallel.py +0 -0
  228. {datachain-0.8.0 → datachain-0.8.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  229. {datachain-0.8.0 → datachain-0.8.2}/tests/scripts/name_len_slow.py +0 -0
  230. {datachain-0.8.0 → datachain-0.8.2}/tests/test_atomicity.py +0 -0
  231. {datachain-0.8.0 → datachain-0.8.2}/tests/test_cli_e2e.py +0 -0
  232. {datachain-0.8.0 → datachain-0.8.2}/tests/test_cli_studio.py +0 -0
  233. {datachain-0.8.0 → datachain-0.8.2}/tests/test_query_e2e.py +0 -0
  234. {datachain-0.8.0 → datachain-0.8.2}/tests/test_telemetry.py +0 -0
  235. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/__init__.py +0 -0
  236. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/__init__.py +0 -0
  237. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/conftest.py +0 -0
  238. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_clip.py +0 -0
  239. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_datachain.py +0 -0
  240. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  241. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_datachain_merge.py +0 -0
  242. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_diff.py +0 -0
  243. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_feature.py +0 -0
  244. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_feature_utils.py +0 -0
  245. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_file.py +0 -0
  246. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_hf.py +0 -0
  247. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_image.py +0 -0
  248. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_listing_info.py +0 -0
  249. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_models.py +0 -0
  250. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_schema.py +0 -0
  251. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_signal_schema.py +0 -0
  252. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_sql_to_python.py +0 -0
  253. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_text.py +0 -0
  254. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_udf_signature.py +0 -0
  255. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_utils.py +0 -0
  256. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_webdataset.py +0 -0
  257. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/__init__.py +0 -0
  258. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/sqlite/__init__.py +0 -0
  259. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/sqlite/test_types.py +0 -0
  260. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
  261. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_array.py +0 -0
  262. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_conditional.py +0 -0
  263. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_path.py +0 -0
  264. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_random.py +0 -0
  265. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_selectable.py +0 -0
  266. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_string.py +0 -0
  267. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_asyn.py +0 -0
  268. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_cache.py +0 -0
  269. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_catalog.py +0 -0
  270. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_catalog_loader.py +0 -0
  271. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_cli_parsing.py +0 -0
  272. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_client.py +0 -0
  273. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_client_s3.py +0 -0
  274. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_config.py +0 -0
  275. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_data_storage.py +0 -0
  276. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_database_engine.py +0 -0
  277. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_dataset.py +0 -0
  278. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_dispatch.py +0 -0
  279. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_fileslice.py +0 -0
  280. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_func.py +0 -0
  281. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_metastore.py +0 -0
  282. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_module_exports.py +0 -0
  283. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_query.py +0 -0
  284. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_query_metrics.py +0 -0
  285. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_query_params.py +0 -0
  286. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_serializer.py +0 -0
  287. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_session.py +0 -0
  288. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_utils.py +0 -0
  289. {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_warehouse.py +0 -0
  290. {datachain-0.8.0 → datachain-0.8.2}/tests/utils.py +0 -0
@@ -25,7 +25,7 @@ jobs:
25
25
  python-version: '3.12'
26
26
 
27
27
  - name: Setup uv
28
- uses: astral-sh/setup-uv@v4
28
+ uses: astral-sh/setup-uv@v5
29
29
  with:
30
30
  enable-cache: true
31
31
  cache-suffix: benchmarks
@@ -27,7 +27,7 @@ jobs:
27
27
  python-version: '3.12'
28
28
 
29
29
  - name: Setup uv
30
- uses: astral-sh/setup-uv@v4
30
+ uses: astral-sh/setup-uv@v5
31
31
 
32
32
  - name: Install nox
33
33
  run: uv pip install nox --system
@@ -81,7 +81,7 @@ jobs:
81
81
  python-version: ${{ matrix.pyv }}
82
82
 
83
83
  - name: Setup uv
84
- uses: astral-sh/setup-uv@v4
84
+ uses: astral-sh/setup-uv@v5
85
85
  with:
86
86
  enable-cache: true
87
87
  cache-suffix: studio
@@ -37,7 +37,7 @@ jobs:
37
37
  python-version: '3.9'
38
38
 
39
39
  - name: Setup uv
40
- uses: astral-sh/setup-uv@v4
40
+ uses: astral-sh/setup-uv@v5
41
41
  with:
42
42
  enable-cache: true
43
43
  cache-suffix: lint
@@ -94,7 +94,7 @@ jobs:
94
94
  python-version: ${{ matrix.pyv }}
95
95
 
96
96
  - name: Setup uv
97
- uses: astral-sh/setup-uv@v4
97
+ uses: astral-sh/setup-uv@v5
98
98
  with:
99
99
  enable-cache: true
100
100
  cache-suffix: tests-${{ matrix.pyv }}
@@ -157,7 +157,7 @@ jobs:
157
157
  python-version: ${{ matrix.pyv }}
158
158
 
159
159
  - name: Setup uv
160
- uses: astral-sh/setup-uv@v4
160
+ uses: astral-sh/setup-uv@v5
161
161
  with:
162
162
  enable-cache: true
163
163
  cache-suffix: examples-${{ matrix.pyv }}
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.8.3'
27
+ rev: 'v0.8.4'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.8.0
3
+ Version: 0.8.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -84,7 +84,7 @@ Requires-Dist: requests-mock; extra == "tests"
84
84
  Requires-Dist: scipy; extra == "tests"
85
85
  Provides-Extra: dev
86
86
  Requires-Dist: datachain[docs,tests]; extra == "dev"
87
- Requires-Dist: mypy==1.13.0; extra == "dev"
87
+ Requires-Dist: mypy==1.14.0; extra == "dev"
88
88
  Requires-Dist: types-python-dateutil; extra == "dev"
89
89
  Requires-Dist: types-pytz; extra == "dev"
90
90
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -99,7 +99,7 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
99
99
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
100
100
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
101
101
  Requires-Dist: onnx==1.16.1; extra == "examples"
102
- Requires-Dist: ultralytics==8.3.50; extra == "examples"
102
+ Requires-Dist: ultralytics==8.3.53; extra == "examples"
103
103
 
104
104
  ================
105
105
  |logo| DataChain
@@ -145,6 +145,88 @@ Getting Started
145
145
  Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
146
146
  to get started with `DataChain` and learn more.
147
147
 
148
+ .. code:: bash
149
+
150
+ pip install datachain
151
+
152
+
153
+ Example: download subset of files based on metadata
154
+ ---------------------------------------------------
155
+
156
+ Sometimes users only need to download a specific subset of files from cloud storage,
157
+ rather than the entire dataset.
158
+ For example, you could use a JSON file's metadata to download just cat images with
159
+ high confidence scores.
160
+
161
+
162
+ .. code:: py
163
+
164
+ from datachain import Column, DataChain
165
+
166
+ meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
167
+ images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
168
+
169
+ images_id = images.map(id=lambda file: file.path.split('.')[-2])
170
+ annotated = images_id.merge(meta, on="id", right_on="meta.id")
171
+
172
+ likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
173
+ & (Column("meta.inference.class_") == "cat"))
174
+ likely_cats.export_files("high-confidence-cats/", signal="file")
175
+
176
+
177
+ Example: LLM-based text-file evaluation
178
+ ---------------------------------------
179
+
180
+ In this example, we evaluate chatbot conversations stored in text files
181
+ using LLM-based evaluation.
182
+
183
+ .. code:: shell
184
+
185
+ $ pip install mistralai # Requires version >=1.0.0
186
+ $ export MISTRAL_API_KEY=_your_key_
187
+
188
+ Python code:
189
+
190
+ .. code:: py
191
+
192
+ from mistralai import Mistral
193
+ from datachain import File, DataChain, Column
194
+
195
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
196
+
197
+ def eval_dialogue(file: File) -> bool:
198
+ client = Mistral()
199
+ response = client.chat.complete(
200
+ model="open-mixtral-8x22b",
201
+ messages=[{"role": "system", "content": PROMPT},
202
+ {"role": "user", "content": file.read()}])
203
+ result = response.choices[0].message.content
204
+ return result.lower().startswith("success")
205
+
206
+ chain = (
207
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
208
+ .settings(parallel=4, cache=True)
209
+ .map(is_success=eval_dialogue)
210
+ .save("mistral_files")
211
+ )
212
+
213
+ successful_chain = chain.filter(Column("is_success") == True)
214
+ successful_chain.export_files("./output_mistral")
215
+
216
+ print(f"{successful_chain.count()} files were exported")
217
+
218
+
219
+
220
+ With the instruction above, the Mistral model judges 31 of the 50 files to contain successful dialogues:
221
+
222
+ .. code:: shell
223
+
224
+ $ ls output_mistral/datachain-demo/chatbot-KiT/
225
+ 1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
226
+ $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
227
+ 31
228
+
229
+
148
230
  Key Features
149
231
  ============
150
232
 
@@ -42,6 +42,88 @@ Getting Started
42
42
  Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
43
43
  to get started with `DataChain` and learn more.
44
44
 
45
+ .. code:: bash
46
+
47
+ pip install datachain
48
+
49
+
50
+ Example: download subset of files based on metadata
51
+ ---------------------------------------------------
52
+
53
+ Sometimes users only need to download a specific subset of files from cloud storage,
54
+ rather than the entire dataset.
55
+ For example, you could use a JSON file's metadata to download just cat images with
56
+ high confidence scores.
57
+
58
+
59
+ .. code:: py
60
+
61
+ from datachain import Column, DataChain
62
+
63
+ meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
64
+ images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
65
+
66
+ images_id = images.map(id=lambda file: file.path.split('.')[-2])
67
+ annotated = images_id.merge(meta, on="id", right_on="meta.id")
68
+
69
+ likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
70
+ & (Column("meta.inference.class_") == "cat"))
71
+ likely_cats.export_files("high-confidence-cats/", signal="file")
72
+
73
+
74
+ Example: LLM-based text-file evaluation
75
+ ---------------------------------------
76
+
77
+ In this example, we evaluate chatbot conversations stored in text files
78
+ using LLM-based evaluation.
79
+
80
+ .. code:: shell
81
+
82
+ $ pip install mistralai # Requires version >=1.0.0
83
+ $ export MISTRAL_API_KEY=_your_key_
84
+
85
+ Python code:
86
+
87
+ .. code:: py
88
+
89
+ from mistralai import Mistral
90
+ from datachain import File, DataChain, Column
91
+
92
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
93
+
94
+ def eval_dialogue(file: File) -> bool:
95
+ client = Mistral()
96
+ response = client.chat.complete(
97
+ model="open-mixtral-8x22b",
98
+ messages=[{"role": "system", "content": PROMPT},
99
+ {"role": "user", "content": file.read()}])
100
+ result = response.choices[0].message.content
101
+ return result.lower().startswith("success")
102
+
103
+ chain = (
104
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
105
+ .settings(parallel=4, cache=True)
106
+ .map(is_success=eval_dialogue)
107
+ .save("mistral_files")
108
+ )
109
+
110
+ successful_chain = chain.filter(Column("is_success") == True)
111
+ successful_chain.export_files("./output_mistral")
112
+
113
+ print(f"{successful_chain.count()} files were exported")
114
+
115
+
116
+
117
+ With the instruction above, the Mistral model judges 31 of the 50 files to contain successful dialogues:
118
+
119
+ .. code:: shell
120
+
121
+ $ ls output_mistral/datachain-demo/chatbot-KiT/
122
+ 1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
123
+ $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
124
+ 31
125
+
126
+
45
127
  Key Features
46
128
  ============
47
129
 
@@ -39,8 +39,8 @@ using JSON metadata:
39
39
  ``` py
40
40
  from datachain import Column, DataChain
41
41
 
42
- meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
43
- images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")
42
+ meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
43
+ images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
44
44
 
45
45
  images_id = images.map(id=lambda file: file.path.split('.')[-2])
46
46
  annotated = images_id.merge(meta, on="id", right_on="meta.id")
@@ -59,6 +59,8 @@ Batch inference with a simple sentiment model using the
59
59
  pip install transformers
60
60
  ```
61
61
 
62
+ Note: `transformers` works only if `torch`, `tensorflow` >= 2.0, or `flax` is installed.
63
+
62
64
  The code below downloads files from the cloud, and applies a
63
65
  user-defined function to each one of them. All files with a positive
64
66
  sentiment detected are then copied to the local directory.
@@ -76,7 +78,7 @@ def is_positive_dialogue_ending(file) -> bool:
76
78
 
77
79
  chain = (
78
80
  DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
79
- object_name="file", type="text")
81
+ object_name="file", type="text", anon=True)
80
82
  .settings(parallel=8, cache=True)
81
83
  .map(is_positive=is_positive_dialogue_ending)
82
84
  .save("file_response")
@@ -114,13 +116,14 @@ DataChain can parallelize API calls; the free Mistral tier supports up
114
116
  to 4 requests at the same time.
115
117
 
116
118
  ``` py
119
+ import os
117
120
  from mistralai import Mistral
118
121
  from datachain import File, DataChain, Column
119
122
 
120
123
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
121
124
 
122
125
  def eval_dialogue(file: File) -> bool:
123
- client = Mistral()
126
+ client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
124
127
  response = client.chat.complete(
125
128
  model="open-mixtral-8x22b",
126
129
  messages=[{"role": "system", "content": PROMPT},
@@ -129,8 +132,7 @@ def eval_dialogue(file: File) -> bool:
129
132
  return result.lower().startswith("success")
130
133
 
131
134
  chain = (
132
- DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
133
- .settings(parallel=4, cache=True)
135
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
134
136
  .map(is_success=eval_dialogue)
135
137
  .save("mistral_files")
136
138
  )
@@ -175,7 +177,7 @@ def eval_dialog(file: File) -> ChatCompletionResponse:
175
177
  {"role": "user", "content": file.read()}])
176
178
 
177
179
  chain = (
178
- DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
180
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
179
181
  .settings(parallel=4, cache=True)
180
182
  .map(response=eval_dialog)
181
183
  .map(status=lambda response: response.choices[0].message.content.lower()[:7])
@@ -271,7 +273,7 @@ from datachain import C, DataChain
271
273
  processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
272
274
 
273
275
  chain = (
274
- DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
276
+ DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
275
277
  .map(label=lambda name: name.split(".")[0], params=["file.name"])
276
278
  .select("file", "label").to_pytorch(
277
279
  transform=processor.image_processor,
@@ -96,7 +96,7 @@ tests = [
96
96
  ]
97
97
  dev = [
98
98
  "datachain[docs,tests]",
99
- "mypy==1.13.0",
99
+ "mypy==1.14.0",
100
100
  "types-python-dateutil",
101
101
  "types-pytz",
102
102
  "types-PyYAML",
@@ -112,7 +112,7 @@ examples = [
112
112
  "pdfplumber==0.11.4",
113
113
  "huggingface_hub[hf_transfer]",
114
114
  "onnx==1.16.1",
115
- "ultralytics==8.3.50"
115
+ "ultralytics==8.3.53"
116
116
  ]
117
117
 
118
118
  [project.urls]
@@ -52,6 +52,7 @@ from datachain.error import (
52
52
  QueryScriptCancelError,
53
53
  QueryScriptRunError,
54
54
  )
55
+ from datachain.lib.listing import get_listing
55
56
  from datachain.node import DirType, Node, NodeWithPath
56
57
  from datachain.nodes_thread_pool import NodesThreadPool
57
58
  from datachain.remote.studio import StudioClient
@@ -599,7 +600,7 @@ class Catalog:
599
600
  source, session=self.session, update=update, object_name=object_name
600
601
  )
601
602
 
602
- list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(
603
+ list_ds_name, list_uri, list_path, _ = get_listing(
603
604
  source, self.session, update=update
604
605
  )
605
606
 
@@ -697,11 +698,9 @@ class Catalog:
697
698
  )
698
699
  indexed_sources = []
699
700
  for source in dataset_sources:
700
- from datachain.lib.dc import DataChain
701
-
702
701
  client = self.get_client(source, **client_config)
703
702
  uri = client.uri
704
- dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
703
+ dataset_name, _, _, _ = get_listing(uri, self.session)
705
704
  listing = Listing(
706
705
  self.metastore.clone(),
707
706
  self.warehouse.clone(),
@@ -32,6 +32,16 @@ class GCSClient(Client):
32
32
 
33
33
  return cast(GCSFileSystem, super().create_fs(**kwargs))
34
34
 
35
+ def url(self, path: str, expires: int = 3600, **kwargs) -> str:
36
+ """
37
+ Generate a signed URL for the given path.
38
+ If the client is anonymous, a public URL is returned instead
39
+ (see https://cloud.google.com/storage/docs/access-public-data#api-link).
40
+ """
41
+ if self.fs.storage_options.get("token") == "anon":
42
+ return f"https://storage.googleapis.com/{self.name}/{path}"
43
+ return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
44
+
35
45
  @staticmethod
36
46
  def parse_timestamp(timestamp: str) -> datetime:
37
47
  """
@@ -216,7 +216,6 @@ class AbstractWarehouse(ABC, Serializable):
216
216
  limit = query._limit
217
217
  paginated_query = query.limit(page_size)
218
218
 
219
- results = None
220
219
  offset = 0
221
220
  num_yielded = 0
222
221
 
@@ -1,9 +1,11 @@
1
1
  from collections.abc import Sequence
2
- from tempfile import NamedTemporaryFile
2
+ from itertools import islice
3
3
  from typing import TYPE_CHECKING, Any, Optional
4
4
 
5
+ import fsspec.implementations.reference
5
6
  import orjson
6
7
  import pyarrow as pa
8
+ from fsspec.core import split_protocol
7
9
  from pyarrow.dataset import CsvFileFormat, dataset
8
10
  from tqdm import tqdm
9
11
 
@@ -25,7 +27,18 @@ if TYPE_CHECKING:
25
27
  DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"
26
28
 
27
29
 
30
+ class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
31
+ def _open(self, path, mode="rb", *args, **kwargs):
32
+ # overriding because `fsspec`'s `ReferenceFileSystem._open`
33
+ # reads the whole file in-memory.
34
+ (uri,) = self.references[path]
35
+ protocol, _ = split_protocol(uri)
36
+ return self.fss[protocol]._open(uri, mode, *args, **kwargs)
37
+
38
+
28
39
  class ArrowGenerator(Generator):
40
+ DEFAULT_BATCH_SIZE = 2**17 # same as `pyarrow._dataset._DEFAULT_BATCH_SIZE`
41
+
29
42
  def __init__(
30
43
  self,
31
44
  input_schema: Optional["pa.Schema"] = None,
@@ -55,57 +68,80 @@ class ArrowGenerator(Generator):
55
68
  def process(self, file: File):
56
69
  if file._caching_enabled:
57
70
  file.ensure_cached()
58
- path = file.get_local_path()
59
- ds = dataset(path, schema=self.input_schema, **self.kwargs)
60
- elif self.nrows:
61
- path = _nrows_file(file, self.nrows)
62
- ds = dataset(path, schema=self.input_schema, **self.kwargs)
71
+ cache_path = file.get_local_path()
72
+ fs_path = file.path
73
+ fs = ReferenceFileSystem({fs_path: [cache_path]})
63
74
  else:
64
- path = file.get_path()
65
- ds = dataset(
66
- path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
67
- )
75
+ fs, fs_path = file.get_fs(), file.get_path()
76
+
77
+ ds = dataset(fs_path, schema=self.input_schema, filesystem=fs, **self.kwargs)
78
+
68
79
  hf_schema = _get_hf_schema(ds.schema)
69
80
  use_datachain_schema = (
70
81
  bool(ds.schema.metadata)
71
82
  and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in ds.schema.metadata
72
83
  )
73
- index = 0
74
- with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
75
- for record_batch in ds.to_batches():
76
- for record in record_batch.to_pylist():
77
- if use_datachain_schema and self.output_schema:
78
- vals = [_nested_model_instantiate(record, self.output_schema)]
79
- else:
80
- vals = list(record.values())
81
- if self.output_schema:
82
- fields = self.output_schema.model_fields
83
- vals_dict = {}
84
- for i, ((field, field_info), val) in enumerate(
85
- zip(fields.items(), vals)
86
- ):
87
- anno = field_info.annotation
88
- if hf_schema:
89
- from datachain.lib.hf import convert_feature
90
-
91
- feat = list(hf_schema[0].values())[i]
92
- vals_dict[field] = convert_feature(val, feat, anno)
93
- elif ModelStore.is_pydantic(anno):
94
- vals_dict[field] = anno(**val) # type: ignore[misc]
95
- else:
96
- vals_dict[field] = val
97
- vals = [self.output_schema(**vals_dict)]
98
- if self.source:
99
- kwargs: dict = self.kwargs
100
- # Can't serialize CsvFileFormat; may lose formatting options.
101
- if isinstance(kwargs.get("format"), CsvFileFormat):
102
- kwargs["format"] = "csv"
103
- arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
104
- yield [arrow_file, *vals]
105
- else:
106
- yield vals
107
- index += 1
108
- pbar.update(len(record_batch))
84
+
85
+ kw = {}
86
+ if self.nrows:
87
+ kw = {"batch_size": min(self.DEFAULT_BATCH_SIZE, self.nrows)}
88
+
89
+ def iter_records():
90
+ for record_batch in ds.to_batches(**kw):
91
+ yield from record_batch.to_pylist()
92
+
93
+ it = islice(iter_records(), self.nrows)
94
+ with tqdm(it, desc="Parsed by pyarrow", unit="rows", total=self.nrows) as pbar:
95
+ for index, record in enumerate(pbar):
96
+ yield self._process_record(
97
+ record, file, index, hf_schema, use_datachain_schema
98
+ )
99
+
100
+ def _process_record(
101
+ self,
102
+ record: dict[str, Any],
103
+ file: File,
104
+ index: int,
105
+ hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
106
+ use_datachain_schema: bool,
107
+ ):
108
+ if use_datachain_schema and self.output_schema:
109
+ vals = [_nested_model_instantiate(record, self.output_schema)]
110
+ else:
111
+ vals = self._process_non_datachain_record(record, hf_schema)
112
+
113
+ if self.source:
114
+ kwargs: dict = self.kwargs
115
+ # Can't serialize CsvFileFormat; may lose formatting options.
116
+ if isinstance(kwargs.get("format"), CsvFileFormat):
117
+ kwargs["format"] = "csv"
118
+ arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
119
+ return [arrow_file, *vals]
120
+ return vals
121
+
122
+ def _process_non_datachain_record(
123
+ self,
124
+ record: dict[str, Any],
125
+ hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
126
+ ):
127
+ vals = list(record.values())
128
+ if not self.output_schema:
129
+ return vals
130
+
131
+ fields = self.output_schema.model_fields
132
+ vals_dict = {}
133
+ for i, ((field, field_info), val) in enumerate(zip(fields.items(), vals)):
134
+ anno = field_info.annotation
135
+ if hf_schema:
136
+ from datachain.lib.hf import convert_feature
137
+
138
+ feat = list(hf_schema[0].values())[i]
139
+ vals_dict[field] = convert_feature(val, feat, anno)
140
+ elif ModelStore.is_pydantic(anno):
141
+ vals_dict[field] = anno(**val) # type: ignore[misc]
142
+ else:
143
+ vals_dict[field] = val
144
+ return [self.output_schema(**vals_dict)]
109
145
 
110
146
 
111
147
  def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
@@ -190,18 +226,6 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
190
226
  raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
191
227
 
192
228
 
193
- def _nrows_file(file: File, nrows: int) -> str:
194
- tf = NamedTemporaryFile(delete=False) # noqa: SIM115
195
- with file.open(mode="r") as reader:
196
- with open(tf.name, "a") as writer:
197
- for row, line in enumerate(reader):
198
- if row >= nrows:
199
- break
200
- writer.write(line)
201
- writer.write("\n")
202
- return tf.name
203
-
204
-
205
229
  def _get_hf_schema(
206
230
  schema: "pa.Schema",
207
231
  ) -> Optional[tuple["Features", dict[str, "DataType"]]]: