datachain 0.8.0__tar.gz → 0.8.1__tar.gz

This diff shows the changes between two publicly released versions of a package, as they appear in the public registry they were published to. It is provided for informational purposes only.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (290)
  1. {datachain-0.8.0 → datachain-0.8.1}/.github/workflows/benchmarks.yml +1 -1
  2. {datachain-0.8.0 → datachain-0.8.1}/.github/workflows/release.yml +1 -1
  3. {datachain-0.8.0 → datachain-0.8.1}/.github/workflows/tests-studio.yml +1 -1
  4. {datachain-0.8.0 → datachain-0.8.1}/.github/workflows/tests.yml +3 -3
  5. {datachain-0.8.0 → datachain-0.8.1}/.pre-commit-config.yaml +1 -1
  6. {datachain-0.8.0/src/datachain.egg-info → datachain-0.8.1}/PKG-INFO +3 -3
  7. {datachain-0.8.0 → datachain-0.8.1}/docs/quick-start.md +4 -2
  8. {datachain-0.8.0 → datachain-0.8.1}/pyproject.toml +2 -2
  9. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/catalog/catalog.py +3 -4
  10. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/gcs.py +9 -0
  11. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/warehouse.py +0 -1
  12. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/arrow.py +82 -58
  13. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/dc.py +12 -57
  14. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/file.py +3 -1
  15. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/listing.py +44 -0
  16. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/udf.py +0 -1
  17. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/batch.py +32 -6
  18. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/dataset.py +17 -17
  19. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/dispatch.py +125 -125
  20. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/session.py +8 -5
  21. datachain-0.8.1/src/datachain/query/udf.py +20 -0
  22. datachain-0.8.1/src/datachain/query/utils.py +42 -0
  23. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/utils.py +1 -1
  24. {datachain-0.8.0 → datachain-0.8.1/src/datachain.egg-info}/PKG-INFO +3 -3
  25. {datachain-0.8.0 → datachain-0.8.1}/src/datachain.egg-info/SOURCES.txt +4 -0
  26. {datachain-0.8.0 → datachain-0.8.1}/src/datachain.egg-info/requires.txt +2 -2
  27. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_catalog.py +6 -2
  28. datachain-0.8.1/tests/func/test_session.py +25 -0
  29. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_arrow.py +26 -0
  30. datachain-0.8.1/tests/unit/test_client_gcs.py +17 -0
  31. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_listing.py +29 -2
  32. {datachain-0.8.0 → datachain-0.8.1}/.cruft.json +0 -0
  33. {datachain-0.8.0 → datachain-0.8.1}/.gitattributes +0 -0
  34. {datachain-0.8.0 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  35. {datachain-0.8.0 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  36. {datachain-0.8.0 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  37. {datachain-0.8.0 → datachain-0.8.1}/.github/codecov.yaml +0 -0
  38. {datachain-0.8.0 → datachain-0.8.1}/.github/dependabot.yml +0 -0
  39. {datachain-0.8.0 → datachain-0.8.1}/.github/workflows/update-template.yaml +0 -0
  40. {datachain-0.8.0 → datachain-0.8.1}/.gitignore +0 -0
  41. {datachain-0.8.0 → datachain-0.8.1}/CODE_OF_CONDUCT.rst +0 -0
  42. {datachain-0.8.0 → datachain-0.8.1}/LICENSE +0 -0
  43. {datachain-0.8.0 → datachain-0.8.1}/README.rst +0 -0
  44. {datachain-0.8.0 → datachain-0.8.1}/docs/assets/captioned_cartoons.png +0 -0
  45. {datachain-0.8.0 → datachain-0.8.1}/docs/assets/datachain-white.svg +0 -0
  46. {datachain-0.8.0 → datachain-0.8.1}/docs/assets/datachain.svg +0 -0
  47. {datachain-0.8.0 → datachain-0.8.1}/docs/contributing.md +0 -0
  48. {datachain-0.8.0 → datachain-0.8.1}/docs/css/github-permalink-style.css +0 -0
  49. {datachain-0.8.0 → datachain-0.8.1}/docs/examples.md +0 -0
  50. {datachain-0.8.0 → datachain-0.8.1}/docs/index.md +0 -0
  51. {datachain-0.8.0 → datachain-0.8.1}/docs/overrides/main.html +0 -0
  52. {datachain-0.8.0 → datachain-0.8.1}/docs/references/datachain.md +0 -0
  53. {datachain-0.8.0 → datachain-0.8.1}/docs/references/datatype.md +0 -0
  54. {datachain-0.8.0 → datachain-0.8.1}/docs/references/file.md +0 -0
  55. {datachain-0.8.0 → datachain-0.8.1}/docs/references/index.md +0 -0
  56. {datachain-0.8.0 → datachain-0.8.1}/docs/references/sql.md +0 -0
  57. {datachain-0.8.0 → datachain-0.8.1}/docs/references/torch.md +0 -0
  58. {datachain-0.8.0 → datachain-0.8.1}/docs/references/udf.md +0 -0
  59. {datachain-0.8.0 → datachain-0.8.1}/docs/tutorials.md +0 -0
  60. {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  61. {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  62. {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/openimage-detect.py +0 -0
  63. {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
  64. {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/ultralytics-pose.py +0 -0
  65. {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/ultralytics-segment.py +0 -0
  66. {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/common_sql_functions.py +0 -0
  67. {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/json-csv-reader.py +0 -0
  68. {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/torch-loader.py +0 -0
  69. {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/udfs/parallel.py +0 -0
  70. {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/udfs/simple.py +0 -0
  71. {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/udfs/stateful.py +0 -0
  72. {datachain-0.8.0 → datachain-0.8.1}/examples/llm_and_nlp/claude-query.py +0 -0
  73. {datachain-0.8.0 → datachain-0.8.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  74. {datachain-0.8.0 → datachain-0.8.1}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  75. {datachain-0.8.0 → datachain-0.8.1}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  76. {datachain-0.8.0 → datachain-0.8.1}/examples/multimodal/clip_inference.py +0 -0
  77. {datachain-0.8.0 → datachain-0.8.1}/examples/multimodal/hf_pipeline.py +0 -0
  78. {datachain-0.8.0 → datachain-0.8.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  79. {datachain-0.8.0 → datachain-0.8.1}/examples/multimodal/wds.py +0 -0
  80. {datachain-0.8.0 → datachain-0.8.1}/examples/multimodal/wds_filtered.py +0 -0
  81. {datachain-0.8.0 → datachain-0.8.1}/mkdocs.yml +0 -0
  82. {datachain-0.8.0 → datachain-0.8.1}/noxfile.py +0 -0
  83. {datachain-0.8.0 → datachain-0.8.1}/setup.cfg +0 -0
  84. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/__init__.py +0 -0
  85. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/__main__.py +0 -0
  86. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/asyn.py +0 -0
  87. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/cache.py +0 -0
  88. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/catalog/__init__.py +0 -0
  89. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/catalog/datasource.py +0 -0
  90. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/catalog/loader.py +0 -0
  91. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/cli.py +0 -0
  92. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/cli_utils.py +0 -0
  93. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/__init__.py +0 -0
  94. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/azure.py +0 -0
  95. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/fileslice.py +0 -0
  96. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/fsspec.py +0 -0
  97. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/hf.py +0 -0
  98. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/local.py +0 -0
  99. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/s3.py +0 -0
  100. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/config.py +0 -0
  101. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/__init__.py +0 -0
  102. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/db_engine.py +0 -0
  103. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/job.py +0 -0
  104. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/metastore.py +0 -0
  105. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/schema.py +0 -0
  106. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/serializer.py +0 -0
  107. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/sqlite.py +0 -0
  108. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/dataset.py +0 -0
  109. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/error.py +0 -0
  110. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/__init__.py +0 -0
  111. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/aggregate.py +0 -0
  112. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/array.py +0 -0
  113. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/base.py +0 -0
  114. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/conditional.py +0 -0
  115. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/func.py +0 -0
  116. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/numeric.py +0 -0
  117. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/path.py +0 -0
  118. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/random.py +0 -0
  119. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/string.py +0 -0
  120. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/window.py +0 -0
  121. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/job.py +0 -0
  122. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/__init__.py +0 -0
  123. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/clip.py +0 -0
  124. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/__init__.py +0 -0
  125. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/flatten.py +0 -0
  126. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  127. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
  128. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/unflatten.py +0 -0
  129. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  130. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/data_model.py +0 -0
  131. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/dataset_info.py +0 -0
  132. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/diff.py +0 -0
  133. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/hf.py +0 -0
  134. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/image.py +0 -0
  135. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/listing_info.py +0 -0
  136. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/meta_formats.py +0 -0
  137. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/model_store.py +0 -0
  138. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/pytorch.py +0 -0
  139. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/settings.py +0 -0
  140. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/signal_schema.py +0 -0
  141. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/tar.py +0 -0
  142. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/text.py +0 -0
  143. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/udf_signature.py +0 -0
  144. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/utils.py +0 -0
  145. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/vfile.py +0 -0
  146. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/webdataset.py +0 -0
  147. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/webdataset_laion.py +0 -0
  148. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/listing.py +0 -0
  149. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/__init__.py +0 -0
  150. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/bbox.py +0 -0
  151. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/pose.py +0 -0
  152. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/segment.py +0 -0
  153. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/ultralytics/__init__.py +0 -0
  154. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/ultralytics/bbox.py +0 -0
  155. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/ultralytics/pose.py +0 -0
  156. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/ultralytics/segment.py +0 -0
  157. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/node.py +0 -0
  158. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/nodes_fetcher.py +0 -0
  159. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/nodes_thread_pool.py +0 -0
  160. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/progress.py +0 -0
  161. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/py.typed +0 -0
  162. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/__init__.py +0 -0
  163. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/metrics.py +0 -0
  164. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/params.py +0 -0
  165. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/queue.py +0 -0
  166. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/schema.py +0 -0
  167. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/remote/__init__.py +0 -0
  168. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/remote/studio.py +0 -0
  169. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/__init__.py +0 -0
  170. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/default/__init__.py +0 -0
  171. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/default/base.py +0 -0
  172. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/__init__.py +0 -0
  173. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/aggregate.py +0 -0
  174. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/array.py +0 -0
  175. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/conditional.py +0 -0
  176. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/numeric.py +0 -0
  177. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/path.py +0 -0
  178. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/random.py +0 -0
  179. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/string.py +0 -0
  180. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/selectable.py +0 -0
  181. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  182. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/sqlite/base.py +0 -0
  183. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/sqlite/types.py +0 -0
  184. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/sqlite/vector.py +0 -0
  185. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/types.py +0 -0
  186. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/utils.py +0 -0
  187. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/studio.py +0 -0
  188. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/telemetry.py +0 -0
  189. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/toolkit/__init__.py +0 -0
  190. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/toolkit/split.py +0 -0
  191. {datachain-0.8.0 → datachain-0.8.1}/src/datachain/torch/__init__.py +0 -0
  192. {datachain-0.8.0 → datachain-0.8.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  193. {datachain-0.8.0 → datachain-0.8.1}/src/datachain.egg-info/entry_points.txt +0 -0
  194. {datachain-0.8.0 → datachain-0.8.1}/src/datachain.egg-info/top_level.txt +0 -0
  195. {datachain-0.8.0 → datachain-0.8.1}/tests/__init__.py +0 -0
  196. {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/__init__.py +0 -0
  197. {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/conftest.py +0 -0
  198. {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  199. {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  200. {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/datasets/.gitignore +0 -0
  201. {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  202. {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/test_datachain.py +0 -0
  203. {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/test_ls.py +0 -0
  204. {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/test_version.py +0 -0
  205. {datachain-0.8.0 → datachain-0.8.1}/tests/conftest.py +0 -0
  206. {datachain-0.8.0 → datachain-0.8.1}/tests/data.py +0 -0
  207. {datachain-0.8.0 → datachain-0.8.1}/tests/examples/__init__.py +0 -0
  208. {datachain-0.8.0 → datachain-0.8.1}/tests/examples/test_examples.py +0 -0
  209. {datachain-0.8.0 → datachain-0.8.1}/tests/examples/test_wds_e2e.py +0 -0
  210. {datachain-0.8.0 → datachain-0.8.1}/tests/examples/wds_data.py +0 -0
  211. {datachain-0.8.0 → datachain-0.8.1}/tests/func/__init__.py +0 -0
  212. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_client.py +0 -0
  213. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_datachain.py +0 -0
  214. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_dataset_query.py +0 -0
  215. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_datasets.py +0 -0
  216. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_feature_pickling.py +0 -0
  217. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_listing.py +0 -0
  218. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_ls.py +0 -0
  219. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_meta_formats.py +0 -0
  220. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_metrics.py +0 -0
  221. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_pull.py +0 -0
  222. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_pytorch.py +0 -0
  223. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_query.py +0 -0
  224. {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_toolkit.py +0 -0
  225. {datachain-0.8.0 → datachain-0.8.1}/tests/scripts/feature_class.py +0 -0
  226. {datachain-0.8.0 → datachain-0.8.1}/tests/scripts/feature_class_exception.py +0 -0
  227. {datachain-0.8.0 → datachain-0.8.1}/tests/scripts/feature_class_parallel.py +0 -0
  228. {datachain-0.8.0 → datachain-0.8.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  229. {datachain-0.8.0 → datachain-0.8.1}/tests/scripts/name_len_slow.py +0 -0
  230. {datachain-0.8.0 → datachain-0.8.1}/tests/test_atomicity.py +0 -0
  231. {datachain-0.8.0 → datachain-0.8.1}/tests/test_cli_e2e.py +0 -0
  232. {datachain-0.8.0 → datachain-0.8.1}/tests/test_cli_studio.py +0 -0
  233. {datachain-0.8.0 → datachain-0.8.1}/tests/test_query_e2e.py +0 -0
  234. {datachain-0.8.0 → datachain-0.8.1}/tests/test_telemetry.py +0 -0
  235. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/__init__.py +0 -0
  236. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/__init__.py +0 -0
  237. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/conftest.py +0 -0
  238. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_clip.py +0 -0
  239. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_datachain.py +0 -0
  240. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  241. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  242. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_diff.py +0 -0
  243. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_feature.py +0 -0
  244. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_feature_utils.py +0 -0
  245. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_file.py +0 -0
  246. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_hf.py +0 -0
  247. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_image.py +0 -0
  248. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_listing_info.py +0 -0
  249. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_models.py +0 -0
  250. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_schema.py +0 -0
  251. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_signal_schema.py +0 -0
  252. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_sql_to_python.py +0 -0
  253. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_text.py +0 -0
  254. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_udf_signature.py +0 -0
  255. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_utils.py +0 -0
  256. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_webdataset.py +0 -0
  257. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/__init__.py +0 -0
  258. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  259. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/sqlite/test_types.py +0 -0
  260. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  261. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_array.py +0 -0
  262. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_conditional.py +0 -0
  263. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_path.py +0 -0
  264. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_random.py +0 -0
  265. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_selectable.py +0 -0
  266. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_string.py +0 -0
  267. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_asyn.py +0 -0
  268. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_cache.py +0 -0
  269. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_catalog.py +0 -0
  270. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_catalog_loader.py +0 -0
  271. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_cli_parsing.py +0 -0
  272. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_client.py +0 -0
  273. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_client_s3.py +0 -0
  274. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_config.py +0 -0
  275. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_data_storage.py +0 -0
  276. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_database_engine.py +0 -0
  277. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_dataset.py +0 -0
  278. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_dispatch.py +0 -0
  279. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_fileslice.py +0 -0
  280. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_func.py +0 -0
  281. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_metastore.py +0 -0
  282. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_module_exports.py +0 -0
  283. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_query.py +0 -0
  284. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_query_metrics.py +0 -0
  285. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_query_params.py +0 -0
  286. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_serializer.py +0 -0
  287. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_session.py +0 -0
  288. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_utils.py +0 -0
  289. {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_warehouse.py +0 -0
  290. {datachain-0.8.0 → datachain-0.8.1}/tests/utils.py +0 -0
@@ -25,7 +25,7 @@ jobs:
25
25
  python-version: '3.12'
26
26
 
27
27
  - name: Setup uv
28
- uses: astral-sh/setup-uv@v4
28
+ uses: astral-sh/setup-uv@v5
29
29
  with:
30
30
  enable-cache: true
31
31
  cache-suffix: benchmarks
@@ -27,7 +27,7 @@ jobs:
27
27
  python-version: '3.12'
28
28
 
29
29
  - name: Setup uv
30
- uses: astral-sh/setup-uv@v4
30
+ uses: astral-sh/setup-uv@v5
31
31
 
32
32
  - name: Install nox
33
33
  run: uv pip install nox --system
@@ -81,7 +81,7 @@ jobs:
81
81
  python-version: ${{ matrix.pyv }}
82
82
 
83
83
  - name: Setup uv
84
- uses: astral-sh/setup-uv@v4
84
+ uses: astral-sh/setup-uv@v5
85
85
  with:
86
86
  enable-cache: true
87
87
  cache-suffix: studio
@@ -37,7 +37,7 @@ jobs:
37
37
  python-version: '3.9'
38
38
 
39
39
  - name: Setup uv
40
- uses: astral-sh/setup-uv@v4
40
+ uses: astral-sh/setup-uv@v5
41
41
  with:
42
42
  enable-cache: true
43
43
  cache-suffix: lint
@@ -94,7 +94,7 @@ jobs:
94
94
  python-version: ${{ matrix.pyv }}
95
95
 
96
96
  - name: Setup uv
97
- uses: astral-sh/setup-uv@v4
97
+ uses: astral-sh/setup-uv@v5
98
98
  with:
99
99
  enable-cache: true
100
100
  cache-suffix: tests-${{ matrix.pyv }}
@@ -157,7 +157,7 @@ jobs:
157
157
  python-version: ${{ matrix.pyv }}
158
158
 
159
159
  - name: Setup uv
160
- uses: astral-sh/setup-uv@v4
160
+ uses: astral-sh/setup-uv@v5
161
161
  with:
162
162
  enable-cache: true
163
163
  cache-suffix: examples-${{ matrix.pyv }}
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.8.3'
27
+ rev: 'v0.8.4'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.8.0
3
+ Version: 0.8.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -84,7 +84,7 @@ Requires-Dist: requests-mock; extra == "tests"
84
84
  Requires-Dist: scipy; extra == "tests"
85
85
  Provides-Extra: dev
86
86
  Requires-Dist: datachain[docs,tests]; extra == "dev"
87
- Requires-Dist: mypy==1.13.0; extra == "dev"
87
+ Requires-Dist: mypy==1.14.0; extra == "dev"
88
88
  Requires-Dist: types-python-dateutil; extra == "dev"
89
89
  Requires-Dist: types-pytz; extra == "dev"
90
90
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -99,7 +99,7 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
99
99
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
100
100
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
101
101
  Requires-Dist: onnx==1.16.1; extra == "examples"
102
- Requires-Dist: ultralytics==8.3.50; extra == "examples"
102
+ Requires-Dist: ultralytics==8.3.53; extra == "examples"
103
103
 
104
104
  ================
105
105
  |logo| DataChain
@@ -59,6 +59,8 @@ Batch inference with a simple sentiment model using the
59
59
  pip install transformers
60
60
  ```
61
61
 
62
+ Note, `transformers` works only if `torch`, `tensorflow` >= 2.0, or `flax` are installed.
63
+
62
64
  The code below downloads files from the cloud, and applies a
63
65
  user-defined function to each one of them. All files with a positive
64
66
  sentiment detected are then copied to the local directory.
@@ -114,13 +116,14 @@ DataChain can parallelize API calls; the free Mistral tier supports up
114
116
  to 4 requests at the same time.
115
117
 
116
118
  ``` py
119
+ import os
117
120
  from mistralai import Mistral
118
121
  from datachain import File, DataChain, Column
119
122
 
120
123
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
121
124
 
122
125
  def eval_dialogue(file: File) -> bool:
123
- client = Mistral()
126
+ client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
124
127
  response = client.chat.complete(
125
128
  model="open-mixtral-8x22b",
126
129
  messages=[{"role": "system", "content": PROMPT},
@@ -130,7 +133,6 @@ def eval_dialogue(file: File) -> bool:
130
133
 
131
134
  chain = (
132
135
  DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
133
- .settings(parallel=4, cache=True)
134
136
  .map(is_success=eval_dialogue)
135
137
  .save("mistral_files")
136
138
  )
@@ -96,7 +96,7 @@ tests = [
96
96
  ]
97
97
  dev = [
98
98
  "datachain[docs,tests]",
99
- "mypy==1.13.0",
99
+ "mypy==1.14.0",
100
100
  "types-python-dateutil",
101
101
  "types-pytz",
102
102
  "types-PyYAML",
@@ -112,7 +112,7 @@ examples = [
112
112
  "pdfplumber==0.11.4",
113
113
  "huggingface_hub[hf_transfer]",
114
114
  "onnx==1.16.1",
115
- "ultralytics==8.3.50"
115
+ "ultralytics==8.3.53"
116
116
  ]
117
117
 
118
118
  [project.urls]
@@ -52,6 +52,7 @@ from datachain.error import (
52
52
  QueryScriptCancelError,
53
53
  QueryScriptRunError,
54
54
  )
55
+ from datachain.lib.listing import get_listing
55
56
  from datachain.node import DirType, Node, NodeWithPath
56
57
  from datachain.nodes_thread_pool import NodesThreadPool
57
58
  from datachain.remote.studio import StudioClient
@@ -599,7 +600,7 @@ class Catalog:
599
600
  source, session=self.session, update=update, object_name=object_name
600
601
  )
601
602
 
602
- list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(
603
+ list_ds_name, list_uri, list_path, _ = get_listing(
603
604
  source, self.session, update=update
604
605
  )
605
606
 
@@ -697,11 +698,9 @@ class Catalog:
697
698
  )
698
699
  indexed_sources = []
699
700
  for source in dataset_sources:
700
- from datachain.lib.dc import DataChain
701
-
702
701
  client = self.get_client(source, **client_config)
703
702
  uri = client.uri
704
- dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
703
+ dataset_name, _, _, _ = get_listing(uri, self.session)
705
704
  listing = Listing(
706
705
  self.metastore.clone(),
707
706
  self.warehouse.clone(),
@@ -32,6 +32,15 @@ class GCSClient(Client):
32
32
 
33
33
  return cast(GCSFileSystem, super().create_fs(**kwargs))
34
34
 
35
+ def url(self, path: str, expires: int = 3600, **kwargs) -> str:
36
+ try:
37
+ return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
38
+ except AttributeError as exc:
39
+ is_anon = self.fs.storage_options.get("token") == "anon"
40
+ if is_anon and "you need a private key to sign credentials" in str(exc):
41
+ return f"https://storage.googleapis.com/{self.name}/{path}"
42
+ raise
43
+
35
44
  @staticmethod
36
45
  def parse_timestamp(timestamp: str) -> datetime:
37
46
  """
@@ -216,7 +216,6 @@ class AbstractWarehouse(ABC, Serializable):
216
216
  limit = query._limit
217
217
  paginated_query = query.limit(page_size)
218
218
 
219
- results = None
220
219
  offset = 0
221
220
  num_yielded = 0
222
221
 
@@ -1,9 +1,11 @@
1
1
  from collections.abc import Sequence
2
- from tempfile import NamedTemporaryFile
2
+ from itertools import islice
3
3
  from typing import TYPE_CHECKING, Any, Optional
4
4
 
5
+ import fsspec.implementations.reference
5
6
  import orjson
6
7
  import pyarrow as pa
8
+ from fsspec.core import split_protocol
7
9
  from pyarrow.dataset import CsvFileFormat, dataset
8
10
  from tqdm import tqdm
9
11
 
@@ -25,7 +27,18 @@ if TYPE_CHECKING:
25
27
  DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"
26
28
 
27
29
 
30
+ class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
31
+ def _open(self, path, mode="rb", *args, **kwargs):
32
+ # overriding because `fsspec`'s `ReferenceFileSystem._open`
33
+ # reads the whole file in-memory.
34
+ (uri,) = self.references[path]
35
+ protocol, _ = split_protocol(uri)
36
+ return self.fss[protocol]._open(uri, mode, *args, **kwargs)
37
+
38
+
28
39
  class ArrowGenerator(Generator):
40
+ DEFAULT_BATCH_SIZE = 2**17 # same as `pyarrow._dataset._DEFAULT_BATCH_SIZE`
41
+
29
42
  def __init__(
30
43
  self,
31
44
  input_schema: Optional["pa.Schema"] = None,
@@ -55,57 +68,80 @@ class ArrowGenerator(Generator):
55
68
  def process(self, file: File):
56
69
  if file._caching_enabled:
57
70
  file.ensure_cached()
58
- path = file.get_local_path()
59
- ds = dataset(path, schema=self.input_schema, **self.kwargs)
60
- elif self.nrows:
61
- path = _nrows_file(file, self.nrows)
62
- ds = dataset(path, schema=self.input_schema, **self.kwargs)
71
+ cache_path = file.get_local_path()
72
+ fs_path = file.path
73
+ fs = ReferenceFileSystem({fs_path: [cache_path]})
63
74
  else:
64
- path = file.get_path()
65
- ds = dataset(
66
- path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
67
- )
75
+ fs, fs_path = file.get_fs(), file.get_path()
76
+
77
+ ds = dataset(fs_path, schema=self.input_schema, filesystem=fs, **self.kwargs)
78
+
68
79
  hf_schema = _get_hf_schema(ds.schema)
69
80
  use_datachain_schema = (
70
81
  bool(ds.schema.metadata)
71
82
  and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in ds.schema.metadata
72
83
  )
73
- index = 0
74
- with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
75
- for record_batch in ds.to_batches():
76
- for record in record_batch.to_pylist():
77
- if use_datachain_schema and self.output_schema:
78
- vals = [_nested_model_instantiate(record, self.output_schema)]
79
- else:
80
- vals = list(record.values())
81
- if self.output_schema:
82
- fields = self.output_schema.model_fields
83
- vals_dict = {}
84
- for i, ((field, field_info), val) in enumerate(
85
- zip(fields.items(), vals)
86
- ):
87
- anno = field_info.annotation
88
- if hf_schema:
89
- from datachain.lib.hf import convert_feature
90
-
91
- feat = list(hf_schema[0].values())[i]
92
- vals_dict[field] = convert_feature(val, feat, anno)
93
- elif ModelStore.is_pydantic(anno):
94
- vals_dict[field] = anno(**val) # type: ignore[misc]
95
- else:
96
- vals_dict[field] = val
97
- vals = [self.output_schema(**vals_dict)]
98
- if self.source:
99
- kwargs: dict = self.kwargs
100
- # Can't serialize CsvFileFormat; may lose formatting options.
101
- if isinstance(kwargs.get("format"), CsvFileFormat):
102
- kwargs["format"] = "csv"
103
- arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
104
- yield [arrow_file, *vals]
105
- else:
106
- yield vals
107
- index += 1
108
- pbar.update(len(record_batch))
84
+
85
+ kw = {}
86
+ if self.nrows:
87
+ kw = {"batch_size": min(self.DEFAULT_BATCH_SIZE, self.nrows)}
88
+
89
+ def iter_records():
90
+ for record_batch in ds.to_batches(**kw):
91
+ yield from record_batch.to_pylist()
92
+
93
+ it = islice(iter_records(), self.nrows)
94
+ with tqdm(it, desc="Parsed by pyarrow", unit="rows", total=self.nrows) as pbar:
95
+ for index, record in enumerate(pbar):
96
+ yield self._process_record(
97
+ record, file, index, hf_schema, use_datachain_schema
98
+ )
99
+
100
+ def _process_record(
101
+ self,
102
+ record: dict[str, Any],
103
+ file: File,
104
+ index: int,
105
+ hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
106
+ use_datachain_schema: bool,
107
+ ):
108
+ if use_datachain_schema and self.output_schema:
109
+ vals = [_nested_model_instantiate(record, self.output_schema)]
110
+ else:
111
+ vals = self._process_non_datachain_record(record, hf_schema)
112
+
113
+ if self.source:
114
+ kwargs: dict = self.kwargs
115
+ # Can't serialize CsvFileFormat; may lose formatting options.
116
+ if isinstance(kwargs.get("format"), CsvFileFormat):
117
+ kwargs["format"] = "csv"
118
+ arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
119
+ return [arrow_file, *vals]
120
+ return vals
121
+
122
+ def _process_non_datachain_record(
123
+ self,
124
+ record: dict[str, Any],
125
+ hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
126
+ ):
127
+ vals = list(record.values())
128
+ if not self.output_schema:
129
+ return vals
130
+
131
+ fields = self.output_schema.model_fields
132
+ vals_dict = {}
133
+ for i, ((field, field_info), val) in enumerate(zip(fields.items(), vals)):
134
+ anno = field_info.annotation
135
+ if hf_schema:
136
+ from datachain.lib.hf import convert_feature
137
+
138
+ feat = list(hf_schema[0].values())[i]
139
+ vals_dict[field] = convert_feature(val, feat, anno)
140
+ elif ModelStore.is_pydantic(anno):
141
+ vals_dict[field] = anno(**val) # type: ignore[misc]
142
+ else:
143
+ vals_dict[field] = val
144
+ return [self.output_schema(**vals_dict)]
109
145
 
110
146
 
111
147
  def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
@@ -190,18 +226,6 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
190
226
  raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
191
227
 
192
228
 
193
- def _nrows_file(file: File, nrows: int) -> str:
194
- tf = NamedTemporaryFile(delete=False) # noqa: SIM115
195
- with file.open(mode="r") as reader:
196
- with open(tf.name, "a") as writer:
197
- for row, line in enumerate(reader):
198
- if row >= nrows:
199
- break
200
- writer.write(line)
201
- writer.write("\n")
202
- return tf.name
203
-
204
-
205
229
  def _get_hf_schema(
206
230
  schema: "pa.Schema",
207
231
  ) -> Optional[tuple["Features", dict[str, "DataType"]]]:
@@ -11,7 +11,6 @@ from typing import (
11
11
  BinaryIO,
12
12
  Callable,
13
13
  ClassVar,
14
- Literal,
15
14
  Optional,
16
15
  TypeVar,
17
16
  Union,
@@ -24,8 +23,6 @@ from pydantic import BaseModel
24
23
  from sqlalchemy.sql.functions import GenericFunction
25
24
  from sqlalchemy.sql.sqltypes import NullType
26
25
 
27
- from datachain.client import Client
28
- from datachain.client.local import FileClient
29
26
  from datachain.dataset import DatasetRecord
30
27
  from datachain.func.base import Function
31
28
  from datachain.func.func import Func
@@ -33,13 +30,9 @@ from datachain.lib.convert.python_to_sql import python_to_sql
33
30
  from datachain.lib.convert.values_to_tuples import values_to_tuples
34
31
  from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
35
32
  from datachain.lib.dataset_info import DatasetInfo
36
- from datachain.lib.file import ArrowRow, File, get_file_type
33
+ from datachain.lib.file import ArrowRow, File, FileType, get_file_type
37
34
  from datachain.lib.file import ExportPlacement as FileExportPlacement
38
- from datachain.lib.listing import (
39
- list_bucket,
40
- ls,
41
- parse_listing_uri,
42
- )
35
+ from datachain.lib.listing import get_listing, list_bucket, ls
43
36
  from datachain.lib.listing_info import ListingInfo
44
37
  from datachain.lib.meta_formats import read_meta
45
38
  from datachain.lib.model_store import ModelStore
@@ -403,53 +396,12 @@ class DataChain:
403
396
  self.signals_schema |= signals_schema
404
397
  return self
405
398
 
406
- @classmethod
407
- def parse_uri(
408
- cls, uri: str, session: Session, update: bool = False
409
- ) -> tuple[str, str, str, bool]:
410
- """Returns correct listing dataset name that must be used for saving listing
411
- operation. It takes into account existing listings and reusability of those.
412
- It also returns boolean saying if returned dataset name is reused / already
413
- exists or not, and it returns correct listing path that should be used to find
414
- rows based on uri.
415
- """
416
- catalog = session.catalog
417
- cache = catalog.cache
418
- client_config = catalog.client_config
419
-
420
- client = Client.get_client(uri, cache, **client_config)
421
- ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
422
- listing = None
423
-
424
- listings = [
425
- ls
426
- for ls in catalog.listings()
427
- if not ls.is_expired and ls.contains(ds_name)
428
- ]
429
-
430
- if listings:
431
- if update:
432
- # choosing the smallest possible one to minimize update time
433
- listing = sorted(listings, key=lambda ls: len(ls.name))[0]
434
- else:
435
- # no need to update, choosing the most recent one
436
- listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
437
-
438
- if isinstance(client, FileClient) and listing and listing.name != ds_name:
439
- # For local file system we need to fix listing path / prefix
440
- # if we are reusing existing listing
441
- list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
442
-
443
- ds_name = listing.name if listing else ds_name
444
-
445
- return ds_name, list_uri, list_path, bool(listing)
446
-
447
399
  @classmethod
448
400
  def from_storage(
449
401
  cls,
450
402
  uri,
451
403
  *,
452
- type: Literal["binary", "text", "image"] = "binary",
404
+ type: FileType = "binary",
453
405
  session: Optional[Session] = None,
454
406
  settings: Optional[dict] = None,
455
407
  in_memory: bool = False,
@@ -482,7 +434,7 @@ class DataChain:
482
434
  cache = session.catalog.cache
483
435
  client_config = session.catalog.client_config
484
436
 
485
- list_ds_name, list_uri, list_path, list_ds_exists = cls.parse_uri(
437
+ list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
486
438
  uri, session, update=update
487
439
  )
488
440
 
@@ -548,7 +500,7 @@ class DataChain:
548
500
  def from_json(
549
501
  cls,
550
502
  path,
551
- type: Literal["binary", "text", "image"] = "text",
503
+ type: FileType = "text",
552
504
  spec: Optional[DataType] = None,
553
505
  schema_from: Optional[str] = "auto",
554
506
  jmespath: Optional[str] = None,
@@ -605,7 +557,9 @@ class DataChain:
605
557
  nrows=nrows,
606
558
  )
607
559
  }
608
- return chain.gen(**signal_dict) # type: ignore[misc, arg-type]
560
+ # disable prefetch if nrows is set
561
+ settings = {"prefetch": 0} if nrows else {}
562
+ return chain.settings(**settings).gen(**signal_dict) # type: ignore[misc, arg-type]
609
563
 
610
564
  def explode(
611
565
  self,
@@ -1942,7 +1896,10 @@ class DataChain:
1942
1896
 
1943
1897
  if source:
1944
1898
  output = {"source": ArrowRow} | output # type: ignore[assignment,operator]
1945
- return self.gen(
1899
+
1900
+ # disable prefetch if nrows is set
1901
+ settings = {"prefetch": 0} if nrows else {}
1902
+ return self.settings(**settings).gen( # type: ignore[arg-type]
1946
1903
  ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
1947
1904
  )
1948
1905
 
@@ -2024,8 +1981,6 @@ class DataChain:
2024
1981
  else:
2025
1982
  msg = f"error parsing csv - incompatible output type {type(output)}"
2026
1983
  raise DatasetPrepareError(chain.name, msg)
2027
- elif nrows:
2028
- nrows += 1
2029
1984
 
2030
1985
  parse_options = ParseOptions(delimiter=delimiter)
2031
1986
  read_options = ReadOptions(column_names=column_names)
@@ -39,6 +39,8 @@ logger = logging.getLogger("datachain")
39
39
  # how to create file path when exporting
40
40
  ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
41
41
 
42
+ FileType = Literal["binary", "text", "image"]
43
+
42
44
 
43
45
  class VFileError(DataChainError):
44
46
  def __init__(self, file: "File", message: str, vtype: str = ""):
@@ -470,7 +472,7 @@ class ArrowRow(DataModel):
470
472
  return record_batch.to_pylist()[0]
471
473
 
472
474
 
473
- def get_file_type(type_: Literal["binary", "text", "image"] = "binary") -> type[File]:
475
+ def get_file_type(type_: FileType = "binary") -> type[File]:
474
476
  file: type[File] = File
475
477
  if type_ == "text":
476
478
  file = TextFile
@@ -15,6 +15,7 @@ from datachain.utils import uses_glob
15
15
 
16
16
  if TYPE_CHECKING:
17
17
  from datachain.lib.dc import DataChain
18
+ from datachain.query.session import Session
18
19
 
19
20
  LISTING_TTL = 4 * 60 * 60 # cached listing lasts 4 hours
20
21
  LISTING_PREFIX = "lst__" # listing datasets start with this name
@@ -108,3 +109,46 @@ def listing_uri_from_name(dataset_name: str) -> str:
108
109
  if not is_listing_dataset(dataset_name):
109
110
  raise ValueError(f"Dataset {dataset_name} is not a listing")
110
111
  return dataset_name.removeprefix(LISTING_PREFIX)
112
+
113
+
114
+ def get_listing(
115
+ uri: str, session: "Session", update: bool = False
116
+ ) -> tuple[str, str, str, bool]:
117
+ """Returns correct listing dataset name that must be used for saving listing
118
+ operation. It takes into account existing listings and reusability of those.
119
+ It also returns boolean saying if returned dataset name is reused / already
120
+ exists or not (on update it always returns False - just because there was no
121
+ reason to complicate it so far). And it returns correct listing path that should
122
+ be used to find rows based on uri.
123
+ """
124
+ from datachain.client.local import FileClient
125
+
126
+ catalog = session.catalog
127
+ cache = catalog.cache
128
+ client_config = catalog.client_config
129
+
130
+ client = Client.get_client(uri, cache, **client_config)
131
+ ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
132
+ listing = None
133
+
134
+ listings = [
135
+ ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
136
+ ]
137
+
138
+ # if no need to update - choosing the most recent one;
139
+ # otherwise, we'll using the exact original `ds_name`` in this case:
140
+ # - if a "bigger" listing exists, we don't want to update it, it's better
141
+ # to create a new "smaller" one on "update=True"
142
+ # - if an exact listing exists it will have the same name as `ds_name`
143
+ # anyway below
144
+ if listings and not update:
145
+ listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
146
+
147
+ # for local file system we need to fix listing path / prefix
148
+ # if we are reusing existing listing
149
+ if isinstance(client, FileClient) and listing and listing.name != ds_name:
150
+ list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
151
+
152
+ ds_name = listing.name if listing else ds_name
153
+
154
+ return ds_name, list_uri, list_path, bool(listing)
@@ -85,7 +85,6 @@ class UDFAdapter:
85
85
  udf_fields: "Sequence[str]",
86
86
  udf_inputs: "Iterable[RowsOutput]",
87
87
  catalog: "Catalog",
88
- is_generator: bool,
89
88
  cache: bool,
90
89
  download_cb: Callback = DEFAULT_CALLBACK,
91
90
  processed_cb: Callback = DEFAULT_CALLBACK,