datachain 0.8.1__tar.gz → 0.8.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (292)
  1. {datachain-0.8.1/src/datachain.egg-info → datachain-0.8.3}/PKG-INFO +84 -2
  2. {datachain-0.8.1 → datachain-0.8.3}/README.rst +82 -0
  3. {datachain-0.8.1 → datachain-0.8.3}/docs/quick-start.md +6 -6
  4. {datachain-0.8.1 → datachain-0.8.3}/pyproject.toml +1 -1
  5. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/cache.py +4 -2
  6. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/catalog/catalog.py +100 -54
  7. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/catalog/datasource.py +4 -6
  8. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/azure.py +21 -1
  9. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/fsspec.py +35 -9
  10. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/gcs.py +16 -7
  11. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/local.py +4 -4
  12. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/s3.py +10 -0
  13. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/dataset.py +1 -0
  14. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/dc.py +15 -3
  15. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/listing.py +18 -3
  16. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/listing.py +1 -5
  17. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/node.py +27 -1
  18. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/session.py +1 -1
  19. {datachain-0.8.1 → datachain-0.8.3/src/datachain.egg-info}/PKG-INFO +84 -2
  20. {datachain-0.8.1 → datachain-0.8.3}/src/datachain.egg-info/SOURCES.txt +1 -0
  21. {datachain-0.8.1 → datachain-0.8.3}/src/datachain.egg-info/requires.txt +1 -1
  22. datachain-0.8.3/tests/func/fake-service-account-credentials.json +9 -0
  23. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_catalog.py +150 -12
  24. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_datachain.py +6 -2
  25. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_pull.py +1 -0
  26. datachain-0.8.3/tests/unit/test_client_gcs.py +14 -0
  27. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_client_s3.py +6 -0
  28. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_listing.py +1 -1
  29. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_session.py +5 -0
  30. datachain-0.8.1/tests/unit/test_client_gcs.py +0 -17
  31. {datachain-0.8.1 → datachain-0.8.3}/.cruft.json +0 -0
  32. {datachain-0.8.1 → datachain-0.8.3}/.gitattributes +0 -0
  33. {datachain-0.8.1 → datachain-0.8.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  34. {datachain-0.8.1 → datachain-0.8.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  35. {datachain-0.8.1 → datachain-0.8.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  36. {datachain-0.8.1 → datachain-0.8.3}/.github/codecov.yaml +0 -0
  37. {datachain-0.8.1 → datachain-0.8.3}/.github/dependabot.yml +0 -0
  38. {datachain-0.8.1 → datachain-0.8.3}/.github/workflows/benchmarks.yml +0 -0
  39. {datachain-0.8.1 → datachain-0.8.3}/.github/workflows/release.yml +0 -0
  40. {datachain-0.8.1 → datachain-0.8.3}/.github/workflows/tests-studio.yml +0 -0
  41. {datachain-0.8.1 → datachain-0.8.3}/.github/workflows/tests.yml +0 -0
  42. {datachain-0.8.1 → datachain-0.8.3}/.github/workflows/update-template.yaml +0 -0
  43. {datachain-0.8.1 → datachain-0.8.3}/.gitignore +0 -0
  44. {datachain-0.8.1 → datachain-0.8.3}/.pre-commit-config.yaml +0 -0
  45. {datachain-0.8.1 → datachain-0.8.3}/CODE_OF_CONDUCT.rst +0 -0
  46. {datachain-0.8.1 → datachain-0.8.3}/LICENSE +0 -0
  47. {datachain-0.8.1 → datachain-0.8.3}/docs/assets/captioned_cartoons.png +0 -0
  48. {datachain-0.8.1 → datachain-0.8.3}/docs/assets/datachain-white.svg +0 -0
  49. {datachain-0.8.1 → datachain-0.8.3}/docs/assets/datachain.svg +0 -0
  50. {datachain-0.8.1 → datachain-0.8.3}/docs/contributing.md +0 -0
  51. {datachain-0.8.1 → datachain-0.8.3}/docs/css/github-permalink-style.css +0 -0
  52. {datachain-0.8.1 → datachain-0.8.3}/docs/examples.md +0 -0
  53. {datachain-0.8.1 → datachain-0.8.3}/docs/index.md +0 -0
  54. {datachain-0.8.1 → datachain-0.8.3}/docs/overrides/main.html +0 -0
  55. {datachain-0.8.1 → datachain-0.8.3}/docs/references/datachain.md +0 -0
  56. {datachain-0.8.1 → datachain-0.8.3}/docs/references/datatype.md +0 -0
  57. {datachain-0.8.1 → datachain-0.8.3}/docs/references/file.md +0 -0
  58. {datachain-0.8.1 → datachain-0.8.3}/docs/references/index.md +0 -0
  59. {datachain-0.8.1 → datachain-0.8.3}/docs/references/sql.md +0 -0
  60. {datachain-0.8.1 → datachain-0.8.3}/docs/references/torch.md +0 -0
  61. {datachain-0.8.1 → datachain-0.8.3}/docs/references/udf.md +0 -0
  62. {datachain-0.8.1 → datachain-0.8.3}/docs/tutorials.md +0 -0
  63. {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  64. {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  65. {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/openimage-detect.py +0 -0
  66. {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/ultralytics-bbox.py +0 -0
  67. {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/ultralytics-pose.py +0 -0
  68. {datachain-0.8.1 → datachain-0.8.3}/examples/computer_vision/ultralytics-segment.py +0 -0
  69. {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/common_sql_functions.py +0 -0
  70. {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/json-csv-reader.py +0 -0
  71. {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/torch-loader.py +0 -0
  72. {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/udfs/parallel.py +0 -0
  73. {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/udfs/simple.py +0 -0
  74. {datachain-0.8.1 → datachain-0.8.3}/examples/get_started/udfs/stateful.py +0 -0
  75. {datachain-0.8.1 → datachain-0.8.3}/examples/llm_and_nlp/claude-query.py +0 -0
  76. {datachain-0.8.1 → datachain-0.8.3}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  77. {datachain-0.8.1 → datachain-0.8.3}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  78. {datachain-0.8.1 → datachain-0.8.3}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  79. {datachain-0.8.1 → datachain-0.8.3}/examples/multimodal/clip_inference.py +0 -0
  80. {datachain-0.8.1 → datachain-0.8.3}/examples/multimodal/hf_pipeline.py +0 -0
  81. {datachain-0.8.1 → datachain-0.8.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
  82. {datachain-0.8.1 → datachain-0.8.3}/examples/multimodal/wds.py +0 -0
  83. {datachain-0.8.1 → datachain-0.8.3}/examples/multimodal/wds_filtered.py +0 -0
  84. {datachain-0.8.1 → datachain-0.8.3}/mkdocs.yml +0 -0
  85. {datachain-0.8.1 → datachain-0.8.3}/noxfile.py +0 -0
  86. {datachain-0.8.1 → datachain-0.8.3}/setup.cfg +0 -0
  87. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/__init__.py +0 -0
  88. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/__main__.py +0 -0
  89. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/asyn.py +0 -0
  90. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/catalog/__init__.py +0 -0
  91. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/catalog/loader.py +0 -0
  92. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/cli.py +0 -0
  93. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/cli_utils.py +0 -0
  94. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/__init__.py +0 -0
  95. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/fileslice.py +0 -0
  96. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/client/hf.py +0 -0
  97. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/config.py +0 -0
  98. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/__init__.py +0 -0
  99. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/db_engine.py +0 -0
  100. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/job.py +0 -0
  101. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/metastore.py +0 -0
  102. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/schema.py +0 -0
  103. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/serializer.py +0 -0
  104. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/sqlite.py +0 -0
  105. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/data_storage/warehouse.py +0 -0
  106. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/error.py +0 -0
  107. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/__init__.py +0 -0
  108. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/aggregate.py +0 -0
  109. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/array.py +0 -0
  110. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/base.py +0 -0
  111. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/conditional.py +0 -0
  112. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/func.py +0 -0
  113. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/numeric.py +0 -0
  114. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/path.py +0 -0
  115. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/random.py +0 -0
  116. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/string.py +0 -0
  117. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/func/window.py +0 -0
  118. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/job.py +0 -0
  119. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/__init__.py +0 -0
  120. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/arrow.py +0 -0
  121. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/clip.py +0 -0
  122. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/__init__.py +0 -0
  123. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/flatten.py +0 -0
  124. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
  125. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
  126. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/unflatten.py +0 -0
  127. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  128. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/data_model.py +0 -0
  129. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/dataset_info.py +0 -0
  130. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/diff.py +0 -0
  131. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/file.py +0 -0
  132. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/hf.py +0 -0
  133. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/image.py +0 -0
  134. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/listing_info.py +0 -0
  135. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/meta_formats.py +0 -0
  136. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/model_store.py +0 -0
  137. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/pytorch.py +0 -0
  138. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/settings.py +0 -0
  139. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/signal_schema.py +0 -0
  140. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/tar.py +0 -0
  141. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/text.py +0 -0
  142. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/udf.py +0 -0
  143. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/udf_signature.py +0 -0
  144. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/utils.py +0 -0
  145. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/vfile.py +0 -0
  146. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/webdataset.py +0 -0
  147. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/lib/webdataset_laion.py +0 -0
  148. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/__init__.py +0 -0
  149. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/bbox.py +0 -0
  150. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/pose.py +0 -0
  151. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/segment.py +0 -0
  152. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/ultralytics/__init__.py +0 -0
  153. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/ultralytics/bbox.py +0 -0
  154. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/ultralytics/pose.py +0 -0
  155. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/model/ultralytics/segment.py +0 -0
  156. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/nodes_fetcher.py +0 -0
  157. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/nodes_thread_pool.py +0 -0
  158. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/progress.py +0 -0
  159. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/py.typed +0 -0
  160. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/__init__.py +0 -0
  161. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/batch.py +0 -0
  162. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/dataset.py +0 -0
  163. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/dispatch.py +0 -0
  164. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/metrics.py +0 -0
  165. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/params.py +0 -0
  166. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/queue.py +0 -0
  167. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/schema.py +0 -0
  168. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/udf.py +0 -0
  169. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/query/utils.py +0 -0
  170. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/remote/__init__.py +0 -0
  171. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/remote/studio.py +0 -0
  172. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/__init__.py +0 -0
  173. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/default/__init__.py +0 -0
  174. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/default/base.py +0 -0
  175. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/__init__.py +0 -0
  176. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/aggregate.py +0 -0
  177. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/array.py +0 -0
  178. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/conditional.py +0 -0
  179. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/numeric.py +0 -0
  180. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/path.py +0 -0
  181. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/random.py +0 -0
  182. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/functions/string.py +0 -0
  183. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/selectable.py +0 -0
  184. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/sqlite/__init__.py +0 -0
  185. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/sqlite/base.py +0 -0
  186. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/sqlite/types.py +0 -0
  187. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/sqlite/vector.py +0 -0
  188. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/types.py +0 -0
  189. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/sql/utils.py +0 -0
  190. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/studio.py +0 -0
  191. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/telemetry.py +0 -0
  192. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/toolkit/__init__.py +0 -0
  193. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/toolkit/split.py +0 -0
  194. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/torch/__init__.py +0 -0
  195. {datachain-0.8.1 → datachain-0.8.3}/src/datachain/utils.py +0 -0
  196. {datachain-0.8.1 → datachain-0.8.3}/src/datachain.egg-info/dependency_links.txt +0 -0
  197. {datachain-0.8.1 → datachain-0.8.3}/src/datachain.egg-info/entry_points.txt +0 -0
  198. {datachain-0.8.1 → datachain-0.8.3}/src/datachain.egg-info/top_level.txt +0 -0
  199. {datachain-0.8.1 → datachain-0.8.3}/tests/__init__.py +0 -0
  200. {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/__init__.py +0 -0
  201. {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/conftest.py +0 -0
  202. {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  203. {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/datasets/.dvc/config +0 -0
  204. {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/datasets/.gitignore +0 -0
  205. {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  206. {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/test_datachain.py +0 -0
  207. {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/test_ls.py +0 -0
  208. {datachain-0.8.1 → datachain-0.8.3}/tests/benchmarks/test_version.py +0 -0
  209. {datachain-0.8.1 → datachain-0.8.3}/tests/conftest.py +0 -0
  210. {datachain-0.8.1 → datachain-0.8.3}/tests/data.py +0 -0
  211. {datachain-0.8.1 → datachain-0.8.3}/tests/examples/__init__.py +0 -0
  212. {datachain-0.8.1 → datachain-0.8.3}/tests/examples/test_examples.py +0 -0
  213. {datachain-0.8.1 → datachain-0.8.3}/tests/examples/test_wds_e2e.py +0 -0
  214. {datachain-0.8.1 → datachain-0.8.3}/tests/examples/wds_data.py +0 -0
  215. {datachain-0.8.1 → datachain-0.8.3}/tests/func/__init__.py +0 -0
  216. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_client.py +0 -0
  217. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_dataset_query.py +0 -0
  218. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_datasets.py +0 -0
  219. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_feature_pickling.py +0 -0
  220. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_listing.py +0 -0
  221. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_ls.py +0 -0
  222. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_meta_formats.py +0 -0
  223. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_metrics.py +0 -0
  224. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_pytorch.py +0 -0
  225. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_query.py +0 -0
  226. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_session.py +0 -0
  227. {datachain-0.8.1 → datachain-0.8.3}/tests/func/test_toolkit.py +0 -0
  228. {datachain-0.8.1 → datachain-0.8.3}/tests/scripts/feature_class.py +0 -0
  229. {datachain-0.8.1 → datachain-0.8.3}/tests/scripts/feature_class_exception.py +0 -0
  230. {datachain-0.8.1 → datachain-0.8.3}/tests/scripts/feature_class_parallel.py +0 -0
  231. {datachain-0.8.1 → datachain-0.8.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  232. {datachain-0.8.1 → datachain-0.8.3}/tests/scripts/name_len_slow.py +0 -0
  233. {datachain-0.8.1 → datachain-0.8.3}/tests/test_atomicity.py +0 -0
  234. {datachain-0.8.1 → datachain-0.8.3}/tests/test_cli_e2e.py +0 -0
  235. {datachain-0.8.1 → datachain-0.8.3}/tests/test_cli_studio.py +0 -0
  236. {datachain-0.8.1 → datachain-0.8.3}/tests/test_query_e2e.py +0 -0
  237. {datachain-0.8.1 → datachain-0.8.3}/tests/test_telemetry.py +0 -0
  238. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/__init__.py +0 -0
  239. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/__init__.py +0 -0
  240. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/conftest.py +0 -0
  241. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_arrow.py +0 -0
  242. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_clip.py +0 -0
  243. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_datachain.py +0 -0
  244. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  245. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_datachain_merge.py +0 -0
  246. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_diff.py +0 -0
  247. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_feature.py +0 -0
  248. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_feature_utils.py +0 -0
  249. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_file.py +0 -0
  250. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_hf.py +0 -0
  251. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_image.py +0 -0
  252. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_listing_info.py +0 -0
  253. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_models.py +0 -0
  254. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_schema.py +0 -0
  255. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_signal_schema.py +0 -0
  256. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_sql_to_python.py +0 -0
  257. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_text.py +0 -0
  258. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_udf_signature.py +0 -0
  259. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_utils.py +0 -0
  260. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/lib/test_webdataset.py +0 -0
  261. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/__init__.py +0 -0
  262. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/sqlite/__init__.py +0 -0
  263. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/sqlite/test_types.py +0 -0
  264. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
  265. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_array.py +0 -0
  266. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_conditional.py +0 -0
  267. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_path.py +0 -0
  268. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_random.py +0 -0
  269. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_selectable.py +0 -0
  270. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/sql/test_string.py +0 -0
  271. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_asyn.py +0 -0
  272. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_cache.py +0 -0
  273. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_catalog.py +0 -0
  274. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_catalog_loader.py +0 -0
  275. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_cli_parsing.py +0 -0
  276. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_client.py +0 -0
  277. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_config.py +0 -0
  278. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_data_storage.py +0 -0
  279. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_database_engine.py +0 -0
  280. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_dataset.py +0 -0
  281. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_dispatch.py +0 -0
  282. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_fileslice.py +0 -0
  283. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_func.py +0 -0
  284. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_metastore.py +0 -0
  285. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_module_exports.py +0 -0
  286. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_query.py +0 -0
  287. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_query_metrics.py +0 -0
  288. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_query_params.py +0 -0
  289. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_serializer.py +0 -0
  290. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_utils.py +0 -0
  291. {datachain-0.8.1 → datachain-0.8.3}/tests/unit/test_warehouse.py +0 -0
  292. {datachain-0.8.1 → datachain-0.8.3}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.8.1
3
+ Version: 0.8.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -72,7 +72,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
72
72
  Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
73
73
  Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
74
74
  Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
75
- Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
75
+ Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
76
76
  Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
77
77
  Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
78
78
  Requires-Dist: virtualenv; extra == "tests"
@@ -145,6 +145,88 @@ Getting Started
145
145
  Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
146
146
  to get started with `DataChain` and learn more.
147
147
 
148
+ .. code:: bash
149
+
150
+ pip install datachain
151
+
152
+
153
+ Example: download subset of files based on metadata
154
+ ---------------------------------------------------
155
+
156
+ Sometimes users only need to download a specific subset of files from cloud storage,
157
+ rather than the entire dataset.
158
+ For example, you could use a JSON file's metadata to download just cat images with
159
+ high confidence scores.
160
+
161
+
162
+ .. code:: py
163
+
164
+ from datachain import Column, DataChain
165
+
166
+ meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
167
+ images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
168
+
169
+ images_id = images.map(id=lambda file: file.path.split('.')[-2])
170
+ annotated = images_id.merge(meta, on="id", right_on="meta.id")
171
+
172
+ likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
173
+ & (Column("meta.inference.class_") == "cat"))
174
+ likely_cats.export_files("high-confidence-cats/", signal="file")
175
+
176
+
177
+ Example: LLM-based text-file evaluation
178
+ ---------------------------------------
179
+
180
+ In this example, we evaluate chatbot conversations stored in text files
181
+ using LLM-based evaluation.
182
+
183
+ .. code:: shell
184
+
185
+ $ pip install mistralai # Requires version >=1.0.0
186
+ $ export MISTRAL_API_KEY=_your_key_
187
+
188
+ Python code:
189
+
190
+ .. code:: py
191
+
192
+ from mistralai import Mistral
193
+ from datachain import File, DataChain, Column
194
+
195
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
196
+
197
+ def eval_dialogue(file: File) -> bool:
198
+ client = Mistral()
199
+ response = client.chat.complete(
200
+ model="open-mixtral-8x22b",
201
+ messages=[{"role": "system", "content": PROMPT},
202
+ {"role": "user", "content": file.read()}])
203
+ result = response.choices[0].message.content
204
+ return result.lower().startswith("success")
205
+
206
+ chain = (
207
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
208
+ .settings(parallel=4, cache=True)
209
+ .map(is_success=eval_dialogue)
210
+ .save("mistral_files")
211
+ )
212
+
213
+ successful_chain = chain.filter(Column("is_success") == True)
214
+ successful_chain.export_files("./output_mistral")
215
+
216
+ print(f"{successful_chain.count()} files were exported")
217
+
218
+
219
+
220
+ With the instruction above, the Mistral model considers 31 of the 50 files to contain successful dialogues:
221
+
222
+ .. code:: shell
223
+
224
+ $ ls output_mistral/datachain-demo/chatbot-KiT/
225
+ 1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
226
+ $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
227
+ 31
228
+
229
+
148
230
  Key Features
149
231
  ============
150
232
 
@@ -42,6 +42,88 @@ Getting Started
42
42
  Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
43
43
  to get started with `DataChain` and learn more.
44
44
 
45
+ .. code:: bash
46
+
47
+ pip install datachain
48
+
49
+
50
+ Example: download subset of files based on metadata
51
+ ---------------------------------------------------
52
+
53
+ Sometimes users only need to download a specific subset of files from cloud storage,
54
+ rather than the entire dataset.
55
+ For example, you could use a JSON file's metadata to download just cat images with
56
+ high confidence scores.
57
+
58
+
59
+ .. code:: py
60
+
61
+ from datachain import Column, DataChain
62
+
63
+ meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
64
+ images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
65
+
66
+ images_id = images.map(id=lambda file: file.path.split('.')[-2])
67
+ annotated = images_id.merge(meta, on="id", right_on="meta.id")
68
+
69
+ likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
70
+ & (Column("meta.inference.class_") == "cat"))
71
+ likely_cats.export_files("high-confidence-cats/", signal="file")
72
+
73
+
74
+ Example: LLM-based text-file evaluation
75
+ ---------------------------------------
76
+
77
+ In this example, we evaluate chatbot conversations stored in text files
78
+ using LLM-based evaluation.
79
+
80
+ .. code:: shell
81
+
82
+ $ pip install mistralai # Requires version >=1.0.0
83
+ $ export MISTRAL_API_KEY=_your_key_
84
+
85
+ Python code:
86
+
87
+ .. code:: py
88
+
89
+ from mistralai import Mistral
90
+ from datachain import File, DataChain, Column
91
+
92
+ PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
93
+
94
+ def eval_dialogue(file: File) -> bool:
95
+ client = Mistral()
96
+ response = client.chat.complete(
97
+ model="open-mixtral-8x22b",
98
+ messages=[{"role": "system", "content": PROMPT},
99
+ {"role": "user", "content": file.read()}])
100
+ result = response.choices[0].message.content
101
+ return result.lower().startswith("success")
102
+
103
+ chain = (
104
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
105
+ .settings(parallel=4, cache=True)
106
+ .map(is_success=eval_dialogue)
107
+ .save("mistral_files")
108
+ )
109
+
110
+ successful_chain = chain.filter(Column("is_success") == True)
111
+ successful_chain.export_files("./output_mistral")
112
+
113
+ print(f"{successful_chain.count()} files were exported")
114
+
115
+
116
+
117
+ With the instruction above, the Mistral model considers 31 of the 50 files to contain successful dialogues:
118
+
119
+ .. code:: shell
120
+
121
+ $ ls output_mistral/datachain-demo/chatbot-KiT/
122
+ 1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
123
+ $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
124
+ 31
125
+
126
+
45
127
  Key Features
46
128
  ============
47
129
 
@@ -39,8 +39,8 @@ using JSON metadata:
39
39
  ``` py
40
40
  from datachain import Column, DataChain
41
41
 
42
- meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
43
- images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")
42
+ meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
43
+ images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
44
44
 
45
45
  images_id = images.map(id=lambda file: file.path.split('.')[-2])
46
46
  annotated = images_id.merge(meta, on="id", right_on="meta.id")
@@ -78,7 +78,7 @@ def is_positive_dialogue_ending(file) -> bool:
78
78
 
79
79
  chain = (
80
80
  DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
81
- object_name="file", type="text")
81
+ object_name="file", type="text", anon=True)
82
82
  .settings(parallel=8, cache=True)
83
83
  .map(is_positive=is_positive_dialogue_ending)
84
84
  .save("file_response")
@@ -132,7 +132,7 @@ def eval_dialogue(file: File) -> bool:
132
132
  return result.lower().startswith("success")
133
133
 
134
134
  chain = (
135
- DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
135
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
136
136
  .map(is_success=eval_dialogue)
137
137
  .save("mistral_files")
138
138
  )
@@ -177,7 +177,7 @@ def eval_dialog(file: File) -> ChatCompletionResponse:
177
177
  {"role": "user", "content": file.read()}])
178
178
 
179
179
  chain = (
180
- DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
180
+ DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
181
181
  .settings(parallel=4, cache=True)
182
182
  .map(response=eval_dialog)
183
183
  .map(status=lambda response: response.choices[0].message.content.lower()[:7])
@@ -273,7 +273,7 @@ from datachain import C, DataChain
273
273
  processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
274
274
 
275
275
  chain = (
276
- DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
276
+ DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
277
277
  .map(label=lambda name: name.split(".")[0], params=["file.name"])
278
278
  .select("file", "label").to_pytorch(
279
279
  transform=processor.image_processor,
@@ -83,7 +83,7 @@ tests = [
83
83
  "pytest-sugar>=0.9.6",
84
84
  "pytest-cov>=4.1.0",
85
85
  "pytest-mock>=3.12.0",
86
- "pytest-servers[all]>=0.5.8",
86
+ "pytest-servers[all]>=0.5.9",
87
87
  "pytest-benchmark[histogram]",
88
88
  "pytest-xdist>=3.3.1",
89
89
  "virtualenv",
@@ -61,14 +61,16 @@ class DataChainCache:
61
61
  tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
62
62
  size = file.size
63
63
  if size < 0:
64
- size = await client.get_size(from_path)
64
+ size = await client.get_size(from_path, version_id=file.version)
65
65
  cb = callback or TqdmCallback(
66
66
  tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True},
67
67
  tqdm_cls=Tqdm,
68
68
  size=size,
69
69
  )
70
70
  try:
71
- await client.get_file(from_path, tmp_info, callback=cb)
71
+ await client.get_file(
72
+ from_path, tmp_info, callback=cb, version_id=file.version
73
+ )
72
74
  finally:
73
75
  if not callback:
74
76
  cb.close()
@@ -240,7 +240,8 @@ class DatasetRowsFetcher(NodesThreadPool):
240
240
  class NodeGroup:
241
241
  """Class for a group of nodes from the same source"""
242
242
 
243
- listing: "Listing"
243
+ listing: Optional["Listing"]
244
+ client: "Client"
244
245
  sources: list[DataSource]
245
246
 
246
247
  # The source path within the bucket
@@ -268,9 +269,7 @@ class NodeGroup:
268
269
  Download this node group to cache.
269
270
  """
270
271
  if self.sources:
271
- self.listing.client.fetch_nodes(
272
- self.iternodes(recursive), shared_progress_bar=pbar
273
- )
272
+ self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)
274
273
 
275
274
 
276
275
  def check_output_dataset_file(
@@ -375,7 +374,7 @@ def collect_nodes_for_cp(
375
374
 
376
375
  # Collect all sources to process
377
376
  for node_group in node_groups:
378
- listing: Listing = node_group.listing
377
+ listing: Optional[Listing] = node_group.listing
379
378
  valid_sources: list[DataSource] = []
380
379
  for dsrc in node_group.sources:
381
380
  if dsrc.is_single_object():
@@ -383,6 +382,7 @@ def collect_nodes_for_cp(
383
382
  total_files += 1
384
383
  valid_sources.append(dsrc)
385
384
  else:
385
+ assert listing
386
386
  node = dsrc.node
387
387
  if not recursive:
388
388
  print(f"{node.full_path} is a directory (not copied).")
@@ -433,37 +433,51 @@ def instantiate_node_groups(
433
433
  )
434
434
 
435
435
  output_dir = output
436
+ output_file = None
436
437
  if copy_to_filename:
437
438
  output_dir = os.path.dirname(output)
438
439
  if not output_dir:
439
440
  output_dir = "."
441
+ output_file = os.path.basename(output)
440
442
 
441
443
  # Instantiate these nodes
442
444
  for node_group in node_groups:
443
445
  if not node_group.sources:
444
446
  continue
445
- listing: Listing = node_group.listing
447
+ listing: Optional[Listing] = node_group.listing
446
448
  source_path: str = node_group.source_path
447
449
 
448
450
  copy_dir_contents = always_copy_dir_contents or source_path.endswith("/")
449
- instantiated_nodes = listing.collect_nodes_to_instantiate(
450
- node_group.sources,
451
- copy_to_filename,
452
- recursive,
453
- copy_dir_contents,
454
- source_path,
455
- node_group.is_edatachain,
456
- node_group.is_dataset,
457
- )
458
- if not virtual_only:
459
- listing.instantiate_nodes(
460
- instantiated_nodes,
461
- output_dir,
462
- total_files,
463
- force=force,
464
- shared_progress_bar=instantiate_progress_bar,
451
+ if not listing:
452
+ source = node_group.sources[0]
453
+ client = source.client
454
+ node = NodeWithPath(source.node, [output_file or source.node.path])
455
+ instantiated_nodes = [node]
456
+ if not virtual_only:
457
+ node.instantiate(
458
+ client, output_dir, instantiate_progress_bar, force=force
459
+ )
460
+ else:
461
+ instantiated_nodes = listing.collect_nodes_to_instantiate(
462
+ node_group.sources,
463
+ copy_to_filename,
464
+ recursive,
465
+ copy_dir_contents,
466
+ source_path,
467
+ node_group.is_edatachain,
468
+ node_group.is_dataset,
465
469
  )
470
+ if not virtual_only:
471
+ listing.instantiate_nodes(
472
+ instantiated_nodes,
473
+ output_dir,
474
+ total_files,
475
+ force=force,
476
+ shared_progress_bar=instantiate_progress_bar,
477
+ )
478
+
466
479
  node_group.instantiated_nodes = instantiated_nodes
480
+
467
481
  if instantiate_progress_bar:
468
482
  instantiate_progress_bar.close()
469
483
 
@@ -592,7 +606,7 @@ class Catalog:
592
606
  client_config=None,
593
607
  object_name="file",
594
608
  skip_indexing=False,
595
- ) -> tuple["Listing", str]:
609
+ ) -> tuple[Optional["Listing"], "Client", str]:
596
610
  from datachain.lib.dc import DataChain
597
611
  from datachain.listing import Listing
598
612
 
@@ -603,16 +617,19 @@ class Catalog:
603
617
  list_ds_name, list_uri, list_path, _ = get_listing(
604
618
  source, self.session, update=update
605
619
  )
620
+ lst = None
621
+ client = Client.get_client(list_uri, self.cache, **self.client_config)
622
+
623
+ if list_ds_name:
624
+ lst = Listing(
625
+ self.metastore.clone(),
626
+ self.warehouse.clone(),
627
+ client,
628
+ dataset_name=list_ds_name,
629
+ object_name=object_name,
630
+ )
606
631
 
607
- lst = Listing(
608
- self.metastore.clone(),
609
- self.warehouse.clone(),
610
- Client.get_client(list_uri, self.cache, **self.client_config),
611
- dataset_name=list_ds_name,
612
- object_name=object_name,
613
- )
614
-
615
- return lst, list_path
632
+ return lst, client, list_path
616
633
 
617
634
  def _remove_dataset_rows_and_warehouse_info(
618
635
  self, dataset: DatasetRecord, version: int, **kwargs
@@ -635,13 +652,13 @@ class Catalog:
635
652
  ) -> Optional[list["DataSource"]]:
636
653
  enlisted_sources = []
637
654
  for src in sources: # Opt: parallel
638
- listing, file_path = self.enlist_source(
655
+ listing, client, file_path = self.enlist_source(
639
656
  src,
640
657
  update,
641
658
  client_config=client_config or self.client_config,
642
659
  skip_indexing=skip_indexing,
643
660
  )
644
- enlisted_sources.append((listing, file_path))
661
+ enlisted_sources.append((listing, client, file_path))
645
662
 
646
663
  if only_index:
647
664
  # sometimes we don't really need the listing result (e.g. during the indexing process)
@@ -649,10 +666,16 @@ class Catalog:
649
666
  return None
650
667
 
651
668
  dsrc_all: list[DataSource] = []
652
- for listing, file_path in enlisted_sources:
653
- nodes = listing.expand_path(file_path)
654
- dir_only = file_path.endswith("/")
655
- dsrc_all.extend(DataSource(listing, node, dir_only) for node in nodes)
669
+ for listing, client, file_path in enlisted_sources:
670
+ if not listing:
671
+ nodes = [Node.from_file(client.get_file_info(file_path))]
672
+ dir_only = False
673
+ else:
674
+ nodes = listing.expand_path(file_path)
675
+ dir_only = file_path.endswith("/")
676
+ dsrc_all.extend(
677
+ DataSource(listing, client, node, dir_only) for node in nodes
678
+ )
656
679
  return dsrc_all
657
680
 
658
681
  def enlist_sources_grouped(
@@ -667,7 +690,7 @@ class Catalog:
667
690
 
668
691
  def _row_to_node(d: dict[str, Any]) -> Node:
669
692
  del d["file__source"]
670
- return Node.from_dict(d)
693
+ return Node.from_row(d)
671
694
 
672
695
  enlisted_sources: list[tuple[bool, bool, Any]] = []
673
696
  client_config = client_config or self.client_config
@@ -677,7 +700,7 @@ class Catalog:
677
700
  edatachain_data = parse_edatachain_file(src)
678
701
  indexed_sources = []
679
702
  for ds in edatachain_data:
680
- listing, source_path = self.enlist_source(
703
+ listing, _, source_path = self.enlist_source(
681
704
  ds["data-source"]["uri"],
682
705
  update,
683
706
  client_config=client_config,
@@ -701,6 +724,7 @@ class Catalog:
701
724
  client = self.get_client(source, **client_config)
702
725
  uri = client.uri
703
726
  dataset_name, _, _, _ = get_listing(uri, self.session)
727
+ assert dataset_name
704
728
  listing = Listing(
705
729
  self.metastore.clone(),
706
730
  self.warehouse.clone(),
@@ -713,6 +737,7 @@ class Catalog:
713
737
  indexed_sources.append(
714
738
  (
715
739
  listing,
740
+ client,
716
741
  source,
717
742
  [_row_to_node(r) for r in rows],
718
743
  ds_name,
@@ -722,25 +747,28 @@ class Catalog:
722
747
 
723
748
  enlisted_sources.append((False, True, indexed_sources))
724
749
  else:
725
- listing, source_path = self.enlist_source(
750
+ listing, client, source_path = self.enlist_source(
726
751
  src, update, client_config=client_config
727
752
  )
728
- enlisted_sources.append((False, False, (listing, source_path)))
753
+ enlisted_sources.append((False, False, (listing, client, source_path)))
729
754
 
730
755
  node_groups = []
731
756
  for is_datachain, is_dataset, payload in enlisted_sources: # Opt: parallel
732
757
  if is_dataset:
733
758
  for (
734
759
  listing,
760
+ client,
735
761
  source_path,
736
762
  nodes,
737
763
  dataset_name,
738
764
  dataset_version,
739
765
  ) in payload:
740
- dsrc = [DataSource(listing, node) for node in nodes]
766
+ assert listing
767
+ dsrc = [DataSource(listing, client, node) for node in nodes]
741
768
  node_groups.append(
742
769
  NodeGroup(
743
770
  listing,
771
+ client,
744
772
  dsrc,
745
773
  source_path,
746
774
  dataset_name=dataset_name,
@@ -749,18 +777,30 @@ class Catalog:
749
777
  )
750
778
  elif is_datachain:
751
779
  for listing, source_path, paths in payload:
752
- dsrc = [DataSource(listing, listing.resolve_path(p)) for p in paths]
780
+ assert listing
781
+ dsrc = [
782
+ DataSource(listing, listing.client, listing.resolve_path(p))
783
+ for p in paths
784
+ ]
753
785
  node_groups.append(
754
- NodeGroup(listing, dsrc, source_path, is_edatachain=True)
786
+ NodeGroup(
787
+ listing,
788
+ listing.client,
789
+ dsrc,
790
+ source_path,
791
+ is_edatachain=True,
792
+ )
755
793
  )
756
794
  else:
757
- listing, source_path = payload
758
- as_container = source_path.endswith("/")
759
- dsrc = [
760
- DataSource(listing, n, as_container)
761
- for n in listing.expand_path(source_path, use_glob=not no_glob)
762
- ]
763
- node_groups.append(NodeGroup(listing, dsrc, source_path))
795
+ listing, client, source_path = payload
796
+ if not listing:
797
+ nodes = [Node.from_file(client.get_file_info(source_path))]
798
+ as_container = False
799
+ else:
800
+ as_container = source_path.endswith("/")
801
+ nodes = listing.expand_path(source_path, use_glob=not no_glob)
802
+ dsrc = [DataSource(listing, client, n, as_container) for n in nodes]
803
+ node_groups.append(NodeGroup(listing, client, dsrc, source_path))
764
804
 
765
805
  return node_groups
766
806
 
@@ -1196,10 +1236,16 @@ class Catalog:
1196
1236
 
1197
1237
  return q.to_db_records()
1198
1238
 
1199
- def signed_url(self, source: str, path: str, client_config=None) -> str:
1239
+ def signed_url(
1240
+ self,
1241
+ source: str,
1242
+ path: str,
1243
+ version_id: Optional[str] = None,
1244
+ client_config=None,
1245
+ ) -> str:
1200
1246
  client_config = client_config or self.client_config
1201
1247
  client = Client.get_client(source, self.cache, **client_config)
1202
- return client.url(path)
1248
+ return client.url(path, version_id=version_id)
1203
1249
 
1204
1250
  def export_dataset_table(
1205
1251
  self,
@@ -4,21 +4,19 @@ from datachain.node import DirType, NodeWithPath
4
4
 
5
5
 
6
6
  class DataSource:
7
- def __init__(self, listing, node, as_container=False):
7
+ def __init__(self, listing, client, node, as_container=False):
8
8
  self.listing = listing
9
+ self.client = client
9
10
  self.node = node
10
11
  self.as_container = (
11
12
  as_container # Indicates whether a .tar file is handled as a container
12
13
  )
13
14
 
14
- def get_full_path(self):
15
- return self.get_node_full_path(self.node)
16
-
17
15
  def get_node_full_path(self, node):
18
- return self.listing.client.get_full_path(node.full_path)
16
+ return self.client.get_full_path(node.full_path)
19
17
 
20
18
  def get_node_full_path_from_path(self, full_path):
21
- return self.listing.client.get_full_path(full_path)
19
+ return self.client.get_full_path(full_path)
22
20
 
23
21
  def is_single_object(self):
24
22
  return self.node.dir_type == DirType.FILE or (
@@ -1,4 +1,5 @@
1
- from typing import Any
1
+ from typing import Any, Optional
2
+ from urllib.parse import parse_qs, urlsplit, urlunsplit
2
3
 
3
4
  from adlfs import AzureBlobFileSystem
4
5
  from tqdm import tqdm
@@ -25,6 +26,16 @@ class AzureClient(Client):
25
26
  size=v.get("size", ""),
26
27
  )
27
28
 
29
def url(self, path: str, expires: int = 3600, **kwargs) -> str:
    """
    Generate a signed URL for the given path.

    ``expires`` is forwarded to ``self.fs.sign`` as its ``expiration``
    argument. An optional ``version_id`` keyword is removed from ``kwargs``
    before the remaining keywords are forwarded; it is passed on to
    ``get_full_path`` and, when truthy, appended to the signed URL as a
    ``versionid`` query parameter.
    """
    version_id = kwargs.pop("version_id", None)
    full_path = self.get_full_path(path, version_id)
    signed = self.fs.sign(full_path, expiration=expires, **kwargs)
    # The suffix is joined with "&", so the signed URL is expected to carry
    # a query string already.
    version_suffix = f"&versionid={version_id}" if version_id else ""
    return signed + version_suffix
38
+
28
39
  async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
29
40
  prefix = start_prefix
30
41
  if prefix:
@@ -57,4 +68,13 @@ class AzureClient(Client):
57
68
  finally:
58
69
  result_queue.put_nowait(None)
59
70
 
71
@classmethod
def version_path(cls, path: str, version_id: Optional[str]) -> str:
    """
    Return ``path`` with its query string set to ``versionid=<version_id>``.

    A falsy ``version_id`` produces an empty query string. Any query already
    present on ``path`` is replaced, not merged.

    Raises:
        ValueError: if ``path`` already carries a ``versionid`` query parameter.
    """
    scheme, netloc, url_path, query, fragment = urlsplit(path)
    if "versionid" in parse_qs(query):
        raise ValueError("path already includes a version query")
    new_query = f"versionid={version_id}" if version_id else ""
    return urlunsplit((scheme, netloc, url_path, new_query, fragment))
79
+
60
80
  _fetch_default = _fetch_flat