datachain 0.14.0__tar.gz → 0.14.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (354)
  1. {datachain-0.14.0/src/datachain.egg-info → datachain-0.14.1}/PKG-INFO +2 -2
  2. {datachain-0.14.0 → datachain-0.14.1}/pyproject.toml +3 -2
  3. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/catalog/catalog.py +1 -1
  4. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/fsspec.py +3 -3
  5. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/json.py +1 -1
  6. datachain-0.14.1/src/datachain/lib/dc/storage.py +170 -0
  7. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/dataset.py +39 -16
  8. {datachain-0.14.0 → datachain-0.14.1/src/datachain.egg-info}/PKG-INFO +2 -2
  9. {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/test_ls.py +1 -1
  10. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_datachain.py +85 -5
  11. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_ls.py +1 -1
  12. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_datachain.py +4 -4
  13. datachain-0.14.0/src/datachain/lib/dc/storage.py +0 -118
  14. {datachain-0.14.0 → datachain-0.14.1}/.cruft.json +0 -0
  15. {datachain-0.14.0 → datachain-0.14.1}/.gitattributes +0 -0
  16. {datachain-0.14.0 → datachain-0.14.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  17. {datachain-0.14.0 → datachain-0.14.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  18. {datachain-0.14.0 → datachain-0.14.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  19. {datachain-0.14.0 → datachain-0.14.1}/.github/codecov.yaml +0 -0
  20. {datachain-0.14.0 → datachain-0.14.1}/.github/dependabot.yml +0 -0
  21. {datachain-0.14.0 → datachain-0.14.1}/.github/workflows/benchmarks.yml +0 -0
  22. {datachain-0.14.0 → datachain-0.14.1}/.github/workflows/release.yml +0 -0
  23. {datachain-0.14.0 → datachain-0.14.1}/.github/workflows/tests-studio.yml +0 -0
  24. {datachain-0.14.0 → datachain-0.14.1}/.github/workflows/tests.yml +0 -0
  25. {datachain-0.14.0 → datachain-0.14.1}/.github/workflows/update-template.yaml +0 -0
  26. {datachain-0.14.0 → datachain-0.14.1}/.gitignore +0 -0
  27. {datachain-0.14.0 → datachain-0.14.1}/.pre-commit-config.yaml +0 -0
  28. {datachain-0.14.0 → datachain-0.14.1}/CODE_OF_CONDUCT.rst +0 -0
  29. {datachain-0.14.0 → datachain-0.14.1}/LICENSE +0 -0
  30. {datachain-0.14.0 → datachain-0.14.1}/README.rst +0 -0
  31. {datachain-0.14.0 → datachain-0.14.1}/docs/assets/captioned_cartoons.png +0 -0
  32. {datachain-0.14.0 → datachain-0.14.1}/docs/assets/datachain-white.svg +0 -0
  33. {datachain-0.14.0 → datachain-0.14.1}/docs/assets/datachain.svg +0 -0
  34. {datachain-0.14.0 → datachain-0.14.1}/docs/contributing.md +0 -0
  35. {datachain-0.14.0 → datachain-0.14.1}/docs/css/github-permalink-style.css +0 -0
  36. {datachain-0.14.0 → datachain-0.14.1}/docs/examples.md +0 -0
  37. {datachain-0.14.0 → datachain-0.14.1}/docs/index.md +0 -0
  38. {datachain-0.14.0 → datachain-0.14.1}/docs/overrides/main.html +0 -0
  39. {datachain-0.14.0 → datachain-0.14.1}/docs/quick-start.md +0 -0
  40. {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/arrowrow.md +0 -0
  41. {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/bbox.md +0 -0
  42. {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/file.md +0 -0
  43. {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/imagefile.md +0 -0
  44. {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/index.md +0 -0
  45. {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/pose.md +0 -0
  46. {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/segment.md +0 -0
  47. {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/tarvfile.md +0 -0
  48. {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/textfile.md +0 -0
  49. {datachain-0.14.0 → datachain-0.14.1}/docs/references/data-types/videofile.md +0 -0
  50. {datachain-0.14.0 → datachain-0.14.1}/docs/references/datachain.md +0 -0
  51. {datachain-0.14.0 → datachain-0.14.1}/docs/references/func.md +0 -0
  52. {datachain-0.14.0 → datachain-0.14.1}/docs/references/index.md +0 -0
  53. {datachain-0.14.0 → datachain-0.14.1}/docs/references/remotes.md +0 -0
  54. {datachain-0.14.0 → datachain-0.14.1}/docs/references/toolkit.md +0 -0
  55. {datachain-0.14.0 → datachain-0.14.1}/docs/references/torch.md +0 -0
  56. {datachain-0.14.0 → datachain-0.14.1}/docs/references/udf.md +0 -0
  57. {datachain-0.14.0 → datachain-0.14.1}/docs/tutorials.md +0 -0
  58. {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  59. {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  60. {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/openimage-detect.py +0 -0
  61. {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
  62. {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/ultralytics-pose.py +0 -0
  63. {datachain-0.14.0 → datachain-0.14.1}/examples/computer_vision/ultralytics-segment.py +0 -0
  64. {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/common_sql_functions.py +0 -0
  65. {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/json-csv-reader.py +0 -0
  66. {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/torch-loader.py +0 -0
  67. {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/udfs/parallel.py +0 -0
  68. {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/udfs/simple.py +0 -0
  69. {datachain-0.14.0 → datachain-0.14.1}/examples/get_started/udfs/stateful.py +0 -0
  70. {datachain-0.14.0 → datachain-0.14.1}/examples/llm_and_nlp/claude-query.py +0 -0
  71. {datachain-0.14.0 → datachain-0.14.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  72. {datachain-0.14.0 → datachain-0.14.1}/examples/multimodal/clip_inference.py +0 -0
  73. {datachain-0.14.0 → datachain-0.14.1}/examples/multimodal/hf_pipeline.py +0 -0
  74. {datachain-0.14.0 → datachain-0.14.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  75. {datachain-0.14.0 → datachain-0.14.1}/examples/multimodal/wds.py +0 -0
  76. {datachain-0.14.0 → datachain-0.14.1}/examples/multimodal/wds_filtered.py +0 -0
  77. {datachain-0.14.0 → datachain-0.14.1}/mkdocs.yml +0 -0
  78. {datachain-0.14.0 → datachain-0.14.1}/noxfile.py +0 -0
  79. {datachain-0.14.0 → datachain-0.14.1}/setup.cfg +0 -0
  80. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/__init__.py +0 -0
  81. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/__main__.py +0 -0
  82. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/asyn.py +0 -0
  83. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cache.py +0 -0
  84. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/catalog/__init__.py +0 -0
  85. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/catalog/datasource.py +0 -0
  86. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/catalog/loader.py +0 -0
  87. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/__init__.py +0 -0
  88. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/__init__.py +0 -0
  89. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/datasets.py +0 -0
  90. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/du.py +0 -0
  91. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/index.py +0 -0
  92. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/ls.py +0 -0
  93. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/misc.py +0 -0
  94. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/query.py +0 -0
  95. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/commands/show.py +0 -0
  96. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/parser/__init__.py +0 -0
  97. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/parser/job.py +0 -0
  98. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/parser/studio.py +0 -0
  99. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/parser/utils.py +0 -0
  100. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/cli/utils.py +0 -0
  101. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/__init__.py +0 -0
  102. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/azure.py +0 -0
  103. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/fileslice.py +0 -0
  104. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/gcs.py +0 -0
  105. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/hf.py +0 -0
  106. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/local.py +0 -0
  107. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/client/s3.py +0 -0
  108. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/config.py +0 -0
  109. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/__init__.py +0 -0
  110. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/db_engine.py +0 -0
  111. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/job.py +0 -0
  112. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/metastore.py +0 -0
  113. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/schema.py +0 -0
  114. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/serializer.py +0 -0
  115. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/sqlite.py +0 -0
  116. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/data_storage/warehouse.py +0 -0
  117. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/dataset.py +0 -0
  118. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/diff/__init__.py +0 -0
  119. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/error.py +0 -0
  120. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/fs/__init__.py +0 -0
  121. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/fs/reference.py +0 -0
  122. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/fs/utils.py +0 -0
  123. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/__init__.py +0 -0
  124. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/aggregate.py +0 -0
  125. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/array.py +0 -0
  126. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/base.py +0 -0
  127. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/conditional.py +0 -0
  128. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/func.py +0 -0
  129. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/numeric.py +0 -0
  130. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/path.py +0 -0
  131. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/random.py +0 -0
  132. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/string.py +0 -0
  133. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/func/window.py +0 -0
  134. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/job.py +0 -0
  135. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/__init__.py +0 -0
  136. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/arrow.py +0 -0
  137. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/clip.py +0 -0
  138. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/__init__.py +0 -0
  139. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/flatten.py +0 -0
  140. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  141. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
  142. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/unflatten.py +0 -0
  143. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  144. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/data_model.py +0 -0
  145. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dataset_info.py +0 -0
  146. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/__init__.py +0 -0
  147. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/csv.py +0 -0
  148. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/datachain.py +0 -0
  149. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/datasets.py +0 -0
  150. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/hf.py +0 -0
  151. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/listings.py +0 -0
  152. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/pandas.py +0 -0
  153. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/parquet.py +0 -0
  154. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/records.py +0 -0
  155. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/utils.py +0 -0
  156. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/dc/values.py +0 -0
  157. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/file.py +0 -0
  158. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/hf.py +0 -0
  159. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/image.py +0 -0
  160. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/listing.py +0 -0
  161. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/listing_info.py +0 -0
  162. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/meta_formats.py +0 -0
  163. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/model_store.py +0 -0
  164. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/pytorch.py +0 -0
  165. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/settings.py +0 -0
  166. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/signal_schema.py +0 -0
  167. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/tar.py +0 -0
  168. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/text.py +0 -0
  169. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/udf.py +0 -0
  170. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/udf_signature.py +0 -0
  171. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/utils.py +0 -0
  172. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/video.py +0 -0
  173. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/webdataset.py +0 -0
  174. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/lib/webdataset_laion.py +0 -0
  175. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/listing.py +0 -0
  176. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/__init__.py +0 -0
  177. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/bbox.py +0 -0
  178. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/pose.py +0 -0
  179. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/segment.py +0 -0
  180. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/ultralytics/__init__.py +0 -0
  181. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/ultralytics/bbox.py +0 -0
  182. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/ultralytics/pose.py +0 -0
  183. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/ultralytics/segment.py +0 -0
  184. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/model/utils.py +0 -0
  185. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/node.py +0 -0
  186. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/nodes_fetcher.py +0 -0
  187. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/nodes_thread_pool.py +0 -0
  188. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/progress.py +0 -0
  189. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/py.typed +0 -0
  190. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/__init__.py +0 -0
  191. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/batch.py +0 -0
  192. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/dispatch.py +0 -0
  193. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/metrics.py +0 -0
  194. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/params.py +0 -0
  195. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/queue.py +0 -0
  196. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/schema.py +0 -0
  197. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/session.py +0 -0
  198. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/udf.py +0 -0
  199. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/query/utils.py +0 -0
  200. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/remote/__init__.py +0 -0
  201. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/remote/studio.py +0 -0
  202. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/script_meta.py +0 -0
  203. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/__init__.py +0 -0
  204. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/default/__init__.py +0 -0
  205. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/default/base.py +0 -0
  206. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/__init__.py +0 -0
  207. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/aggregate.py +0 -0
  208. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/array.py +0 -0
  209. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/conditional.py +0 -0
  210. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/numeric.py +0 -0
  211. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/path.py +0 -0
  212. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/random.py +0 -0
  213. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/functions/string.py +0 -0
  214. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/selectable.py +0 -0
  215. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  216. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/sqlite/base.py +0 -0
  217. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/sqlite/types.py +0 -0
  218. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/sqlite/vector.py +0 -0
  219. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/types.py +0 -0
  220. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/sql/utils.py +0 -0
  221. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/studio.py +0 -0
  222. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/telemetry.py +0 -0
  223. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/toolkit/__init__.py +0 -0
  224. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/toolkit/split.py +0 -0
  225. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/torch/__init__.py +0 -0
  226. {datachain-0.14.0 → datachain-0.14.1}/src/datachain/utils.py +0 -0
  227. {datachain-0.14.0 → datachain-0.14.1}/src/datachain.egg-info/SOURCES.txt +0 -0
  228. {datachain-0.14.0 → datachain-0.14.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  229. {datachain-0.14.0 → datachain-0.14.1}/src/datachain.egg-info/entry_points.txt +0 -0
  230. {datachain-0.14.0 → datachain-0.14.1}/src/datachain.egg-info/requires.txt +0 -0
  231. {datachain-0.14.0 → datachain-0.14.1}/src/datachain.egg-info/top_level.txt +0 -0
  232. {datachain-0.14.0 → datachain-0.14.1}/tests/__init__.py +0 -0
  233. {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/__init__.py +0 -0
  234. {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/conftest.py +0 -0
  235. {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  236. {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  237. {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/datasets/.gitignore +0 -0
  238. {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  239. {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/test_datachain.py +0 -0
  240. {datachain-0.14.0 → datachain-0.14.1}/tests/benchmarks/test_version.py +0 -0
  241. {datachain-0.14.0 → datachain-0.14.1}/tests/conftest.py +0 -0
  242. {datachain-0.14.0 → datachain-0.14.1}/tests/data.py +0 -0
  243. {datachain-0.14.0 → datachain-0.14.1}/tests/examples/__init__.py +0 -0
  244. {datachain-0.14.0 → datachain-0.14.1}/tests/examples/test_examples.py +0 -0
  245. {datachain-0.14.0 → datachain-0.14.1}/tests/examples/test_wds_e2e.py +0 -0
  246. {datachain-0.14.0 → datachain-0.14.1}/tests/examples/wds_data.py +0 -0
  247. {datachain-0.14.0 → datachain-0.14.1}/tests/func/__init__.py +0 -0
  248. {datachain-0.14.0 → datachain-0.14.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  249. {datachain-0.14.0 → datachain-0.14.1}/tests/func/data/lena.jpg +0 -0
  250. {datachain-0.14.0 → datachain-0.14.1}/tests/func/fake-service-account-credentials.json +0 -0
  251. {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/__init__.py +0 -0
  252. {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/data/running-mask0.png +0 -0
  253. {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/data/running-mask1.png +0 -0
  254. {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/data/running.jpg +0 -0
  255. {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/data/ships.jpg +0 -0
  256. {datachain-0.14.0 → datachain-0.14.1}/tests/func/model/test_yolo.py +0 -0
  257. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_catalog.py +0 -0
  258. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_client.py +0 -0
  259. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_cloud_transfer.py +0 -0
  260. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_data_storage.py +0 -0
  261. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_datachain_merge.py +0 -0
  262. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_dataset_query.py +0 -0
  263. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_datasets.py +0 -0
  264. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_feature_pickling.py +0 -0
  265. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_file.py +0 -0
  266. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_hf.py +0 -0
  267. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_hidden_field.py +0 -0
  268. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_image.py +0 -0
  269. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_listing.py +0 -0
  270. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_meta_formats.py +0 -0
  271. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_metrics.py +0 -0
  272. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_pull.py +0 -0
  273. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_pytorch.py +0 -0
  274. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_query.py +0 -0
  275. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_session.py +0 -0
  276. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_toolkit.py +0 -0
  277. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_video.py +0 -0
  278. {datachain-0.14.0 → datachain-0.14.1}/tests/func/test_warehouse.py +0 -0
  279. {datachain-0.14.0 → datachain-0.14.1}/tests/scripts/feature_class.py +0 -0
  280. {datachain-0.14.0 → datachain-0.14.1}/tests/scripts/feature_class_exception.py +0 -0
  281. {datachain-0.14.0 → datachain-0.14.1}/tests/scripts/feature_class_parallel.py +0 -0
  282. {datachain-0.14.0 → datachain-0.14.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  283. {datachain-0.14.0 → datachain-0.14.1}/tests/scripts/name_len_slow.py +0 -0
  284. {datachain-0.14.0 → datachain-0.14.1}/tests/test_atomicity.py +0 -0
  285. {datachain-0.14.0 → datachain-0.14.1}/tests/test_cli_e2e.py +0 -0
  286. {datachain-0.14.0 → datachain-0.14.1}/tests/test_cli_studio.py +0 -0
  287. {datachain-0.14.0 → datachain-0.14.1}/tests/test_import_time.py +0 -0
  288. {datachain-0.14.0 → datachain-0.14.1}/tests/test_query_e2e.py +0 -0
  289. {datachain-0.14.0 → datachain-0.14.1}/tests/test_telemetry.py +0 -0
  290. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/__init__.py +0 -0
  291. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/__init__.py +0 -0
  292. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/conftest.py +0 -0
  293. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_arrow.py +0 -0
  294. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_clip.py +0 -0
  295. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  296. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  297. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_diff.py +0 -0
  298. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_feature.py +0 -0
  299. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_feature_utils.py +0 -0
  300. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_file.py +0 -0
  301. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_hf.py +0 -0
  302. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_image.py +0 -0
  303. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_listing_info.py +0 -0
  304. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_python_to_sql.py +0 -0
  305. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_schema.py +0 -0
  306. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_signal_schema.py +0 -0
  307. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_sql_to_python.py +0 -0
  308. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_text.py +0 -0
  309. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_udf_signature.py +0 -0
  310. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_utils.py +0 -0
  311. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/lib/test_webdataset.py +0 -0
  312. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/model/__init__.py +0 -0
  313. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/model/test_bbox.py +0 -0
  314. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/model/test_pose.py +0 -0
  315. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/model/test_segment.py +0 -0
  316. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/model/test_utils.py +0 -0
  317. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/__init__.py +0 -0
  318. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  319. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/sqlite/test_types.py +0 -0
  320. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  321. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_array.py +0 -0
  322. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_conditional.py +0 -0
  323. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_path.py +0 -0
  324. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_random.py +0 -0
  325. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_selectable.py +0 -0
  326. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/sql/test_string.py +0 -0
  327. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_asyn.py +0 -0
  328. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_cache.py +0 -0
  329. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_catalog.py +0 -0
  330. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_catalog_loader.py +0 -0
  331. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_cli_parsing.py +0 -0
  332. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_client.py +0 -0
  333. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_client_gcs.py +0 -0
  334. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_client_s3.py +0 -0
  335. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_config.py +0 -0
  336. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_data_storage.py +0 -0
  337. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_database_engine.py +0 -0
  338. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_dataset.py +0 -0
  339. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_dispatch.py +0 -0
  340. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_fileslice.py +0 -0
  341. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_func.py +0 -0
  342. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_listing.py +0 -0
  343. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_metastore.py +0 -0
  344. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_module_exports.py +0 -0
  345. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_pytorch.py +0 -0
  346. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_query.py +0 -0
  347. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_query_metrics.py +0 -0
  348. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_query_params.py +0 -0
  349. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_script_meta.py +0 -0
  350. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_serializer.py +0 -0
  351. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_session.py +0 -0
  352. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_utils.py +0 -0
  353. {datachain-0.14.0 → datachain-0.14.1}/tests/unit/test_warehouse.py +0 -0
  354. {datachain-0.14.0 → datachain-0.14.1}/tests/utils.py +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.0
3
+ Version: 0.14.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
- License: Apache-2.0
6
+ License-Expression: Apache-2.0
7
7
  Project-URL: Documentation, https://datachain.dvc.ai
8
8
  Project-URL: Issues, https://github.com/iterative/datachain/issues
9
9
  Project-URL: Source, https://github.com/iterative/datachain
@@ -1,12 +1,13 @@
1
1
  [build-system]
2
- requires = ["setuptools>=48", "setuptools_scm[toml]>=6.3.1"]
2
+ requires = ["setuptools>=77", "setuptools_scm[toml]>=6.3.1"]
3
3
  build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "datachain"
7
7
  description = "Wrangle unstructured AI data at scale"
8
8
  readme = "README.rst"
9
- license = {text = "Apache-2.0"}
9
+ license = "Apache-2.0"
10
+ license-files = ["LICENSE"]
10
11
  authors = [{name = "Dmitry Petrov", email = "support@dvc.org"}]
11
12
  classifiers = [
12
13
  "Programming Language :: Python :: 3",
@@ -588,7 +588,7 @@ class Catalog:
588
588
 
589
589
  from_storage(
590
590
  source, session=self.session, update=update, object_name=object_name
591
- )
591
+ ).exec()
592
592
 
593
593
  list_ds_name, list_uri, list_path, _ = get_listing(
594
594
  source, self.session, update=update
@@ -89,9 +89,9 @@ class Client(ABC):
89
89
  from .local import FileClient
90
90
  from .s3 import ClientS3
91
91
 
92
- protocol = urlparse(str(url)).scheme
92
+ protocol = urlparse(os.fspath(url)).scheme
93
93
 
94
- if not protocol or _is_win_local_path(str(url)):
94
+ if not protocol or _is_win_local_path(os.fspath(url)):
95
95
  return FileClient
96
96
  if protocol == ClientS3.protocol:
97
97
  return ClientS3
@@ -122,7 +122,7 @@ class Client(ABC):
122
122
  source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
123
123
  ) -> "Client":
124
124
  cls = Client.get_implementation(source)
125
- storage_url, _ = cls.split_url(str(source))
125
+ storage_url, _ = cls.split_url(os.fspath(source))
126
126
  if os.name == "nt":
127
127
  storage_url = storage_url.removeprefix("/")
128
128
 
@@ -64,7 +64,7 @@ def from_json(
64
64
  from .storage import from_storage
65
65
 
66
66
  if schema_from == "auto":
67
- schema_from = str(path)
67
+ schema_from = os.fspath(path)
68
68
 
69
69
  def jmespath_to_name(s: str):
70
70
  name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s) # type: ignore[union-attr]
@@ -0,0 +1,170 @@
1
+ import os.path
2
+ from typing import (
3
+ TYPE_CHECKING,
4
+ Optional,
5
+ Union,
6
+ )
7
+
8
+ from datachain.lib.file import (
9
+ FileType,
10
+ get_file_type,
11
+ )
12
+ from datachain.lib.listing import (
13
+ get_file_info,
14
+ get_listing,
15
+ list_bucket,
16
+ ls,
17
+ )
18
+ from datachain.query import Session
19
+
20
+ if TYPE_CHECKING:
21
+ from .datachain import DataChain
22
+
23
+
24
+ def from_storage(
25
+ uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
26
+ *,
27
+ type: FileType = "binary",
28
+ session: Optional[Session] = None,
29
+ settings: Optional[dict] = None,
30
+ in_memory: bool = False,
31
+ recursive: Optional[bool] = True,
32
+ object_name: str = "file",
33
+ update: bool = False,
34
+ anon: bool = False,
35
+ client_config: Optional[dict] = None,
36
+ ) -> "DataChain":
37
+ """Get data from storage(s) as a list of file with all file attributes.
38
+ It returns the chain itself as usual.
39
+
40
+ Parameters:
41
+ uri : storage URI with directory or list of URIs.
42
+ URIs must start with storage prefix such
43
+ as `s3://`, `gs://`, `az://` or "file:///"
44
+ type : read file as "binary", "text", or "image" data. Default is "binary".
45
+ recursive : search recursively for the given path.
46
+ object_name : Created object column name.
47
+ update : force storage reindexing. Default is False.
48
+ anon : If True, treat the cloud bucket as a public one.
49
+ client_config : Optional client configuration for the storage client.
50
+
51
+ Returns:
52
+ DataChain: A DataChain object containing the file information.
53
+
54
+ Examples:
55
+ Simple call from s3:
56
+ ```python
57
+ import datachain as dc
58
+ chain = dc.from_storage("s3://my-bucket/my-dir")
59
+ ```
60
+
61
+ Multiple URIs:
62
+ ```python
63
+ chain = dc.from_storage([
64
+ "s3://bucket1/dir1",
65
+ "s3://bucket2/dir2"
66
+ ])
67
+ ```
68
+
69
+ With AWS S3-compatible storage:
70
+ ```python
71
+ chain = dc.from_storage(
72
+ "s3://my-bucket/my-dir",
73
+ client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
74
+ )
75
+ ```
76
+
77
+ Pass existing session
78
+ ```py
79
+ session = Session.get()
80
+ chain = dc.from_storage([
81
+ "path/to/dir1",
82
+ "path/to/dir2"
83
+ ], session=session, recursive=True)
84
+ ```
85
+
86
+ Note:
87
+ When using multiple URIs with `update=True`, the function optimizes by
88
+ avoiding redundant updates for URIs pointing to the same storage location.
89
+ """
90
+ from .datachain import DataChain
91
+ from .datasets import from_dataset
92
+ from .records import from_records
93
+ from .values import from_values
94
+
95
+ file_type = get_file_type(type)
96
+
97
+ if anon:
98
+ client_config = (client_config or {}) | {"anon": True}
99
+ session = Session.get(session, client_config=client_config, in_memory=in_memory)
100
+ cache = session.catalog.cache
101
+ client_config = session.catalog.client_config
102
+
103
+ uris = uri if isinstance(uri, (list, tuple)) else [uri]
104
+
105
+ if not uris:
106
+ raise ValueError("No URIs provided")
107
+
108
+ storage_chain = None
109
+ listed_ds_name = set()
110
+ file_values = []
111
+
112
+ for single_uri in uris:
113
+ list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
114
+ single_uri, session, update=update
115
+ )
116
+
117
+ # list_ds_name is None if object is a file, we don't want to use cache
118
+ # or do listing in that case - just read that single object
119
+ if not list_ds_name:
120
+ file_values.append(
121
+ get_file_info(list_uri, cache, client_config=client_config)
122
+ )
123
+ continue
124
+
125
+ dc = from_dataset(list_ds_name, session=session, settings=settings)
126
+ dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
127
+
128
+ if update or not list_ds_exists:
129
+
130
+ def lst_fn(ds_name, lst_uri):
131
+ # disable prefetch for listing, as it pre-downloads all files
132
+ (
133
+ from_records(
134
+ DataChain.DEFAULT_FILE_RECORD,
135
+ session=session,
136
+ settings=settings,
137
+ in_memory=in_memory,
138
+ )
139
+ .settings(prefetch=0)
140
+ .gen(
141
+ list_bucket(lst_uri, cache, client_config=client_config),
142
+ output={f"{object_name}": file_type},
143
+ )
144
+ .save(ds_name, listing=True)
145
+ )
146
+
147
+ dc._query.add_before_steps(
148
+ lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
149
+ )
150
+
151
+ chain = ls(dc, list_path, recursive=recursive, object_name=object_name)
152
+
153
+ storage_chain = storage_chain.union(chain) if storage_chain else chain
154
+ listed_ds_name.add(list_ds_name)
155
+
156
+ if file_values:
157
+ file_chain = from_values(
158
+ session=session,
159
+ settings=settings,
160
+ in_memory=in_memory,
161
+ file=file_values,
162
+ )
163
+ file_chain.signals_schema = file_chain.signals_schema.mutate(
164
+ {f"{object_name}": file_type}
165
+ )
166
+ storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain
167
+
168
+ assert storage_chain is not None
169
+
170
+ return storage_chain
@@ -47,6 +47,7 @@ from datachain.error import (
47
47
  QueryScriptCancelError,
48
48
  )
49
49
  from datachain.func.base import Function
50
+ from datachain.lib.listing import is_listing_dataset
50
51
  from datachain.lib.udf import UDFAdapter, _get_cache
51
52
  from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
52
53
  from datachain.query.schema import C, UDFParamSpec, normalize_param
@@ -151,13 +152,6 @@ def step_result(
151
152
  )
152
153
 
153
154
 
154
- class StartingStep(ABC):
155
- """An initial query processing step, referencing a data source."""
156
-
157
- @abstractmethod
158
- def apply(self) -> "StepResult": ...
159
-
160
-
161
155
  @frozen
162
156
  class Step(ABC):
163
157
  """A query processing step (filtering, mutation, etc.)"""
@@ -170,7 +164,7 @@ class Step(ABC):
170
164
 
171
165
 
172
166
  @frozen
173
- class QueryStep(StartingStep):
167
+ class QueryStep:
174
168
  catalog: "Catalog"
175
169
  dataset_name: str
176
170
  dataset_version: int
@@ -1097,26 +1091,42 @@ class DatasetQuery:
1097
1091
  self.temp_table_names: list[str] = []
1098
1092
  self.dependencies: set[DatasetDependencyType] = set()
1099
1093
  self.table = self.get_table()
1100
- self.starting_step: StartingStep
1094
+ self.starting_step: Optional[QueryStep] = None
1101
1095
  self.name: Optional[str] = None
1102
1096
  self.version: Optional[int] = None
1103
1097
  self.feature_schema: Optional[dict] = None
1104
1098
  self.column_types: Optional[dict[str, Any]] = None
1099
+ self.before_steps: list[Callable] = []
1105
1100
 
1106
- self.name = name
1101
+ self.list_ds_name: Optional[str] = None
1107
1102
 
1108
- if fallback_to_studio and is_token_set():
1109
- ds = self.catalog.get_dataset_with_remote_fallback(name, version)
1103
+ self.name = name
1104
+ self.dialect = self.catalog.warehouse.db.dialect
1105
+ if version:
1106
+ self.version = version
1107
+
1108
+ if is_listing_dataset(name):
1109
+ # not setting query step yet as listing dataset might not exist at
1110
+ # this point
1111
+ self.list_ds_name = name
1112
+ elif fallback_to_studio and is_token_set():
1113
+ self._set_starting_step(
1114
+ self.catalog.get_dataset_with_remote_fallback(name, version)
1115
+ )
1110
1116
  else:
1111
- ds = self.catalog.get_dataset(name)
1117
+ self._set_starting_step(self.catalog.get_dataset(name))
1118
+
1119
+ def _set_starting_step(self, ds: "DatasetRecord") -> None:
1120
+ if not self.version:
1121
+ self.version = ds.latest_version
1112
1122
 
1113
- self.version = version or ds.latest_version
1123
+ self.starting_step = QueryStep(self.catalog, ds.name, self.version)
1124
+
1125
+ # at this point we know our starting dataset so setting up schemas
1114
1126
  self.feature_schema = ds.get_version(self.version).feature_schema
1115
1127
  self.column_types = copy(ds.schema)
1116
1128
  if "sys__id" in self.column_types:
1117
1129
  self.column_types.pop("sys__id")
1118
- self.starting_step = QueryStep(self.catalog, name, self.version)
1119
- self.dialect = self.catalog.warehouse.db.dialect
1120
1130
 
1121
1131
  def __iter__(self):
1122
1132
  return iter(self.db_results())
@@ -1180,11 +1190,23 @@ class DatasetQuery:
1180
1190
  col.table = self.table
1181
1191
  return col
1182
1192
 
1193
+ def add_before_steps(self, fn: Callable) -> None:
1194
+ """
1195
+ Register a custom function to be run before the query steps are applied.
1196
+ """
1197
+ self.before_steps.append(fn)
1198
+
1183
1199
  def apply_steps(self) -> QueryGenerator:
1184
1200
  """
1185
1201
  Apply the steps in the query and return the resulting
1186
1202
  sqlalchemy.SelectBase.
1187
1203
  """
1204
+ for fn in self.before_steps:
1205
+ fn()
1206
+
1207
+ if self.list_ds_name:
1208
+ # at this point we know the name of our starting listing dataset
1209
+ self._set_starting_step(self.catalog.get_dataset(self.list_ds_name)) # type: ignore [arg-type]
1188
1210
  query = self.clone()
1189
1211
 
1190
1212
  index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)
@@ -1203,6 +1225,7 @@ class DatasetQuery:
1203
1225
  query = query.filter(C.sys__rand % total == index)
1204
1226
  query.steps = query.steps[-1:] + query.steps[:-1]
1205
1227
 
1228
+ assert query.starting_step
1206
1229
  result = query.starting_step.apply()
1207
1230
  self.dependencies.update(result.dependencies)
1208
1231
 
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.0
3
+ Version: 0.14.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
- License: Apache-2.0
6
+ License-Expression: Apache-2.0
7
7
  Project-URL: Documentation, https://datachain.dvc.ai
8
8
  Project-URL: Issues, https://github.com/iterative/datachain/issues
9
9
  Project-URL: Source, https://github.com/iterative/datachain
@@ -2,5 +2,5 @@ from datachain.cli import ls
2
2
 
3
3
 
4
4
  def test_ls(benchmark, tmp_dir):
5
- bucket = "s3://noaa-bathymetry-pds/"
5
+ bucket = "s3://noaa-dcdb-bathymetry-pds/"
6
6
  benchmark.pedantic(ls, args=([bucket],), kwargs={"client_config": {"anon": True}})
@@ -9,6 +9,7 @@ import uuid
9
9
  from collections.abc import Iterator
10
10
  from datetime import datetime, timedelta, timezone
11
11
  from pathlib import Path
12
+ from unittest.mock import patch
12
13
 
13
14
  import numpy as np
14
15
  import pandas as pd
@@ -152,7 +153,7 @@ def test_from_storage_partials(cloud_test_catalog):
152
153
  return name
153
154
 
154
155
  dogs_uri = f"{src_uri}/dogs"
155
- dc.from_storage(dogs_uri, session=session)
156
+ dc.from_storage(dogs_uri, session=session).exec()
156
157
  assert _get_listing_datasets(session) == [
157
158
  f"{_list_dataset_name(dogs_uri)}@v1",
158
159
  ]
@@ -162,7 +163,7 @@ def test_from_storage_partials(cloud_test_catalog):
162
163
  f"{_list_dataset_name(dogs_uri)}@v1",
163
164
  ]
164
165
 
165
- dc.from_storage(src_uri, session=session)
166
+ dc.from_storage(src_uri, session=session).exec()
166
167
  assert _get_listing_datasets(session) == sorted(
167
168
  [
168
169
  f"{_list_dataset_name(dogs_uri)}@v1",
@@ -170,7 +171,7 @@ def test_from_storage_partials(cloud_test_catalog):
170
171
  ]
171
172
  )
172
173
 
173
- dc.from_storage(f"{src_uri}/cats", session=session)
174
+ dc.from_storage(f"{src_uri}/cats", session=session).exec()
174
175
  assert _get_listing_datasets(session) == sorted(
175
176
  [
176
177
  f"{_list_dataset_name(dogs_uri)}@v1",
@@ -196,14 +197,14 @@ def test_from_storage_partials_with_update(cloud_test_catalog):
196
197
  return name
197
198
 
198
199
  uri = f"{src_uri}/cats"
199
- dc.from_storage(uri, session=session)
200
+ dc.from_storage(uri, session=session).exec()
200
201
  assert _get_listing_datasets(session) == sorted(
201
202
  [
202
203
  f"{_list_dataset_name(uri)}@v1",
203
204
  ]
204
205
  )
205
206
 
206
- dc.from_storage(uri, session=session, update=True)
207
+ dc.from_storage(uri, session=session, update=True).exec()
207
208
  assert _get_listing_datasets(session) == sorted(
208
209
  [
209
210
  f"{_list_dataset_name(uri)}@v1",
@@ -369,6 +370,85 @@ def test_export_images_files(test_session, tmp_dir, tmp_path, use_cache):
369
370
  assert images_equal(img["data"], exported_img)
370
371
 
371
372
 
373
+ @pytest.mark.parametrize("use_cache", [True, False])
374
+ def test_from_storage_multiple_uris_files(test_session, tmp_dir, tmp_path, use_cache):
375
+ images = [
376
+ {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))},
377
+ {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))},
378
+ ]
379
+
380
+ for img in images:
381
+ img["data"].save(tmp_path / img["name"])
382
+
383
+ dc.from_storage(
384
+ [
385
+ f"file://{tmp_path}/img1.jpg",
386
+ f"file://{tmp_path}/img2.jpg",
387
+ ],
388
+ session=test_session,
389
+ anon=True,
390
+ update=True,
391
+ ).to_storage(tmp_dir / "output", placement="filename")
392
+
393
+ for img in images:
394
+ exported_img = Image.open(tmp_dir / "output" / img["name"])
395
+ assert images_equal(img["data"], exported_img)
396
+
397
+ chain = dc.from_storage(
398
+ [
399
+ f"file://{tmp_path}/img1.jpg",
400
+ f"file://{tmp_path}/img2.jpg",
401
+ f"file://{tmp_dir}/output/*",
402
+ ]
403
+ )
404
+ assert chain.count() == 4
405
+
406
+ chain = dc.from_storage([f"file://{tmp_dir}/output/*"])
407
+ assert chain.count() == 2
408
+
409
+
410
+ @pytest.mark.parametrize(
411
+ "cloud_type",
412
+ ["s3", "azure", "gs"],
413
+ indirect=True,
414
+ )
415
+ def test_from_storage_multiple_uris_cache(cloud_test_catalog):
416
+ ctc = cloud_test_catalog
417
+ src_uri = ctc.src_uri
418
+ session = ctc.session
419
+
420
+ with pytest.raises(ValueError):
421
+ dc.from_storage([]) # No URIs provided
422
+
423
+ with patch(
424
+ "datachain.lib.dc.storage.get_listing", wraps=dc.lib.listing.get_listing
425
+ ) as mock_get_listing:
426
+ chain = dc.from_storage(
427
+ [
428
+ f"{src_uri}/cats",
429
+ f"{src_uri}/dogs",
430
+ f"{src_uri}/cats/cat*",
431
+ f"{src_uri}/dogs/dog*",
432
+ ],
433
+ session=session,
434
+ update=True,
435
+ ).exec()
436
+ assert chain.count() == 11
437
+
438
+ files = chain.collect("file")
439
+ assert {f.name for f in files} == {
440
+ "cat1",
441
+ "cat2",
442
+ "dog1",
443
+ "dog2",
444
+ "dog3",
445
+ "dog4",
446
+ }
447
+
448
+ # Verify get_listing was called once per URI (4 URIs -> 4 calls)
449
+ assert mock_get_listing.call_count == 4 # TODO FIX THIS
450
+
451
+
372
452
  def test_from_storage_path_object(test_session, tmp_dir, tmp_path):
373
453
  images = [
374
454
  {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))},
@@ -32,7 +32,7 @@ def test_ls_no_args(cloud_test_catalog, cloud_type, capsys):
32
32
  catalog = session.catalog
33
33
  src = cloud_test_catalog.src_uri
34
34
 
35
- dc.from_storage(src, session=session).collect()
35
+ dc.from_storage(src, session=session).exec()
36
36
  ls([], catalog=catalog)
37
37
  captured = capsys.readouterr()
38
38
  assert captured.out == f"{src}/@v1\n"
@@ -339,7 +339,7 @@ def test_listings(test_session, tmp_dir):
339
339
  df.to_parquet(tmp_dir / "df.parquet")
340
340
 
341
341
  uri = tmp_dir.as_uri()
342
- dc.from_storage(uri, session=test_session)
342
+ dc.from_storage(uri, session=test_session).exec()
343
343
 
344
344
  # check that listing is not returned as normal dataset
345
345
  assert not any(
@@ -370,13 +370,13 @@ def test_listings_reindex(test_session, tmp_dir):
370
370
 
371
371
  uri = tmp_dir.as_uri()
372
372
 
373
- dc.from_storage(uri, session=test_session)
373
+ dc.from_storage(uri, session=test_session).exec()
374
374
  assert len(list(dc.listings(session=test_session).collect("listing"))) == 1
375
375
 
376
- dc.from_storage(uri, session=test_session)
376
+ dc.from_storage(uri, session=test_session).exec()
377
377
  assert len(list(dc.listings(session=test_session).collect("listing"))) == 1
378
378
 
379
- dc.from_storage(uri, session=test_session, update=True)
379
+ dc.from_storage(uri, session=test_session, update=True).exec()
380
380
  listings = list(dc.listings(session=test_session).collect("listing"))
381
381
  assert len(listings) == 2
382
382
  listings.sort(key=lambda lst: lst.version)
@@ -1,118 +0,0 @@
1
- import os.path
2
- from typing import (
3
- TYPE_CHECKING,
4
- Optional,
5
- Union,
6
- )
7
-
8
- from datachain.lib.file import (
9
- File,
10
- FileType,
11
- get_file_type,
12
- )
13
- from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
14
- from datachain.query import Session
15
-
16
- if TYPE_CHECKING:
17
- from .datachain import DataChain
18
-
19
-
20
- def from_storage(
21
- uri: Union[str, os.PathLike[str]],
22
- *,
23
- type: FileType = "binary",
24
- session: Optional[Session] = None,
25
- settings: Optional[dict] = None,
26
- in_memory: bool = False,
27
- recursive: Optional[bool] = True,
28
- object_name: str = "file",
29
- update: bool = False,
30
- anon: bool = False,
31
- client_config: Optional[dict] = None,
32
- ) -> "DataChain":
33
- """Get data from a storage as a list of file with all file attributes.
34
- It returns the chain itself as usual.
35
-
36
- Parameters:
37
- uri : storage URI with directory. URI must start with storage prefix such
38
- as `s3://`, `gs://`, `az://` or "file:///"
39
- type : read file as "binary", "text", or "image" data. Default is "binary".
40
- recursive : search recursively for the given path.
41
- object_name : Created object column name.
42
- update : force storage reindexing. Default is False.
43
- anon : If True, we will treat cloud bucket as public one
44
- client_config : Optional client configuration for the storage client.
45
-
46
- Example:
47
- Simple call from s3
48
- ```py
49
- import datachain as dc
50
- chain = dc.from_storage("s3://my-bucket/my-dir")
51
- ```
52
-
53
- With AWS S3-compatible storage
54
- ```py
55
- import datachain as dc
56
- chain = dc.from_storage(
57
- "s3://my-bucket/my-dir",
58
- client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
59
- )
60
- ```
61
-
62
- Pass existing session
63
- ```py
64
- session = Session.get()
65
- import datachain as dc
66
- chain = dc.from_storage("s3://my-bucket/my-dir", session=session)
67
- ```
68
- """
69
- from .datachain import DataChain
70
- from .datasets import from_dataset
71
- from .records import from_records
72
- from .values import from_values
73
-
74
- file_type = get_file_type(type)
75
-
76
- if anon:
77
- client_config = (client_config or {}) | {"anon": True}
78
- session = Session.get(session, client_config=client_config, in_memory=in_memory)
79
- cache = session.catalog.cache
80
- client_config = session.catalog.client_config
81
-
82
- list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
83
- uri, session, update=update
84
- )
85
-
86
- # ds_name is None if object is a file, we don't want to use cache
87
- # or do listing in that case - just read that single object
88
- if not list_ds_name:
89
- dc = from_values(
90
- session=session,
91
- settings=settings,
92
- in_memory=in_memory,
93
- file=[get_file_info(list_uri, cache, client_config=client_config)],
94
- )
95
- dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
96
- return dc
97
-
98
- if update or not list_ds_exists:
99
- # disable prefetch for listing, as it pre-downloads all files
100
- (
101
- from_records(
102
- DataChain.DEFAULT_FILE_RECORD,
103
- session=session,
104
- settings=settings,
105
- in_memory=in_memory,
106
- )
107
- .settings(prefetch=0)
108
- .gen(
109
- list_bucket(list_uri, cache, client_config=client_config),
110
- output={f"{object_name}": File},
111
- )
112
- .save(list_ds_name, listing=True)
113
- )
114
-
115
- dc = from_dataset(list_ds_name, session=session, settings=settings)
116
- dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
117
-
118
- return ls(dc, list_path, recursive=recursive, object_name=object_name)
File without changes
File without changes