datachain 0.3.13__tar.gz → 0.3.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic; see the registry's advisory page for more details.

Files changed (246)
  1. {datachain-0.3.13/src/datachain.egg-info → datachain-0.3.15}/PKG-INFO +1 -1
  2. {datachain-0.3.13 → datachain-0.3.15}/examples/computer_vision/iptc_exif_xmp_lib.py +7 -1
  3. {datachain-0.3.13 → datachain-0.3.15}/examples/computer_vision/llava2_image_desc_lib.py +7 -1
  4. {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/json-csv-reader.py +0 -2
  5. {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/torch-loader.py +6 -1
  6. {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/udfs/stateful.py +2 -2
  7. {datachain-0.3.13 → datachain-0.3.15}/noxfile.py +1 -0
  8. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/asyn.py +4 -9
  9. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/catalog/catalog.py +20 -31
  10. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/azure.py +1 -13
  11. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/fsspec.py +16 -15
  12. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/gcs.py +2 -13
  13. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/hf.py +0 -10
  14. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/local.py +3 -12
  15. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/s3.py +9 -19
  16. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/sqlite.py +10 -1
  17. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/warehouse.py +11 -17
  18. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/dataset.py +1 -1
  19. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/arrow.py +51 -16
  20. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/dc.py +7 -2
  21. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/file.py +76 -2
  22. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/hf.py +23 -6
  23. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/listing.py +8 -7
  24. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/listing_info.py +2 -2
  25. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/model_store.py +2 -2
  26. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/pytorch.py +32 -26
  27. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/signal_schema.py +157 -60
  28. datachain-0.3.15/src/datachain/lib/tar.py +33 -0
  29. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/webdataset.py +3 -59
  30. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/listing.py +6 -8
  31. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/node.py +0 -43
  32. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/dataset.py +2 -6
  33. {datachain-0.3.13 → datachain-0.3.15/src/datachain.egg-info}/PKG-INFO +1 -1
  34. {datachain-0.3.13 → datachain-0.3.15}/src/datachain.egg-info/SOURCES.txt +1 -0
  35. {datachain-0.3.13 → datachain-0.3.15}/tests/conftest.py +35 -0
  36. {datachain-0.3.13 → datachain-0.3.15}/tests/data.py +11 -11
  37. {datachain-0.3.13 → datachain-0.3.15}/tests/examples/test_wds_e2e.py +10 -8
  38. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_catalog.py +28 -3
  39. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_datachain.py +198 -5
  40. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_dataset_query.py +6 -205
  41. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_datasets.py +4 -3
  42. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_feature_pickling.py +66 -1
  43. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_listing.py +2 -1
  44. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_pull.py +1 -2
  45. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_query.py +3 -0
  46. {datachain-0.3.13 → datachain-0.3.15}/tests/test_cli_e2e.py +10 -3
  47. {datachain-0.3.13 → datachain-0.3.15}/tests/test_query_e2e.py +10 -3
  48. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_arrow.py +24 -5
  49. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_datachain.py +3 -2
  50. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_datachain_bootstrap.py +38 -19
  51. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_file.py +84 -1
  52. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_hf.py +8 -8
  53. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_signal_schema.py +260 -8
  54. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_client.py +32 -24
  55. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_id_generator.py +3 -1
  56. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_listing.py +3 -2
  57. {datachain-0.3.13 → datachain-0.3.15}/.cruft.json +0 -0
  58. {datachain-0.3.13 → datachain-0.3.15}/.gitattributes +0 -0
  59. {datachain-0.3.13 → datachain-0.3.15}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  60. {datachain-0.3.13 → datachain-0.3.15}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  61. {datachain-0.3.13 → datachain-0.3.15}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  62. {datachain-0.3.13 → datachain-0.3.15}/.github/codecov.yaml +0 -0
  63. {datachain-0.3.13 → datachain-0.3.15}/.github/dependabot.yml +0 -0
  64. {datachain-0.3.13 → datachain-0.3.15}/.github/workflows/benchmarks.yml +0 -0
  65. {datachain-0.3.13 → datachain-0.3.15}/.github/workflows/release.yml +0 -0
  66. {datachain-0.3.13 → datachain-0.3.15}/.github/workflows/tests-studio.yml +0 -0
  67. {datachain-0.3.13 → datachain-0.3.15}/.github/workflows/tests.yml +0 -0
  68. {datachain-0.3.13 → datachain-0.3.15}/.github/workflows/update-template.yaml +0 -0
  69. {datachain-0.3.13 → datachain-0.3.15}/.gitignore +0 -0
  70. {datachain-0.3.13 → datachain-0.3.15}/.pre-commit-config.yaml +0 -0
  71. {datachain-0.3.13 → datachain-0.3.15}/CODE_OF_CONDUCT.rst +0 -0
  72. {datachain-0.3.13 → datachain-0.3.15}/CONTRIBUTING.rst +0 -0
  73. {datachain-0.3.13 → datachain-0.3.15}/LICENSE +0 -0
  74. {datachain-0.3.13 → datachain-0.3.15}/README.rst +0 -0
  75. {datachain-0.3.13 → datachain-0.3.15}/docs/assets/captioned_cartoons.png +0 -0
  76. {datachain-0.3.13 → datachain-0.3.15}/docs/assets/datachain-white.svg +0 -0
  77. {datachain-0.3.13 → datachain-0.3.15}/docs/assets/datachain.svg +0 -0
  78. {datachain-0.3.13 → datachain-0.3.15}/docs/assets/flowchart.png +0 -0
  79. {datachain-0.3.13 → datachain-0.3.15}/docs/index.md +0 -0
  80. {datachain-0.3.13 → datachain-0.3.15}/docs/references/datachain.md +0 -0
  81. {datachain-0.3.13 → datachain-0.3.15}/docs/references/datatype.md +0 -0
  82. {datachain-0.3.13 → datachain-0.3.15}/docs/references/file.md +0 -0
  83. {datachain-0.3.13 → datachain-0.3.15}/docs/references/index.md +0 -0
  84. {datachain-0.3.13 → datachain-0.3.15}/docs/references/sql.md +0 -0
  85. {datachain-0.3.13 → datachain-0.3.15}/docs/references/torch.md +0 -0
  86. {datachain-0.3.13 → datachain-0.3.15}/docs/references/udf.md +0 -0
  87. {datachain-0.3.13 → datachain-0.3.15}/examples/computer_vision/openimage-detect.py +0 -0
  88. {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/common_sql_functions.py +0 -0
  89. {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/udfs/parallel.py +0 -0
  90. {datachain-0.3.13 → datachain-0.3.15}/examples/get_started/udfs/simple.py +0 -0
  91. {datachain-0.3.13 → datachain-0.3.15}/examples/llm_and_nlp/claude-query.py +0 -0
  92. {datachain-0.3.13 → datachain-0.3.15}/examples/llm_and_nlp/unstructured-text.py +0 -0
  93. {datachain-0.3.13 → datachain-0.3.15}/examples/multimodal/clip_inference.py +0 -0
  94. {datachain-0.3.13 → datachain-0.3.15}/examples/multimodal/hf_pipeline.py +0 -0
  95. {datachain-0.3.13 → datachain-0.3.15}/examples/multimodal/openai_image_desc_lib.py +0 -0
  96. {datachain-0.3.13 → datachain-0.3.15}/examples/multimodal/wds.py +0 -0
  97. {datachain-0.3.13 → datachain-0.3.15}/examples/multimodal/wds_filtered.py +0 -0
  98. {datachain-0.3.13 → datachain-0.3.15}/mkdocs.yml +0 -0
  99. {datachain-0.3.13 → datachain-0.3.15}/pyproject.toml +0 -0
  100. {datachain-0.3.13 → datachain-0.3.15}/setup.cfg +0 -0
  101. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/__init__.py +0 -0
  102. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/__main__.py +0 -0
  103. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/cache.py +0 -0
  104. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/catalog/__init__.py +0 -0
  105. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/catalog/datasource.py +0 -0
  106. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/catalog/loader.py +0 -0
  107. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/cli.py +0 -0
  108. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/cli_utils.py +0 -0
  109. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/__init__.py +0 -0
  110. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/client/fileslice.py +0 -0
  111. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/config.py +0 -0
  112. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/__init__.py +0 -0
  113. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/db_engine.py +0 -0
  114. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/id_generator.py +0 -0
  115. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/job.py +0 -0
  116. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/metastore.py +0 -0
  117. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/schema.py +0 -0
  118. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/data_storage/serializer.py +0 -0
  119. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/error.py +0 -0
  120. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/job.py +0 -0
  121. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/__init__.py +0 -0
  122. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/clip.py +0 -0
  123. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/__init__.py +0 -0
  124. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/flatten.py +0 -0
  125. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/python_to_sql.py +0 -0
  126. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/sql_to_python.py +0 -0
  127. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/unflatten.py +0 -0
  128. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  129. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/data_model.py +0 -0
  130. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/dataset_info.py +0 -0
  131. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/image.py +0 -0
  132. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/meta_formats.py +0 -0
  133. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/settings.py +0 -0
  134. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/text.py +0 -0
  135. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/udf.py +0 -0
  136. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/udf_signature.py +0 -0
  137. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/utils.py +0 -0
  138. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/vfile.py +0 -0
  139. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/lib/webdataset_laion.py +0 -0
  140. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/nodes_fetcher.py +0 -0
  141. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/nodes_thread_pool.py +0 -0
  142. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/progress.py +0 -0
  143. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/py.typed +0 -0
  144. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/__init__.py +0 -0
  145. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/batch.py +0 -0
  146. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/builtins.py +0 -0
  147. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/dispatch.py +0 -0
  148. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/metrics.py +0 -0
  149. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/params.py +0 -0
  150. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/queue.py +0 -0
  151. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/schema.py +0 -0
  152. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/session.py +0 -0
  153. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/query/udf.py +0 -0
  154. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/remote/__init__.py +0 -0
  155. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/remote/studio.py +0 -0
  156. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/__init__.py +0 -0
  157. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/default/__init__.py +0 -0
  158. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/default/base.py +0 -0
  159. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/__init__.py +0 -0
  160. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/array.py +0 -0
  161. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/conditional.py +0 -0
  162. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/path.py +0 -0
  163. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/random.py +0 -0
  164. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/functions/string.py +0 -0
  165. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/selectable.py +0 -0
  166. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/sqlite/__init__.py +0 -0
  167. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/sqlite/base.py +0 -0
  168. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/sqlite/types.py +0 -0
  169. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/sqlite/vector.py +0 -0
  170. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/types.py +0 -0
  171. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/sql/utils.py +0 -0
  172. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/storage.py +0 -0
  173. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/torch/__init__.py +0 -0
  174. {datachain-0.3.13 → datachain-0.3.15}/src/datachain/utils.py +0 -0
  175. {datachain-0.3.13 → datachain-0.3.15}/src/datachain.egg-info/dependency_links.txt +0 -0
  176. {datachain-0.3.13 → datachain-0.3.15}/src/datachain.egg-info/entry_points.txt +0 -0
  177. {datachain-0.3.13 → datachain-0.3.15}/src/datachain.egg-info/requires.txt +0 -0
  178. {datachain-0.3.13 → datachain-0.3.15}/src/datachain.egg-info/top_level.txt +0 -0
  179. {datachain-0.3.13 → datachain-0.3.15}/tests/__init__.py +0 -0
  180. {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/__init__.py +0 -0
  181. {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/conftest.py +0 -0
  182. {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  183. {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/datasets/.dvc/config +0 -0
  184. {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/datasets/.gitignore +0 -0
  185. {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  186. {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/test_datachain.py +0 -0
  187. {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/test_ls.py +0 -0
  188. {datachain-0.3.13 → datachain-0.3.15}/tests/benchmarks/test_version.py +0 -0
  189. {datachain-0.3.13 → datachain-0.3.15}/tests/examples/__init__.py +0 -0
  190. {datachain-0.3.13 → datachain-0.3.15}/tests/examples/test_examples.py +0 -0
  191. {datachain-0.3.13 → datachain-0.3.15}/tests/examples/wds_data.py +0 -0
  192. {datachain-0.3.13 → datachain-0.3.15}/tests/func/__init__.py +0 -0
  193. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_client.py +0 -0
  194. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_ls.py +0 -0
  195. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_meta_formats.py +0 -0
  196. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_metrics.py +0 -0
  197. {datachain-0.3.13 → datachain-0.3.15}/tests/func/test_pytorch.py +0 -0
  198. {datachain-0.3.13 → datachain-0.3.15}/tests/scripts/feature_class.py +0 -0
  199. {datachain-0.3.13 → datachain-0.3.15}/tests/scripts/feature_class_parallel.py +0 -0
  200. {datachain-0.3.13 → datachain-0.3.15}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  201. {datachain-0.3.13 → datachain-0.3.15}/tests/scripts/name_len_slow.py +0 -0
  202. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/__init__.py +0 -0
  203. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/__init__.py +0 -0
  204. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/conftest.py +0 -0
  205. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_clip.py +0 -0
  206. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_datachain_merge.py +0 -0
  207. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_feature.py +0 -0
  208. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_feature_utils.py +0 -0
  209. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_image.py +0 -0
  210. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_schema.py +0 -0
  211. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_sql_to_python.py +0 -0
  212. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_text.py +0 -0
  213. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_udf_signature.py +0 -0
  214. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_utils.py +0 -0
  215. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/lib/test_webdataset.py +0 -0
  216. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/__init__.py +0 -0
  217. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/sqlite/__init__.py +0 -0
  218. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/sqlite/test_utils.py +0 -0
  219. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_array.py +0 -0
  220. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_conditional.py +0 -0
  221. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_path.py +0 -0
  222. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_random.py +0 -0
  223. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_selectable.py +0 -0
  224. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/sql/test_string.py +0 -0
  225. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_asyn.py +0 -0
  226. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_cache.py +0 -0
  227. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_catalog.py +0 -0
  228. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_catalog_loader.py +0 -0
  229. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_cli_parsing.py +0 -0
  230. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_client_s3.py +0 -0
  231. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_data_storage.py +0 -0
  232. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_database_engine.py +0 -0
  233. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_dataset.py +0 -0
  234. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_dispatch.py +0 -0
  235. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_fileslice.py +0 -0
  236. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_metastore.py +0 -0
  237. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_module_exports.py +0 -0
  238. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_query_metrics.py +0 -0
  239. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_query_params.py +0 -0
  240. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_serializer.py +0 -0
  241. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_session.py +0 -0
  242. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_storage.py +0 -0
  243. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_udf.py +0 -0
  244. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_utils.py +0 -0
  245. {datachain-0.3.13 → datachain-0.3.15}/tests/unit/test_warehouse.py +0 -0
  246. {datachain-0.3.13 → datachain-0.3.15}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.13
3
+ Version: 0.3.15
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -1,4 +1,10 @@
1
- # pip install defusedxml
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
2
8
  import json
3
9
 
4
10
  from PIL import (
@@ -1,4 +1,10 @@
1
- # pip install accelerate torch
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
2
8
  import torch
3
9
  from transformers import (
4
10
  AutoProcessor,
@@ -1,5 +1,3 @@
1
- # pip install datamodel-code-generator jmespath
2
-
3
1
  from typing import Optional
4
2
 
5
3
  from pydantic import BaseModel
@@ -1,4 +1,9 @@
1
- # pip install Pillow torchvision
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[torch]
5
+
6
+ """
2
7
 
3
8
  import os
4
9
  from posixpath import basename
@@ -1,7 +1,7 @@
1
1
  """
2
- To install dependencies:
2
+ To install the required dependencies:
3
3
 
4
- pip install open_clip_torch
4
+ pip install datachain[examples]
5
5
 
6
6
  """
7
7
 
@@ -40,6 +40,7 @@ def tests(session: nox.Session) -> None:
40
40
  "--cov-report=xml",
41
41
  "--durations=10",
42
42
  "--numprocesses=logical",
43
+ "--dist=loadgroup",
43
44
  *session.posargs,
44
45
  env={"COVERAGE_FILE": f".coverage.{session.python}"},
45
46
  )
@@ -1,14 +1,8 @@
1
1
  import asyncio
2
- from collections.abc import Awaitable, Coroutine, Iterable
2
+ from collections.abc import AsyncIterable, Awaitable, Coroutine, Iterable, Iterator
3
3
  from concurrent.futures import ThreadPoolExecutor
4
4
  from heapq import heappop, heappush
5
- from typing import (
6
- Any,
7
- Callable,
8
- Generic,
9
- Optional,
10
- TypeVar,
11
- )
5
+ from typing import Any, Callable, Generic, Optional, TypeVar
12
6
 
13
7
  from fsspec.asyn import get_loop
14
8
 
@@ -16,6 +10,7 @@ ASYNC_WORKERS = 20
16
10
 
17
11
  InputT = TypeVar("InputT", contravariant=True) # noqa: PLC0105
18
12
  ResultT = TypeVar("ResultT", covariant=True) # noqa: PLC0105
13
+ T = TypeVar("T")
19
14
 
20
15
 
21
16
  class AsyncMapper(Generic[InputT, ResultT]):
@@ -226,7 +221,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
226
221
  self._push_result(self._next_yield, None)
227
222
 
228
223
 
229
- def iter_over_async(ait, loop):
224
+ def iter_over_async(ait: AsyncIterable[T], loop) -> Iterator[T]:
230
225
  """Wrap an asynchronous iterator into a synchronous one"""
231
226
  ait = ait.__aiter__()
232
227
 
@@ -621,10 +621,6 @@ class Catalog:
621
621
  code_ast.body[-1:] = new_expressions
622
622
  return code_ast
623
623
 
624
- def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
625
- config = config or self.client_config
626
- return Client.parse_url(uri, self.cache, **config)
627
-
628
624
  def get_client(self, uri: StorageURI, **config: Any) -> Client:
629
625
  """
630
626
  Return the client corresponding to the given source `uri`.
@@ -651,17 +647,16 @@ class Catalog:
651
647
  partial_path: Optional[str]
652
648
 
653
649
  client_config = client_config or self.client_config
654
- client, path = self.parse_url(source, **client_config)
650
+ uri, path = Client.parse_url(source)
651
+ client = Client.get_client(source, self.cache, **client_config)
655
652
  stem = os.path.basename(os.path.normpath(path))
656
653
  prefix = (
657
654
  posixpath.dirname(path)
658
655
  if glob.has_magic(stem) or client.fs.isfile(source)
659
656
  else path
660
657
  )
661
- storage_dataset_name = Storage.dataset_name(
662
- client.uri, posixpath.join(prefix, "")
663
- )
664
- source_metastore = self.metastore.clone(client.uri)
658
+ storage_dataset_name = Storage.dataset_name(uri, posixpath.join(prefix, ""))
659
+ source_metastore = self.metastore.clone(uri)
665
660
 
666
661
  columns = [
667
662
  Column("path", String),
@@ -675,15 +670,13 @@ class Catalog:
675
670
  ]
676
671
 
677
672
  if skip_indexing:
678
- source_metastore.create_storage_if_not_registered(client.uri)
679
- storage = source_metastore.get_storage(client.uri)
680
- source_metastore.init_partial_id(client.uri)
681
- partial_id = source_metastore.get_next_partial_id(client.uri)
673
+ source_metastore.create_storage_if_not_registered(uri)
674
+ storage = source_metastore.get_storage(uri)
675
+ source_metastore.init_partial_id(uri)
676
+ partial_id = source_metastore.get_next_partial_id(uri)
682
677
 
683
- source_metastore = self.metastore.clone(
684
- uri=client.uri, partial_id=partial_id
685
- )
686
- source_metastore.init(client.uri)
678
+ source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
679
+ source_metastore.init(uri)
687
680
 
688
681
  source_warehouse = self.warehouse.clone()
689
682
  dataset = self.create_dataset(
@@ -701,20 +694,16 @@ class Catalog:
701
694
  in_progress,
702
695
  partial_id,
703
696
  partial_path,
704
- ) = source_metastore.register_storage_for_indexing(
705
- client.uri, force_update, prefix
706
- )
697
+ ) = source_metastore.register_storage_for_indexing(uri, force_update, prefix)
707
698
  if in_progress:
708
699
  raise PendingIndexingError(f"Pending indexing operation: uri={storage.uri}")
709
700
 
710
701
  if not need_index:
711
702
  assert partial_id is not None
712
703
  assert partial_path is not None
713
- source_metastore = self.metastore.clone(
714
- uri=client.uri, partial_id=partial_id
715
- )
704
+ source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
716
705
  source_warehouse = self.warehouse.clone()
717
- dataset = self.get_dataset(Storage.dataset_name(client.uri, partial_path))
706
+ dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
718
707
  lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
719
708
  logger.debug(
720
709
  "Using cached listing %s. Valid till: %s",
@@ -731,11 +720,11 @@ class Catalog:
731
720
 
732
721
  return lst, path
733
722
 
734
- source_metastore.init_partial_id(client.uri)
735
- partial_id = source_metastore.get_next_partial_id(client.uri)
723
+ source_metastore.init_partial_id(uri)
724
+ partial_id = source_metastore.get_next_partial_id(uri)
736
725
 
737
- source_metastore.init(client.uri)
738
- source_metastore = self.metastore.clone(uri=client.uri, partial_id=partial_id)
726
+ source_metastore.init(uri)
727
+ source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
739
728
 
740
729
  source_warehouse = self.warehouse.clone()
741
730
 
@@ -1370,7 +1359,7 @@ class Catalog:
1370
1359
 
1371
1360
  def signed_url(self, source: str, path: str, client_config=None) -> str:
1372
1361
  client_config = client_config or self.client_config
1373
- client, _ = self.parse_url(source, **client_config)
1362
+ client = Client.get_client(source, self.cache, **client_config)
1374
1363
  return client.url(path)
1375
1364
 
1376
1365
  def export_dataset_table(
@@ -1390,12 +1379,12 @@ class Catalog:
1390
1379
  dataset = self.get_dataset(name)
1391
1380
  return self.warehouse.dataset_table_export_file_names(dataset, version)
1392
1381
 
1393
- def dataset_stats(self, name: str, version: int) -> DatasetStats:
1382
+ def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
1394
1383
  """
1395
1384
  Returns tuple with dataset stats: total number of rows and total dataset size.
1396
1385
  """
1397
1386
  dataset = self.get_dataset(name)
1398
- dataset_version = dataset.get_version(version)
1387
+ dataset_version = dataset.get_version(version or dataset.latest_version)
1399
1388
  return DatasetStats(
1400
1389
  num_objects=dataset_version.num_objects,
1401
1390
  size=dataset_version.size,
@@ -4,7 +4,6 @@ from adlfs import AzureBlobFileSystem
4
4
  from tqdm import tqdm
5
5
 
6
6
  from datachain.lib.file import File
7
- from datachain.node import Entry
8
7
 
9
8
  from .fsspec import DELIMITER, Client, ResultQueue
10
9
 
@@ -14,17 +13,6 @@ class AzureClient(Client):
14
13
  PREFIX = "az://"
15
14
  protocol = "az"
16
15
 
17
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
18
- version_id = v.get("version_id")
19
- return Entry.from_file(
20
- path=path,
21
- etag=v.get("etag", "").strip('"'),
22
- version=version_id or "",
23
- is_latest=version_id is None or bool(v.get("is_current_version")),
24
- last_modified=v["last_modified"],
25
- size=v.get("size", ""),
26
- )
27
-
28
16
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
29
17
  version_id = v.get("version_id")
30
18
  return File(
@@ -57,7 +45,7 @@ class AzureClient(Client):
57
45
  continue
58
46
  info = (await self.fs._details([b]))[0]
59
47
  entries.append(
60
- self.convert_info(info, self.rel_path(info["name"]))
48
+ self.info_to_file(info, self.rel_path(info["name"]))
61
49
  )
62
50
  if entries:
63
51
  await result_queue.put(entries)
@@ -29,7 +29,7 @@ from tqdm import tqdm
29
29
  from datachain.cache import DataChainCache, UniqueId
30
30
  from datachain.client.fileslice import FileSlice, FileWrapper
31
31
  from datachain.error import ClientError as DataChainClientError
32
- from datachain.node import Entry
32
+ from datachain.lib.file import File
33
33
  from datachain.nodes_fetcher import NodesFetcher
34
34
  from datachain.nodes_thread_pool import NodeChunk
35
35
  from datachain.storage import StorageURI
@@ -45,7 +45,7 @@ DELIMITER = "/" # Path delimiter.
45
45
 
46
46
  DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
47
47
 
48
- ResultQueue = asyncio.Queue[Optional[Sequence[Entry]]]
48
+ ResultQueue = asyncio.Queue[Optional[Sequence[File]]]
49
49
 
50
50
 
51
51
  def _is_win_local_path(uri: str) -> bool:
@@ -116,15 +116,16 @@ class Client(ABC):
116
116
  return DATA_SOURCE_URI_PATTERN.match(name) is not None
117
117
 
118
118
  @staticmethod
119
- def parse_url(
120
- source: str,
121
- cache: DataChainCache,
122
- **kwargs,
123
- ) -> tuple["Client", str]:
119
+ def parse_url(source: str) -> tuple[StorageURI, str]:
120
+ cls = Client.get_implementation(source)
121
+ storage_name, rel_path = cls.split_url(source)
122
+ return cls.get_uri(storage_name), rel_path
123
+
124
+ @staticmethod
125
+ def get_client(source: str, cache: DataChainCache, **kwargs) -> "Client":
124
126
  cls = Client.get_implementation(source)
125
- storage_url, rel_path = cls.split_url(source)
126
- client = cls.from_name(storage_url, cache, kwargs)
127
- return client, rel_path
127
+ storage_url, _ = cls.split_url(source)
128
+ return cls.from_name(storage_url, cache, kwargs)
128
129
 
129
130
  @classmethod
130
131
  def create_fs(cls, **kwargs) -> "AbstractFileSystem":
@@ -188,7 +189,7 @@ class Client(ABC):
188
189
 
189
190
  async def get_current_etag(self, uid: UniqueId) -> str:
190
191
  info = await self.fs._info(self.get_full_path(uid.path))
191
- return self.convert_info(info, "").etag
192
+ return self.info_to_file(info, "").etag
192
193
 
193
194
  async def get_size(self, path: str) -> int:
194
195
  return await self.fs._size(path)
@@ -198,7 +199,7 @@ class Client(ABC):
198
199
 
199
200
  async def scandir(
200
201
  self, start_prefix: str, method: str = "default"
201
- ) -> AsyncIterator[Sequence[Entry]]:
202
+ ) -> AsyncIterator[Sequence[File]]:
202
203
  try:
203
204
  impl = getattr(self, f"_fetch_{method}")
204
205
  except AttributeError:
@@ -264,7 +265,7 @@ class Client(ABC):
264
265
  ) -> None:
265
266
  await self._fetch_nested(start_prefix, result_queue)
266
267
 
267
- async def _fetch_dir(self, prefix, pbar, result_queue) -> set[str]:
268
+ async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]:
268
269
  path = f"{self.name}/{prefix}"
269
270
  infos = await self.ls_dir(path)
270
271
  files = []
@@ -277,7 +278,7 @@ class Client(ABC):
277
278
  if info["type"] == "directory":
278
279
  subdirs.add(subprefix)
279
280
  else:
280
- files.append(self.convert_info(info, subprefix))
281
+ files.append(self.info_to_file(info, subprefix))
281
282
  if files:
282
283
  await result_queue.put(files)
283
284
  found_count = len(subdirs) + len(files)
@@ -303,7 +304,7 @@ class Client(ABC):
303
304
  return f"{self.PREFIX}{self.name}/{rel_path}"
304
305
 
305
306
  @abstractmethod
306
- def convert_info(self, v: dict[str, Any], parent: str) -> Entry: ...
307
+ def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...
307
308
 
308
309
  def fetch_nodes(
309
310
  self,
@@ -10,7 +10,6 @@ from gcsfs import GCSFileSystem
10
10
  from tqdm import tqdm
11
11
 
12
12
  from datachain.lib.file import File
13
- from datachain.node import Entry
14
13
 
15
14
  from .fsspec import DELIMITER, Client, ResultQueue
16
15
 
@@ -108,19 +107,9 @@ class GCSClient(Client):
108
107
  finally:
109
108
  await page_queue.put(None)
110
109
 
111
- def _entry_from_dict(self, d: dict[str, Any]) -> Entry:
110
+ def _entry_from_dict(self, d: dict[str, Any]) -> File:
112
111
  info = self.fs._process_object(self.name, d)
113
- return self.convert_info(info, self.rel_path(info["name"]))
114
-
115
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
116
- return Entry.from_file(
117
- path=path,
118
- etag=v.get("etag", ""),
119
- version=v.get("generation", ""),
120
- is_latest=not v.get("timeDeleted"),
121
- last_modified=self.parse_timestamp(v["updated"]),
122
- size=v.get("size", ""),
123
- )
112
+ return self.info_to_file(info, self.rel_path(info["name"]))
124
113
 
125
114
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
126
115
  return File(
@@ -5,7 +5,6 @@ from typing import Any, cast
5
5
  from huggingface_hub import HfFileSystem
6
6
 
7
7
  from datachain.lib.file import File
8
- from datachain.node import Entry
9
8
 
10
9
  from .fsspec import Client
11
10
 
@@ -22,15 +21,6 @@ class HfClient(Client):
22
21
 
23
22
  return cast(HfFileSystem, super().create_fs(**kwargs))
24
23
 
25
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
26
- return Entry.from_file(
27
- path=path,
28
- size=v["size"],
29
- version=v["last_commit"].oid,
30
- etag=v.get("blob_id", ""),
31
- last_modified=v["last_commit"].date,
32
- )
33
-
34
24
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
35
25
  return File(
36
26
  path=path,
@@ -7,8 +7,8 @@ from urllib.parse import urlparse
7
7
 
8
8
  from fsspec.implementations.local import LocalFileSystem
9
9
 
10
+ from datachain.cache import UniqueId
10
11
  from datachain.lib.file import File
11
- from datachain.node import Entry
12
12
  from datachain.storage import StorageURI
13
13
 
14
14
  from .fsspec import Client
@@ -114,9 +114,9 @@ class FileClient(Client):
114
114
  use_symlinks=use_symlinks,
115
115
  )
116
116
 
117
- async def get_current_etag(self, uid) -> str:
117
+ async def get_current_etag(self, uid: UniqueId) -> str:
118
118
  info = self.fs.info(self.get_full_path(uid.path))
119
- return self.convert_info(info, "").etag
119
+ return self.info_to_file(info, "").etag
120
120
 
121
121
  async def get_size(self, path: str) -> int:
122
122
  return self.fs.size(path)
@@ -136,15 +136,6 @@ class FileClient(Client):
136
136
  full_path += "/"
137
137
  return full_path
138
138
 
139
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
140
- return Entry.from_file(
141
- path=path,
142
- etag=v["mtime"].hex(),
143
- is_latest=True,
144
- last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
145
- size=v.get("size", ""),
146
- )
147
-
148
139
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
149
140
  return File(
150
141
  source=self.uri,
@@ -1,12 +1,11 @@
1
1
  import asyncio
2
- from typing import Any, cast
2
+ from typing import Any, Optional, cast
3
3
 
4
4
  from botocore.exceptions import NoCredentialsError
5
5
  from s3fs import S3FileSystem
6
6
  from tqdm import tqdm
7
7
 
8
8
  from datachain.lib.file import File
9
- from datachain.node import Entry
10
9
 
11
10
  from .fsspec import DELIMITER, Client, ResultQueue
12
11
 
@@ -111,8 +110,9 @@ class ClientS3(Client):
111
110
  ) -> None:
112
111
  await self._fetch_flat(start_prefix, result_queue)
113
112
 
114
- def _entry_from_boto(self, v, bucket, versions=False):
115
- return Entry.from_file(
113
+ def _entry_from_boto(self, v, bucket, versions=False) -> File:
114
+ return File(
115
+ source=self.uri,
116
116
  path=v["Key"],
117
117
  etag=v.get("ETag", "").strip('"'),
118
118
  version=ClientS3.clean_s3_version(v.get("VersionId", "")),
@@ -125,8 +125,8 @@ class ClientS3(Client):
125
125
  self,
126
126
  prefix,
127
127
  pbar,
128
- result_queue,
129
- ):
128
+ result_queue: ResultQueue,
129
+ ) -> set[str]:
130
130
  if prefix:
131
131
  prefix = prefix.lstrip(DELIMITER) + DELIMITER
132
132
  files = []
@@ -141,7 +141,7 @@ class ClientS3(Client):
141
141
  if info["type"] == "directory":
142
142
  subdirs.add(subprefix)
143
143
  else:
144
- files.append(self.convert_info(info, subprefix))
144
+ files.append(self.info_to_file(info, subprefix))
145
145
  pbar.update()
146
146
  found = True
147
147
  if not found:
@@ -152,18 +152,8 @@ class ClientS3(Client):
152
152
  return subdirs
153
153
 
154
154
  @staticmethod
155
- def clean_s3_version(ver):
156
- return ver if ver != "null" else ""
157
-
158
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
159
- return Entry.from_file(
160
- path=path,
161
- etag=v.get("ETag", "").strip('"'),
162
- version=ClientS3.clean_s3_version(v.get("VersionId", "")),
163
- is_latest=v.get("IsLatest", True),
164
- last_modified=v.get("LastModified", ""),
165
- size=v["size"],
166
- )
155
+ def clean_s3_version(ver: Optional[str]) -> str:
156
+ return ver if (ver is not None and ver != "null") else ""
167
157
 
168
158
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
169
159
  return File(
@@ -43,6 +43,8 @@ if TYPE_CHECKING:
43
43
  from sqlalchemy.sql.elements import ColumnElement
44
44
  from sqlalchemy.types import TypeEngine
45
45
 
46
+ from datachain.lib.file import File
47
+
46
48
 
47
49
  logger = logging.getLogger("datachain")
48
50
 
@@ -58,6 +60,10 @@ quote_schema = sqlite_dialect.identifier_preparer.quote_schema
58
60
  quote = sqlite_dialect.identifier_preparer.quote
59
61
 
60
62
 
63
+ def _get_in_memory_uri():
64
+ return "file::memory:?cache=shared"
65
+
66
+
61
67
  def get_retry_sleep_sec(retry_count: int) -> int:
62
68
  return RETRY_START_SEC * (RETRY_FACTOR**retry_count)
63
69
 
@@ -119,7 +125,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
119
125
  if db_file == ":memory:":
120
126
  # Enable multithreaded usage of the same in-memory db
121
127
  db = sqlite3.connect(
122
- "file::memory:?cache=shared", uri=True, detect_types=DETECT_TYPES
128
+ _get_in_memory_uri(), uri=True, detect_types=DETECT_TYPES
123
129
  )
124
130
  else:
125
131
  db = sqlite3.connect(
@@ -704,6 +710,9 @@ class SQLiteWarehouse(AbstractWarehouse):
704
710
 
705
711
  self.db.execute(insert_query)
706
712
 
713
+ def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
714
+ return (e.model_dump() for e in entries)
715
+
707
716
  def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
708
717
  rows = list(rows)
709
718
  if not rows:
@@ -20,7 +20,7 @@ from datachain.client import Client
20
20
  from datachain.data_storage.schema import convert_rows_custom_column_types
21
21
  from datachain.data_storage.serializer import Serializable
22
22
  from datachain.dataset import DatasetRecord
23
- from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
23
+ from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
24
24
  from datachain.sql.functions import path as pathfunc
25
25
  from datachain.sql.types import Int, SQLType
26
26
  from datachain.storage import StorageURI
@@ -34,6 +34,7 @@ if TYPE_CHECKING:
34
34
  from datachain.data_storage import AbstractIDGenerator, schema
35
35
  from datachain.data_storage.db_engine import DatabaseEngine
36
36
  from datachain.data_storage.schema import DataTable
37
+ from datachain.lib.file import File
37
38
 
38
39
  try:
39
40
  import numpy as np
@@ -401,25 +402,18 @@ class AbstractWarehouse(ABC, Serializable):
401
402
  expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
402
403
  sa.func.count(table.c.sys__id),
403
404
  )
404
- if "file__size" in table.columns:
405
- expressions = (*expressions, sa.func.sum(table.c.file__size))
406
- elif "size" in table.columns:
407
- expressions = (*expressions, sa.func.sum(table.c.size))
405
+ size_columns = [
406
+ c for c in table.columns if c.name == "size" or c.name.endswith("__size")
407
+ ]
408
+ if size_columns:
409
+ expressions = (*expressions, sa.func.sum(sum(size_columns)))
408
410
  query = select(*expressions)
409
411
  ((nrows, *rest),) = self.db.execute(query)
410
- return nrows, rest[0] if rest else None
411
-
412
- def prepare_entries(
413
- self, uri: str, entries: Iterable[Entry]
414
- ) -> list[dict[str, Any]]:
415
- """
416
- Prepares bucket listing entry (row) for inserting into database
417
- """
418
-
419
- def _prepare_entry(entry: Entry):
420
- return attrs.asdict(entry) | {"source": uri}
412
+ return nrows, rest[0] if rest else 0
421
413
 
422
- return [_prepare_entry(e) for e in entries]
414
+ @abstractmethod
415
+ def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
416
+ """Convert File entries so they can be passed on to `insert_rows()`"""
423
417
 
424
418
  @abstractmethod
425
419
  def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
@@ -112,7 +112,7 @@ class DatasetDependency:
112
112
 
113
113
  if is_listing_dataset(dataset_name):
114
114
  dependency_type = DatasetDependencyType.STORAGE # type: ignore[arg-type]
115
- dependency_name = listing_uri_from_name(dataset_name)
115
+ dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))
116
116
 
117
117
  return cls(
118
118
  id,