datachain 0.3.12__tar.gz → 0.3.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (245) hide show
  1. {datachain-0.3.12/src/datachain.egg-info → datachain-0.3.14}/PKG-INFO +1 -1
  2. {datachain-0.3.12 → datachain-0.3.14}/examples/computer_vision/iptc_exif_xmp_lib.py +7 -1
  3. {datachain-0.3.12 → datachain-0.3.14}/examples/computer_vision/llava2_image_desc_lib.py +7 -1
  4. {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/json-csv-reader.py +0 -2
  5. {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/torch-loader.py +6 -1
  6. {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/udfs/stateful.py +2 -2
  7. {datachain-0.3.12 → datachain-0.3.14}/noxfile.py +1 -0
  8. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/asyn.py +4 -9
  9. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/cache.py +0 -1
  10. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/catalog/catalog.py +3 -12
  11. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/cli.py +4 -6
  12. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/azure.py +1 -13
  13. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/fsspec.py +7 -8
  14. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/gcs.py +2 -13
  15. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/hf.py +0 -10
  16. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/local.py +3 -12
  17. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/s3.py +9 -23
  18. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/schema.py +4 -8
  19. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/sqlite.py +10 -1
  20. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/warehouse.py +17 -34
  21. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/dc.py +0 -1
  22. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/file.py +0 -3
  23. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/listing.py +1 -2
  24. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/model_store.py +2 -2
  25. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/pytorch.py +32 -26
  26. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/signal_schema.py +146 -58
  27. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/listing.py +8 -10
  28. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/node.py +3 -68
  29. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/builtins.py +0 -14
  30. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/schema.py +1 -16
  31. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/utils.py +0 -3
  32. {datachain-0.3.12 → datachain-0.3.14/src/datachain.egg-info}/PKG-INFO +1 -1
  33. {datachain-0.3.12 → datachain-0.3.14}/tests/conftest.py +35 -3
  34. {datachain-0.3.12 → datachain-0.3.14}/tests/data.py +11 -31
  35. {datachain-0.3.12 → datachain-0.3.14}/tests/examples/test_wds_e2e.py +10 -8
  36. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_catalog.py +32 -9
  37. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_datachain.py +164 -4
  38. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_dataset_query.py +23 -228
  39. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_feature_pickling.py +66 -1
  40. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_ls.py +0 -15
  41. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_pull.py +1 -11
  42. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_query.py +3 -0
  43. {datachain-0.3.12 → datachain-0.3.14}/tests/test_cli_e2e.py +10 -3
  44. {datachain-0.3.12 → datachain-0.3.14}/tests/test_query_e2e.py +10 -3
  45. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_datachain.py +1 -1
  46. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_file.py +3 -7
  47. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_signal_schema.py +244 -8
  48. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_cache.py +3 -7
  49. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_client_s3.py +0 -1
  50. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_data_storage.py +28 -32
  51. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_dataset.py +0 -6
  52. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_id_generator.py +3 -1
  53. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_listing.py +3 -2
  54. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_udf.py +0 -3
  55. {datachain-0.3.12 → datachain-0.3.14}/tests/utils.py +1 -15
  56. {datachain-0.3.12 → datachain-0.3.14}/.cruft.json +0 -0
  57. {datachain-0.3.12 → datachain-0.3.14}/.gitattributes +0 -0
  58. {datachain-0.3.12 → datachain-0.3.14}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  59. {datachain-0.3.12 → datachain-0.3.14}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  60. {datachain-0.3.12 → datachain-0.3.14}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  61. {datachain-0.3.12 → datachain-0.3.14}/.github/codecov.yaml +0 -0
  62. {datachain-0.3.12 → datachain-0.3.14}/.github/dependabot.yml +0 -0
  63. {datachain-0.3.12 → datachain-0.3.14}/.github/workflows/benchmarks.yml +0 -0
  64. {datachain-0.3.12 → datachain-0.3.14}/.github/workflows/release.yml +0 -0
  65. {datachain-0.3.12 → datachain-0.3.14}/.github/workflows/tests-studio.yml +0 -0
  66. {datachain-0.3.12 → datachain-0.3.14}/.github/workflows/tests.yml +0 -0
  67. {datachain-0.3.12 → datachain-0.3.14}/.github/workflows/update-template.yaml +0 -0
  68. {datachain-0.3.12 → datachain-0.3.14}/.gitignore +0 -0
  69. {datachain-0.3.12 → datachain-0.3.14}/.pre-commit-config.yaml +0 -0
  70. {datachain-0.3.12 → datachain-0.3.14}/CODE_OF_CONDUCT.rst +0 -0
  71. {datachain-0.3.12 → datachain-0.3.14}/CONTRIBUTING.rst +0 -0
  72. {datachain-0.3.12 → datachain-0.3.14}/LICENSE +0 -0
  73. {datachain-0.3.12 → datachain-0.3.14}/README.rst +0 -0
  74. {datachain-0.3.12 → datachain-0.3.14}/docs/assets/captioned_cartoons.png +0 -0
  75. {datachain-0.3.12 → datachain-0.3.14}/docs/assets/datachain-white.svg +0 -0
  76. {datachain-0.3.12 → datachain-0.3.14}/docs/assets/datachain.svg +0 -0
  77. {datachain-0.3.12 → datachain-0.3.14}/docs/assets/flowchart.png +0 -0
  78. {datachain-0.3.12 → datachain-0.3.14}/docs/index.md +0 -0
  79. {datachain-0.3.12 → datachain-0.3.14}/docs/references/datachain.md +0 -0
  80. {datachain-0.3.12 → datachain-0.3.14}/docs/references/datatype.md +0 -0
  81. {datachain-0.3.12 → datachain-0.3.14}/docs/references/file.md +0 -0
  82. {datachain-0.3.12 → datachain-0.3.14}/docs/references/index.md +0 -0
  83. {datachain-0.3.12 → datachain-0.3.14}/docs/references/sql.md +0 -0
  84. {datachain-0.3.12 → datachain-0.3.14}/docs/references/torch.md +0 -0
  85. {datachain-0.3.12 → datachain-0.3.14}/docs/references/udf.md +0 -0
  86. {datachain-0.3.12 → datachain-0.3.14}/examples/computer_vision/openimage-detect.py +0 -0
  87. {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/common_sql_functions.py +0 -0
  88. {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/udfs/parallel.py +0 -0
  89. {datachain-0.3.12 → datachain-0.3.14}/examples/get_started/udfs/simple.py +0 -0
  90. {datachain-0.3.12 → datachain-0.3.14}/examples/llm_and_nlp/claude-query.py +0 -0
  91. {datachain-0.3.12 → datachain-0.3.14}/examples/llm_and_nlp/unstructured-text.py +0 -0
  92. {datachain-0.3.12 → datachain-0.3.14}/examples/multimodal/clip_inference.py +0 -0
  93. {datachain-0.3.12 → datachain-0.3.14}/examples/multimodal/hf_pipeline.py +0 -0
  94. {datachain-0.3.12 → datachain-0.3.14}/examples/multimodal/openai_image_desc_lib.py +0 -0
  95. {datachain-0.3.12 → datachain-0.3.14}/examples/multimodal/wds.py +0 -0
  96. {datachain-0.3.12 → datachain-0.3.14}/examples/multimodal/wds_filtered.py +0 -0
  97. {datachain-0.3.12 → datachain-0.3.14}/mkdocs.yml +0 -0
  98. {datachain-0.3.12 → datachain-0.3.14}/pyproject.toml +0 -0
  99. {datachain-0.3.12 → datachain-0.3.14}/setup.cfg +0 -0
  100. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/__init__.py +0 -0
  101. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/__main__.py +0 -0
  102. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/catalog/__init__.py +0 -0
  103. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/catalog/datasource.py +0 -0
  104. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/catalog/loader.py +0 -0
  105. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/cli_utils.py +0 -0
  106. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/__init__.py +0 -0
  107. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/client/fileslice.py +0 -0
  108. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/config.py +0 -0
  109. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/__init__.py +0 -0
  110. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/db_engine.py +0 -0
  111. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/id_generator.py +0 -0
  112. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/job.py +0 -0
  113. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/metastore.py +0 -0
  114. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/data_storage/serializer.py +0 -0
  115. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/dataset.py +0 -0
  116. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/error.py +0 -0
  117. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/job.py +0 -0
  118. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/__init__.py +0 -0
  119. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/arrow.py +0 -0
  120. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/clip.py +0 -0
  121. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/__init__.py +0 -0
  122. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/flatten.py +0 -0
  123. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/python_to_sql.py +0 -0
  124. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/sql_to_python.py +0 -0
  125. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/unflatten.py +0 -0
  126. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  127. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/data_model.py +0 -0
  128. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/dataset_info.py +0 -0
  129. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/hf.py +0 -0
  130. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/image.py +0 -0
  131. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/listing_info.py +0 -0
  132. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/meta_formats.py +0 -0
  133. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/settings.py +0 -0
  134. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/text.py +0 -0
  135. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/udf.py +0 -0
  136. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/udf_signature.py +0 -0
  137. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/utils.py +0 -0
  138. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/vfile.py +0 -0
  139. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/webdataset.py +0 -0
  140. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/lib/webdataset_laion.py +0 -0
  141. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/nodes_fetcher.py +0 -0
  142. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/nodes_thread_pool.py +0 -0
  143. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/progress.py +0 -0
  144. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/py.typed +0 -0
  145. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/__init__.py +0 -0
  146. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/batch.py +0 -0
  147. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/dataset.py +0 -0
  148. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/dispatch.py +0 -0
  149. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/metrics.py +0 -0
  150. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/params.py +0 -0
  151. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/queue.py +0 -0
  152. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/session.py +0 -0
  153. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/query/udf.py +0 -0
  154. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/remote/__init__.py +0 -0
  155. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/remote/studio.py +0 -0
  156. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/__init__.py +0 -0
  157. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/default/__init__.py +0 -0
  158. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/default/base.py +0 -0
  159. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/__init__.py +0 -0
  160. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/array.py +0 -0
  161. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/conditional.py +0 -0
  162. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/path.py +0 -0
  163. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/random.py +0 -0
  164. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/functions/string.py +0 -0
  165. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/selectable.py +0 -0
  166. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/sqlite/__init__.py +0 -0
  167. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/sqlite/base.py +0 -0
  168. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/sqlite/types.py +0 -0
  169. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/sqlite/vector.py +0 -0
  170. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/types.py +0 -0
  171. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/sql/utils.py +0 -0
  172. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/storage.py +0 -0
  173. {datachain-0.3.12 → datachain-0.3.14}/src/datachain/torch/__init__.py +0 -0
  174. {datachain-0.3.12 → datachain-0.3.14}/src/datachain.egg-info/SOURCES.txt +0 -0
  175. {datachain-0.3.12 → datachain-0.3.14}/src/datachain.egg-info/dependency_links.txt +0 -0
  176. {datachain-0.3.12 → datachain-0.3.14}/src/datachain.egg-info/entry_points.txt +0 -0
  177. {datachain-0.3.12 → datachain-0.3.14}/src/datachain.egg-info/requires.txt +0 -0
  178. {datachain-0.3.12 → datachain-0.3.14}/src/datachain.egg-info/top_level.txt +0 -0
  179. {datachain-0.3.12 → datachain-0.3.14}/tests/__init__.py +0 -0
  180. {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/__init__.py +0 -0
  181. {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/conftest.py +0 -0
  182. {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  183. {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/datasets/.dvc/config +0 -0
  184. {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/datasets/.gitignore +0 -0
  185. {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  186. {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/test_datachain.py +0 -0
  187. {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/test_ls.py +0 -0
  188. {datachain-0.3.12 → datachain-0.3.14}/tests/benchmarks/test_version.py +0 -0
  189. {datachain-0.3.12 → datachain-0.3.14}/tests/examples/__init__.py +0 -0
  190. {datachain-0.3.12 → datachain-0.3.14}/tests/examples/test_examples.py +0 -0
  191. {datachain-0.3.12 → datachain-0.3.14}/tests/examples/wds_data.py +0 -0
  192. {datachain-0.3.12 → datachain-0.3.14}/tests/func/__init__.py +0 -0
  193. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_client.py +0 -0
  194. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_datasets.py +0 -0
  195. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_listing.py +0 -0
  196. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_meta_formats.py +0 -0
  197. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_metrics.py +0 -0
  198. {datachain-0.3.12 → datachain-0.3.14}/tests/func/test_pytorch.py +0 -0
  199. {datachain-0.3.12 → datachain-0.3.14}/tests/scripts/feature_class.py +0 -0
  200. {datachain-0.3.12 → datachain-0.3.14}/tests/scripts/feature_class_parallel.py +0 -0
  201. {datachain-0.3.12 → datachain-0.3.14}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  202. {datachain-0.3.12 → datachain-0.3.14}/tests/scripts/name_len_slow.py +0 -0
  203. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/__init__.py +0 -0
  204. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/__init__.py +0 -0
  205. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/conftest.py +0 -0
  206. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_arrow.py +0 -0
  207. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_clip.py +0 -0
  208. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  209. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_datachain_merge.py +0 -0
  210. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_feature.py +0 -0
  211. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_feature_utils.py +0 -0
  212. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_hf.py +0 -0
  213. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_image.py +0 -0
  214. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_schema.py +0 -0
  215. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_sql_to_python.py +0 -0
  216. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_text.py +0 -0
  217. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_udf_signature.py +0 -0
  218. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_utils.py +0 -0
  219. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/lib/test_webdataset.py +0 -0
  220. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/__init__.py +0 -0
  221. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/sqlite/__init__.py +0 -0
  222. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/sqlite/test_utils.py +0 -0
  223. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_array.py +0 -0
  224. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_conditional.py +0 -0
  225. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_path.py +0 -0
  226. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_random.py +0 -0
  227. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_selectable.py +0 -0
  228. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/sql/test_string.py +0 -0
  229. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_asyn.py +0 -0
  230. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_catalog.py +0 -0
  231. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_catalog_loader.py +0 -0
  232. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_cli_parsing.py +0 -0
  233. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_client.py +0 -0
  234. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_database_engine.py +0 -0
  235. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_dispatch.py +0 -0
  236. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_fileslice.py +0 -0
  237. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_metastore.py +0 -0
  238. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_module_exports.py +0 -0
  239. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_query_metrics.py +0 -0
  240. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_query_params.py +0 -0
  241. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_serializer.py +0 -0
  242. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_session.py +0 -0
  243. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_storage.py +0 -0
  244. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_utils.py +0 -0
  245. {datachain-0.3.12 → datachain-0.3.14}/tests/unit/test_warehouse.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.12
3
+ Version: 0.3.14
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -1,4 +1,10 @@
1
- # pip install defusedxml
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
2
8
  import json
3
9
 
4
10
  from PIL import (
@@ -1,4 +1,10 @@
1
- # pip install accelerate torch
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
2
8
  import torch
3
9
  from transformers import (
4
10
  AutoProcessor,
@@ -1,5 +1,3 @@
1
- # pip install datamodel-code-generator jmespath
2
-
3
1
  from typing import Optional
4
2
 
5
3
  from pydantic import BaseModel
@@ -1,4 +1,9 @@
1
- # pip install Pillow torchvision
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[torch]
5
+
6
+ """
2
7
 
3
8
  import os
4
9
  from posixpath import basename
@@ -1,7 +1,7 @@
1
1
  """
2
- To install dependencies:
2
+ To install the required dependencies:
3
3
 
4
- pip install open_clip_torch
4
+ pip install datachain[examples]
5
5
 
6
6
  """
7
7
 
@@ -40,6 +40,7 @@ def tests(session: nox.Session) -> None:
40
40
  "--cov-report=xml",
41
41
  "--durations=10",
42
42
  "--numprocesses=logical",
43
+ "--dist=loadgroup",
43
44
  *session.posargs,
44
45
  env={"COVERAGE_FILE": f".coverage.{session.python}"},
45
46
  )
@@ -1,14 +1,8 @@
1
1
  import asyncio
2
- from collections.abc import Awaitable, Coroutine, Iterable
2
+ from collections.abc import AsyncIterable, Awaitable, Coroutine, Iterable, Iterator
3
3
  from concurrent.futures import ThreadPoolExecutor
4
4
  from heapq import heappop, heappush
5
- from typing import (
6
- Any,
7
- Callable,
8
- Generic,
9
- Optional,
10
- TypeVar,
11
- )
5
+ from typing import Any, Callable, Generic, Optional, TypeVar
12
6
 
13
7
  from fsspec.asyn import get_loop
14
8
 
@@ -16,6 +10,7 @@ ASYNC_WORKERS = 20
16
10
 
17
11
  InputT = TypeVar("InputT", contravariant=True) # noqa: PLC0105
18
12
  ResultT = TypeVar("ResultT", covariant=True) # noqa: PLC0105
13
+ T = TypeVar("T")
19
14
 
20
15
 
21
16
  class AsyncMapper(Generic[InputT, ResultT]):
@@ -226,7 +221,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
226
221
  self._push_result(self._next_yield, None)
227
222
 
228
223
 
229
- def iter_over_async(ait, loop):
224
+ def iter_over_async(ait: AsyncIterable[T], loop) -> Iterator[T]:
230
225
  """Wrap an asynchronous iterator into a synchronous one"""
231
226
  ait = ait.__aiter__()
232
227
 
@@ -29,7 +29,6 @@ class UniqueId:
29
29
  etag: str
30
30
  version: str = ""
31
31
  is_latest: bool = True
32
- vtype: str = ""
33
32
  location: Optional[str] = None
34
33
  last_modified: datetime = TIME_ZERO
35
34
 
@@ -62,7 +62,7 @@ from datachain.listing import Listing
62
62
  from datachain.node import DirType, Node, NodeWithPath
63
63
  from datachain.nodes_thread_pool import NodesThreadPool
64
64
  from datachain.remote.studio import StudioClient
65
- from datachain.sql.types import JSON, Boolean, DateTime, Int, Int64, SQLType, String
65
+ from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String
66
66
  from datachain.storage import Storage, StorageStatus, StorageURI
67
67
  from datachain.utils import (
68
68
  DataChainDir,
@@ -513,8 +513,6 @@ def find_column_to_str( # noqa: PLR0911
513
513
  )
514
514
  if column == "name":
515
515
  return posixpath.basename(row[field_lookup["path"]]) or ""
516
- if column == "owner":
517
- return row[field_lookup["owner_name"]] or ""
518
516
  if column == "path":
519
517
  is_dir = row[field_lookup["dir_type"]] == DirType.DIR
520
518
  path = row[field_lookup["path"]]
@@ -666,16 +664,12 @@ class Catalog:
666
664
  source_metastore = self.metastore.clone(client.uri)
667
665
 
668
666
  columns = [
669
- Column("vtype", String),
670
- Column("dir_type", Int),
671
667
  Column("path", String),
672
668
  Column("etag", String),
673
669
  Column("version", String),
674
670
  Column("is_latest", Boolean),
675
671
  Column("last_modified", DateTime(timezone=True)),
676
672
  Column("size", Int64),
677
- Column("owner_name", String),
678
- Column("owner_id", String),
679
673
  Column("location", JSON),
680
674
  Column("source", String),
681
675
  ]
@@ -1396,12 +1390,12 @@ class Catalog:
1396
1390
  dataset = self.get_dataset(name)
1397
1391
  return self.warehouse.dataset_table_export_file_names(dataset, version)
1398
1392
 
1399
- def dataset_stats(self, name: str, version: int) -> DatasetStats:
1393
+ def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
1400
1394
  """
1401
1395
  Returns tuple with dataset stats: total number of rows and total dataset size.
1402
1396
  """
1403
1397
  dataset = self.get_dataset(name)
1404
- dataset_version = dataset.get_version(version)
1398
+ dataset_version = dataset.get_version(version or dataset.latest_version)
1405
1399
  return DatasetStats(
1406
1400
  num_objects=dataset_version.num_objects,
1407
1401
  size=dataset_version.size,
@@ -1516,7 +1510,6 @@ class Catalog:
1516
1510
  row["etag"],
1517
1511
  row["version"],
1518
1512
  row["is_latest"],
1519
- row["vtype"],
1520
1513
  row["location"],
1521
1514
  row["last_modified"],
1522
1515
  )
@@ -1987,8 +1980,6 @@ class Catalog:
1987
1980
  field_set.add("path")
1988
1981
  elif column == "name":
1989
1982
  field_set.add("path")
1990
- elif column == "owner":
1991
- field_set.add("owner_name")
1992
1983
  elif column == "path":
1993
1984
  field_set.add("dir_type")
1994
1985
  field_set.add("path")
@@ -24,7 +24,7 @@ logger = logging.getLogger("datachain")
24
24
 
25
25
  TTL_HUMAN = "4h"
26
26
  TTL_INT = 4 * 60 * 60
27
- FIND_COLUMNS = ["du", "name", "owner", "path", "size", "type"]
27
+ FIND_COLUMNS = ["du", "name", "path", "size", "type"]
28
28
 
29
29
 
30
30
  def human_time_type(value_str: str, can_be_none: bool = False) -> Optional[int]:
@@ -579,9 +579,8 @@ def _node_data_to_ls_values(row, long_format=False):
579
579
  value = name + ending
580
580
  if long_format:
581
581
  last_modified = row[2]
582
- owner_name = row[3]
583
582
  timestamp = last_modified if not is_dir else None
584
- return long_line_str(value, timestamp, owner_name)
583
+ return long_line_str(value, timestamp)
585
584
  return value
586
585
 
587
586
 
@@ -599,7 +598,7 @@ def _ls_urls_flat(
599
598
  if client_cls.is_root_url(source):
600
599
  buckets = client_cls.ls_buckets(**catalog.client_config)
601
600
  if long:
602
- values = (long_line_str(b.name, b.created, "") for b in buckets)
601
+ values = (long_line_str(b.name, b.created) for b in buckets)
603
602
  else:
604
603
  values = (b.name for b in buckets)
605
604
  yield source, values
@@ -607,7 +606,7 @@ def _ls_urls_flat(
607
606
  found = False
608
607
  fields = ["name", "dir_type"]
609
608
  if long:
610
- fields.extend(["last_modified", "owner_name"])
609
+ fields.append("last_modified")
611
610
  for data_source, results in catalog.ls([source], fields=fields, **kwargs):
612
611
  values = (_node_data_to_ls_values(r, long) for r in results)
613
612
  found = True
@@ -683,7 +682,6 @@ def ls_remote(
683
682
  entry = long_line_str(
684
683
  row["name"] + ("/" if row["dir_type"] else ""),
685
684
  row["last_modified"],
686
- row["owner_name"],
687
685
  )
688
686
  print(format_ls_entry(entry))
689
687
  else:
@@ -4,7 +4,6 @@ from adlfs import AzureBlobFileSystem
4
4
  from tqdm import tqdm
5
5
 
6
6
  from datachain.lib.file import File
7
- from datachain.node import Entry
8
7
 
9
8
  from .fsspec import DELIMITER, Client, ResultQueue
10
9
 
@@ -14,17 +13,6 @@ class AzureClient(Client):
14
13
  PREFIX = "az://"
15
14
  protocol = "az"
16
15
 
17
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
18
- version_id = v.get("version_id")
19
- return Entry.from_file(
20
- path=path,
21
- etag=v.get("etag", "").strip('"'),
22
- version=version_id or "",
23
- is_latest=version_id is None or bool(v.get("is_current_version")),
24
- last_modified=v["last_modified"],
25
- size=v.get("size", ""),
26
- )
27
-
28
16
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
29
17
  version_id = v.get("version_id")
30
18
  return File(
@@ -57,7 +45,7 @@ class AzureClient(Client):
57
45
  continue
58
46
  info = (await self.fs._details([b]))[0]
59
47
  entries.append(
60
- self.convert_info(info, self.rel_path(info["name"]))
48
+ self.info_to_file(info, self.rel_path(info["name"]))
61
49
  )
62
50
  if entries:
63
51
  await result_queue.put(entries)
@@ -29,7 +29,7 @@ from tqdm import tqdm
29
29
  from datachain.cache import DataChainCache, UniqueId
30
30
  from datachain.client.fileslice import FileSlice, FileWrapper
31
31
  from datachain.error import ClientError as DataChainClientError
32
- from datachain.node import Entry
32
+ from datachain.lib.file import File
33
33
  from datachain.nodes_fetcher import NodesFetcher
34
34
  from datachain.nodes_thread_pool import NodeChunk
35
35
  from datachain.storage import StorageURI
@@ -45,7 +45,7 @@ DELIMITER = "/" # Path delimiter.
45
45
 
46
46
  DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
47
47
 
48
- ResultQueue = asyncio.Queue[Optional[Sequence[Entry]]]
48
+ ResultQueue = asyncio.Queue[Optional[Sequence[File]]]
49
49
 
50
50
 
51
51
  def _is_win_local_path(uri: str) -> bool:
@@ -188,7 +188,7 @@ class Client(ABC):
188
188
 
189
189
  async def get_current_etag(self, uid: UniqueId) -> str:
190
190
  info = await self.fs._info(self.get_full_path(uid.path))
191
- return self.convert_info(info, "").etag
191
+ return self.info_to_file(info, "").etag
192
192
 
193
193
  async def get_size(self, path: str) -> int:
194
194
  return await self.fs._size(path)
@@ -198,7 +198,7 @@ class Client(ABC):
198
198
 
199
199
  async def scandir(
200
200
  self, start_prefix: str, method: str = "default"
201
- ) -> AsyncIterator[Sequence[Entry]]:
201
+ ) -> AsyncIterator[Sequence[File]]:
202
202
  try:
203
203
  impl = getattr(self, f"_fetch_{method}")
204
204
  except AttributeError:
@@ -264,7 +264,7 @@ class Client(ABC):
264
264
  ) -> None:
265
265
  await self._fetch_nested(start_prefix, result_queue)
266
266
 
267
- async def _fetch_dir(self, prefix, pbar, result_queue) -> set[str]:
267
+ async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]:
268
268
  path = f"{self.name}/{prefix}"
269
269
  infos = await self.ls_dir(path)
270
270
  files = []
@@ -277,7 +277,7 @@ class Client(ABC):
277
277
  if info["type"] == "directory":
278
278
  subdirs.add(subprefix)
279
279
  else:
280
- files.append(self.convert_info(info, subprefix))
280
+ files.append(self.info_to_file(info, subprefix))
281
281
  if files:
282
282
  await result_queue.put(files)
283
283
  found_count = len(subdirs) + len(files)
@@ -303,7 +303,7 @@ class Client(ABC):
303
303
  return f"{self.PREFIX}{self.name}/{rel_path}"
304
304
 
305
305
  @abstractmethod
306
- def convert_info(self, v: dict[str, Any], parent: str) -> Entry: ...
306
+ def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...
307
307
 
308
308
  def fetch_nodes(
309
309
  self,
@@ -363,7 +363,6 @@ class Client(ABC):
363
363
  parent["path"],
364
364
  parent["size"],
365
365
  parent["etag"],
366
- vtype=parent["vtype"],
367
366
  location=parent["location"],
368
367
  )
369
368
  f = self.open_object(parent_uid, use_cache=use_cache)
@@ -10,7 +10,6 @@ from gcsfs import GCSFileSystem
10
10
  from tqdm import tqdm
11
11
 
12
12
  from datachain.lib.file import File
13
- from datachain.node import Entry
14
13
 
15
14
  from .fsspec import DELIMITER, Client, ResultQueue
16
15
 
@@ -108,19 +107,9 @@ class GCSClient(Client):
108
107
  finally:
109
108
  await page_queue.put(None)
110
109
 
111
- def _entry_from_dict(self, d: dict[str, Any]) -> Entry:
110
+ def _entry_from_dict(self, d: dict[str, Any]) -> File:
112
111
  info = self.fs._process_object(self.name, d)
113
- return self.convert_info(info, self.rel_path(info["name"]))
114
-
115
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
116
- return Entry.from_file(
117
- path=path,
118
- etag=v.get("etag", ""),
119
- version=v.get("generation", ""),
120
- is_latest=not v.get("timeDeleted"),
121
- last_modified=self.parse_timestamp(v["updated"]),
122
- size=v.get("size", ""),
123
- )
112
+ return self.info_to_file(info, self.rel_path(info["name"]))
124
113
 
125
114
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
126
115
  return File(
@@ -5,7 +5,6 @@ from typing import Any, cast
5
5
  from huggingface_hub import HfFileSystem
6
6
 
7
7
  from datachain.lib.file import File
8
- from datachain.node import Entry
9
8
 
10
9
  from .fsspec import Client
11
10
 
@@ -22,15 +21,6 @@ class HfClient(Client):
22
21
 
23
22
  return cast(HfFileSystem, super().create_fs(**kwargs))
24
23
 
25
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
26
- return Entry.from_file(
27
- path=path,
28
- size=v["size"],
29
- version=v["last_commit"].oid,
30
- etag=v.get("blob_id", ""),
31
- last_modified=v["last_commit"].date,
32
- )
33
-
34
24
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
35
25
  return File(
36
26
  path=path,
@@ -7,8 +7,8 @@ from urllib.parse import urlparse
7
7
 
8
8
  from fsspec.implementations.local import LocalFileSystem
9
9
 
10
+ from datachain.cache import UniqueId
10
11
  from datachain.lib.file import File
11
- from datachain.node import Entry
12
12
  from datachain.storage import StorageURI
13
13
 
14
14
  from .fsspec import Client
@@ -114,9 +114,9 @@ class FileClient(Client):
114
114
  use_symlinks=use_symlinks,
115
115
  )
116
116
 
117
- async def get_current_etag(self, uid) -> str:
117
+ async def get_current_etag(self, uid: UniqueId) -> str:
118
118
  info = self.fs.info(self.get_full_path(uid.path))
119
- return self.convert_info(info, "").etag
119
+ return self.info_to_file(info, "").etag
120
120
 
121
121
  async def get_size(self, path: str) -> int:
122
122
  return self.fs.size(path)
@@ -136,15 +136,6 @@ class FileClient(Client):
136
136
  full_path += "/"
137
137
  return full_path
138
138
 
139
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
140
- return Entry.from_file(
141
- path=path,
142
- etag=v["mtime"].hex(),
143
- is_latest=True,
144
- last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
145
- size=v.get("size", ""),
146
- )
147
-
148
139
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
149
140
  return File(
150
141
  source=self.uri,
@@ -1,12 +1,11 @@
1
1
  import asyncio
2
- from typing import Any, cast
2
+ from typing import Any, Optional, cast
3
3
 
4
4
  from botocore.exceptions import NoCredentialsError
5
5
  from s3fs import S3FileSystem
6
6
  from tqdm import tqdm
7
7
 
8
8
  from datachain.lib.file import File
9
- from datachain.node import Entry
10
9
 
11
10
  from .fsspec import DELIMITER, Client, ResultQueue
12
11
 
@@ -111,24 +110,23 @@ class ClientS3(Client):
111
110
  ) -> None:
112
111
  await self._fetch_flat(start_prefix, result_queue)
113
112
 
114
- def _entry_from_boto(self, v, bucket, versions=False):
115
- return Entry.from_file(
113
+ def _entry_from_boto(self, v, bucket, versions=False) -> File:
114
+ return File(
115
+ source=self.uri,
116
116
  path=v["Key"],
117
117
  etag=v.get("ETag", "").strip('"'),
118
118
  version=ClientS3.clean_s3_version(v.get("VersionId", "")),
119
119
  is_latest=v.get("IsLatest", True),
120
120
  last_modified=v.get("LastModified", ""),
121
121
  size=v["Size"],
122
- owner_name=v.get("Owner", {}).get("DisplayName", ""),
123
- owner_id=v.get("Owner", {}).get("ID", ""),
124
122
  )
125
123
 
126
124
  async def _fetch_dir(
127
125
  self,
128
126
  prefix,
129
127
  pbar,
130
- result_queue,
131
- ):
128
+ result_queue: ResultQueue,
129
+ ) -> set[str]:
132
130
  if prefix:
133
131
  prefix = prefix.lstrip(DELIMITER) + DELIMITER
134
132
  files = []
@@ -143,7 +141,7 @@ class ClientS3(Client):
143
141
  if info["type"] == "directory":
144
142
  subdirs.add(subprefix)
145
143
  else:
146
- files.append(self.convert_info(info, subprefix))
144
+ files.append(self.info_to_file(info, subprefix))
147
145
  pbar.update()
148
146
  found = True
149
147
  if not found:
@@ -154,20 +152,8 @@ class ClientS3(Client):
154
152
  return subdirs
155
153
 
156
154
  @staticmethod
157
- def clean_s3_version(ver):
158
- return ver if ver != "null" else ""
159
-
160
- def convert_info(self, v: dict[str, Any], path: str) -> Entry:
161
- return Entry.from_file(
162
- path=path,
163
- etag=v.get("ETag", "").strip('"'),
164
- version=ClientS3.clean_s3_version(v.get("VersionId", "")),
165
- is_latest=v.get("IsLatest", True),
166
- last_modified=v.get("LastModified", ""),
167
- size=v["size"],
168
- owner_name=v.get("Owner", {}).get("DisplayName", ""),
169
- owner_id=v.get("Owner", {}).get("ID", ""),
170
- )
155
+ def clean_s3_version(ver: Optional[str]) -> str:
156
+ return ver if (ver is not None and ver != "null") else ""
171
157
 
172
158
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
173
159
  return File(
@@ -10,9 +10,8 @@ from typing import (
10
10
 
11
11
  import sqlalchemy as sa
12
12
  from sqlalchemy.sql import func as f
13
- from sqlalchemy.sql.expression import null, true
13
+ from sqlalchemy.sql.expression import false, null, true
14
14
 
15
- from datachain.node import DirType
16
15
  from datachain.sql.functions import path
17
16
  from datachain.sql.types import Int, SQLType, UInt64
18
17
 
@@ -81,8 +80,7 @@ class DirExpansion:
81
80
  def base_select(q):
82
81
  return sa.select(
83
82
  q.c.sys__id,
84
- q.c.vtype,
85
- (q.c.dir_type == DirType.DIR).label("is_dir"),
83
+ false().label("is_dir"),
86
84
  q.c.source,
87
85
  q.c.path,
88
86
  q.c.version,
@@ -94,7 +92,6 @@ class DirExpansion:
94
92
  return (
95
93
  sa.select(
96
94
  f.min(q.c.sys__id).label("sys__id"),
97
- q.c.vtype,
98
95
  q.c.is_dir,
99
96
  q.c.source,
100
97
  q.c.path,
@@ -102,8 +99,8 @@ class DirExpansion:
102
99
  f.max(q.c.location).label("location"),
103
100
  )
104
101
  .select_from(q)
105
- .group_by(q.c.source, q.c.path, q.c.vtype, q.c.is_dir, q.c.version)
106
- .order_by(q.c.source, q.c.path, q.c.vtype, q.c.is_dir, q.c.version)
102
+ .group_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
103
+ .order_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
107
104
  )
108
105
 
109
106
  @classmethod
@@ -113,7 +110,6 @@ class DirExpansion:
113
110
  q = q.union_all(
114
111
  sa.select(
115
112
  sa.literal(-1).label("sys__id"),
116
- sa.literal("").label("vtype"),
117
113
  true().label("is_dir"),
118
114
  q.c.source,
119
115
  parent.label("path"),
@@ -43,6 +43,8 @@ if TYPE_CHECKING:
43
43
  from sqlalchemy.sql.elements import ColumnElement
44
44
  from sqlalchemy.types import TypeEngine
45
45
 
46
+ from datachain.lib.file import File
47
+
46
48
 
47
49
  logger = logging.getLogger("datachain")
48
50
 
@@ -58,6 +60,10 @@ quote_schema = sqlite_dialect.identifier_preparer.quote_schema
58
60
  quote = sqlite_dialect.identifier_preparer.quote
59
61
 
60
62
 
63
+ def _get_in_memory_uri():
64
+ return "file::memory:?cache=shared"
65
+
66
+
61
67
  def get_retry_sleep_sec(retry_count: int) -> int:
62
68
  return RETRY_START_SEC * (RETRY_FACTOR**retry_count)
63
69
 
@@ -119,7 +125,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
119
125
  if db_file == ":memory:":
120
126
  # Enable multithreaded usage of the same in-memory db
121
127
  db = sqlite3.connect(
122
- "file::memory:?cache=shared", uri=True, detect_types=DETECT_TYPES
128
+ _get_in_memory_uri(), uri=True, detect_types=DETECT_TYPES
123
129
  )
124
130
  else:
125
131
  db = sqlite3.connect(
@@ -704,6 +710,9 @@ class SQLiteWarehouse(AbstractWarehouse):
704
710
 
705
711
  self.db.execute(insert_query)
706
712
 
713
+ def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
714
+ return (e.model_dump() for e in entries)
715
+
707
716
  def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
708
717
  rows = list(rows)
709
718
  if not rows: