datachain 0.3.2__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (264)
  1. {datachain-0.3.2 → datachain-0.3.3}/.github/workflows/benchmarks.yml +9 -6
  2. datachain-0.3.3/.github/workflows/tests-studio.yml +103 -0
  3. {datachain-0.3.2 → datachain-0.3.3}/.github/workflows/tests.yml +0 -92
  4. {datachain-0.3.2/src/datachain.egg-info → datachain-0.3.3}/PKG-INFO +2 -2
  5. {datachain-0.3.2 → datachain-0.3.3}/examples/computer_vision/openimage-detect.py +1 -1
  6. {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/common_sql_functions.py +2 -2
  7. {datachain-0.3.2 → datachain-0.3.3}/pyproject.toml +1 -1
  8. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/asyn.py +20 -0
  9. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/catalog/catalog.py +2 -0
  10. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/catalog/loader.py +75 -50
  11. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/azure.py +13 -0
  12. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/gcs.py +12 -0
  13. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/local.py +11 -0
  14. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/s3.py +12 -0
  15. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/sqlite.py +55 -14
  16. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/warehouse.py +17 -3
  17. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/arrow.py +1 -1
  18. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/values_to_tuples.py +14 -8
  19. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/data_model.py +1 -0
  20. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/dc.py +25 -6
  21. datachain-0.3.3/src/datachain/lib/listing.py +111 -0
  22. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/dataset.py +22 -12
  23. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/session.py +9 -2
  24. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/sqlite/base.py +30 -4
  25. {datachain-0.3.2 → datachain-0.3.3/src/datachain.egg-info}/PKG-INFO +2 -2
  26. {datachain-0.3.2 → datachain-0.3.3}/src/datachain.egg-info/SOURCES.txt +8 -22
  27. {datachain-0.3.2 → datachain-0.3.3}/src/datachain.egg-info/requires.txt +1 -1
  28. {datachain-0.3.2 → datachain-0.3.3}/tests/benchmarks/conftest.py +6 -0
  29. datachain-0.3.3/tests/benchmarks/datasets/.dvc/.gitignore +3 -0
  30. datachain-0.3.3/tests/benchmarks/datasets/.dvc/config +4 -0
  31. datachain-0.3.3/tests/benchmarks/datasets/.gitignore +1 -0
  32. datachain-0.3.3/tests/benchmarks/datasets/laion-tiny.npz.dvc +5 -0
  33. datachain-0.3.3/tests/benchmarks/test_datachain.py +22 -0
  34. datachain-0.3.3/tests/func/test_listing.py +34 -0
  35. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_datachain.py +169 -42
  36. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_datachain_merge.py +35 -2
  37. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_path.py +2 -1
  38. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_asyn.py +29 -1
  39. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_catalog_loader.py +41 -0
  40. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_database_engine.py +21 -1
  41. datachain-0.3.2/examples/computer_vision/fashion_product_images/.gitignore +0 -5
  42. datachain-0.3.2/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -2211
  43. datachain-0.3.2/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -4103
  44. datachain-0.3.2/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -1081
  45. datachain-0.3.2/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -754
  46. datachain-0.3.2/examples/computer_vision/fashion_product_images/README.md +0 -60
  47. datachain-0.3.2/examples/computer_vision/fashion_product_images/requirements.txt +0 -6
  48. datachain-0.3.2/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -47
  49. datachain-0.3.2/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -47
  50. datachain-0.3.2/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -36
  51. datachain-0.3.2/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -44
  52. datachain-0.3.2/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -52
  53. datachain-0.3.2/examples/computer_vision/fashion_product_images/src/clustering.py +0 -41
  54. datachain-0.3.2/examples/computer_vision/fashion_product_images/src/train.py +0 -143
  55. datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  56. datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  57. datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  58. datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  59. datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  60. datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  61. datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  62. datachain-0.3.2/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  63. {datachain-0.3.2 → datachain-0.3.3}/.cruft.json +0 -0
  64. {datachain-0.3.2 → datachain-0.3.3}/.gitattributes +0 -0
  65. {datachain-0.3.2 → datachain-0.3.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  66. {datachain-0.3.2 → datachain-0.3.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  67. {datachain-0.3.2 → datachain-0.3.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  68. {datachain-0.3.2 → datachain-0.3.3}/.github/codecov.yaml +0 -0
  69. {datachain-0.3.2 → datachain-0.3.3}/.github/dependabot.yml +0 -0
  70. {datachain-0.3.2 → datachain-0.3.3}/.github/workflows/release.yml +0 -0
  71. {datachain-0.3.2 → datachain-0.3.3}/.github/workflows/update-template.yaml +0 -0
  72. {datachain-0.3.2 → datachain-0.3.3}/.gitignore +0 -0
  73. {datachain-0.3.2 → datachain-0.3.3}/.pre-commit-config.yaml +0 -0
  74. {datachain-0.3.2 → datachain-0.3.3}/CODE_OF_CONDUCT.rst +0 -0
  75. {datachain-0.3.2 → datachain-0.3.3}/CONTRIBUTING.rst +0 -0
  76. {datachain-0.3.2 → datachain-0.3.3}/LICENSE +0 -0
  77. {datachain-0.3.2 → datachain-0.3.3}/README.rst +0 -0
  78. {datachain-0.3.2 → datachain-0.3.3}/docs/assets/captioned_cartoons.png +0 -0
  79. {datachain-0.3.2 → datachain-0.3.3}/docs/assets/datachain.png +0 -0
  80. {datachain-0.3.2 → datachain-0.3.3}/docs/assets/flowchart.png +0 -0
  81. {datachain-0.3.2 → datachain-0.3.3}/docs/index.md +0 -0
  82. {datachain-0.3.2 → datachain-0.3.3}/docs/references/datachain.md +0 -0
  83. {datachain-0.3.2 → datachain-0.3.3}/docs/references/datatype.md +0 -0
  84. {datachain-0.3.2 → datachain-0.3.3}/docs/references/file.md +0 -0
  85. {datachain-0.3.2 → datachain-0.3.3}/docs/references/index.md +0 -0
  86. {datachain-0.3.2 → datachain-0.3.3}/docs/references/sql.md +0 -0
  87. {datachain-0.3.2 → datachain-0.3.3}/docs/references/torch.md +0 -0
  88. {datachain-0.3.2 → datachain-0.3.3}/docs/references/udf.md +0 -0
  89. {datachain-0.3.2 → datachain-0.3.3}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
  90. {datachain-0.3.2 → datachain-0.3.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  91. {datachain-0.3.2 → datachain-0.3.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  92. {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/json-csv-reader.py +0 -0
  93. {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/torch-loader.py +0 -0
  94. {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/udfs/parallel.py +0 -0
  95. {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/udfs/simple.py +0 -0
  96. {datachain-0.3.2 → datachain-0.3.3}/examples/get_started/udfs/stateful.py +0 -0
  97. {datachain-0.3.2 → datachain-0.3.3}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
  98. {datachain-0.3.2 → datachain-0.3.3}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
  99. {datachain-0.3.2 → datachain-0.3.3}/examples/llm_and_nlp/llm-claude.py +0 -0
  100. {datachain-0.3.2 → datachain-0.3.3}/examples/llm_and_nlp/unstructured-text.py +0 -0
  101. {datachain-0.3.2 → datachain-0.3.3}/examples/multimodal/clip_inference.py +0 -0
  102. {datachain-0.3.2 → datachain-0.3.3}/examples/multimodal/hf_pipeline.py +0 -0
  103. {datachain-0.3.2 → datachain-0.3.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
  104. {datachain-0.3.2 → datachain-0.3.3}/examples/multimodal/wds.py +0 -0
  105. {datachain-0.3.2 → datachain-0.3.3}/examples/multimodal/wds_filtered.py +0 -0
  106. {datachain-0.3.2 → datachain-0.3.3}/mkdocs.yml +0 -0
  107. {datachain-0.3.2 → datachain-0.3.3}/noxfile.py +0 -0
  108. {datachain-0.3.2 → datachain-0.3.3}/setup.cfg +0 -0
  109. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/__init__.py +0 -0
  110. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/__main__.py +0 -0
  111. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/cache.py +0 -0
  112. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/catalog/__init__.py +0 -0
  113. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/catalog/datasource.py +0 -0
  114. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/catalog/subclass.py +0 -0
  115. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/cli.py +0 -0
  116. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/cli_utils.py +0 -0
  117. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/__init__.py +0 -0
  118. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/fileslice.py +0 -0
  119. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/client/fsspec.py +0 -0
  120. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/config.py +0 -0
  121. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/__init__.py +0 -0
  122. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/db_engine.py +0 -0
  123. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/id_generator.py +0 -0
  124. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/job.py +0 -0
  125. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/metastore.py +0 -0
  126. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/schema.py +0 -0
  127. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/data_storage/serializer.py +0 -0
  128. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/dataset.py +0 -0
  129. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/error.py +0 -0
  130. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/job.py +0 -0
  131. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/__init__.py +0 -0
  132. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/clip.py +0 -0
  133. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/__init__.py +0 -0
  134. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/flatten.py +0 -0
  135. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
  136. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
  137. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/convert/unflatten.py +0 -0
  138. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/dataset_info.py +0 -0
  139. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/file.py +0 -0
  140. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/image.py +0 -0
  141. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/meta_formats.py +0 -0
  142. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/model_store.py +0 -0
  143. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/pytorch.py +0 -0
  144. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/settings.py +0 -0
  145. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/signal_schema.py +0 -0
  146. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/text.py +0 -0
  147. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/udf.py +0 -0
  148. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/udf_signature.py +0 -0
  149. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/utils.py +0 -0
  150. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/vfile.py +0 -0
  151. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/webdataset.py +0 -0
  152. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/lib/webdataset_laion.py +0 -0
  153. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/listing.py +0 -0
  154. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/node.py +0 -0
  155. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/nodes_fetcher.py +0 -0
  156. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/nodes_thread_pool.py +0 -0
  157. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/progress.py +0 -0
  158. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/py.typed +0 -0
  159. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/__init__.py +0 -0
  160. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/batch.py +0 -0
  161. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/builtins.py +0 -0
  162. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/dispatch.py +0 -0
  163. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/metrics.py +0 -0
  164. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/params.py +0 -0
  165. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/queue.py +0 -0
  166. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/schema.py +0 -0
  167. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/query/udf.py +0 -0
  168. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/remote/__init__.py +0 -0
  169. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/remote/studio.py +0 -0
  170. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/__init__.py +0 -0
  171. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/default/__init__.py +0 -0
  172. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/default/base.py +0 -0
  173. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/__init__.py +0 -0
  174. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/array.py +0 -0
  175. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/conditional.py +0 -0
  176. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/path.py +0 -0
  177. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/random.py +0 -0
  178. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/functions/string.py +0 -0
  179. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/selectable.py +0 -0
  180. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/sqlite/__init__.py +0 -0
  181. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/sqlite/types.py +0 -0
  182. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/sqlite/vector.py +0 -0
  183. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/types.py +0 -0
  184. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/sql/utils.py +0 -0
  185. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/storage.py +0 -0
  186. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/torch/__init__.py +0 -0
  187. {datachain-0.3.2 → datachain-0.3.3}/src/datachain/utils.py +0 -0
  188. {datachain-0.3.2 → datachain-0.3.3}/src/datachain.egg-info/dependency_links.txt +0 -0
  189. {datachain-0.3.2 → datachain-0.3.3}/src/datachain.egg-info/entry_points.txt +0 -0
  190. {datachain-0.3.2 → datachain-0.3.3}/src/datachain.egg-info/top_level.txt +0 -0
  191. {datachain-0.3.2 → datachain-0.3.3}/tests/__init__.py +0 -0
  192. {datachain-0.3.2 → datachain-0.3.3}/tests/benchmarks/__init__.py +0 -0
  193. {datachain-0.3.2 → datachain-0.3.3}/tests/benchmarks/test_ls.py +0 -0
  194. {datachain-0.3.2 → datachain-0.3.3}/tests/benchmarks/test_version.py +0 -0
  195. {datachain-0.3.2 → datachain-0.3.3}/tests/conftest.py +0 -0
  196. {datachain-0.3.2 → datachain-0.3.3}/tests/data.py +0 -0
  197. {datachain-0.3.2 → datachain-0.3.3}/tests/examples/__init__.py +0 -0
  198. {datachain-0.3.2 → datachain-0.3.3}/tests/examples/test_examples.py +0 -0
  199. {datachain-0.3.2 → datachain-0.3.3}/tests/examples/test_wds_e2e.py +0 -0
  200. {datachain-0.3.2 → datachain-0.3.3}/tests/examples/wds_data.py +0 -0
  201. {datachain-0.3.2 → datachain-0.3.3}/tests/func/__init__.py +0 -0
  202. {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_catalog.py +0 -0
  203. {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_client.py +0 -0
  204. {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_datachain.py +0 -0
  205. {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_dataset_query.py +0 -0
  206. {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_datasets.py +0 -0
  207. {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_feature_pickling.py +0 -0
  208. {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_ls.py +0 -0
  209. {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_pull.py +0 -0
  210. {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_pytorch.py +0 -0
  211. {datachain-0.3.2 → datachain-0.3.3}/tests/func/test_query.py +0 -0
  212. {datachain-0.3.2 → datachain-0.3.3}/tests/scripts/feature_class.py +0 -0
  213. {datachain-0.3.2 → datachain-0.3.3}/tests/scripts/feature_class_parallel.py +0 -0
  214. {datachain-0.3.2 → datachain-0.3.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  215. {datachain-0.3.2 → datachain-0.3.3}/tests/scripts/name_len_slow.py +0 -0
  216. {datachain-0.3.2 → datachain-0.3.3}/tests/test_cli_e2e.py +0 -0
  217. {datachain-0.3.2 → datachain-0.3.3}/tests/test_query_e2e.py +0 -0
  218. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/__init__.py +0 -0
  219. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/__init__.py +0 -0
  220. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/conftest.py +0 -0
  221. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_arrow.py +0 -0
  222. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_clip.py +0 -0
  223. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  224. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_feature.py +0 -0
  225. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_feature_utils.py +0 -0
  226. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_file.py +0 -0
  227. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_image.py +0 -0
  228. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_schema.py +0 -0
  229. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_signal_schema.py +0 -0
  230. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_sql_to_python.py +0 -0
  231. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_text.py +0 -0
  232. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_udf_signature.py +0 -0
  233. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_utils.py +0 -0
  234. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/lib/test_webdataset.py +0 -0
  235. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/__init__.py +0 -0
  236. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/sqlite/__init__.py +0 -0
  237. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
  238. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_array.py +0 -0
  239. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_conditional.py +0 -0
  240. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_random.py +0 -0
  241. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_selectable.py +0 -0
  242. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/sql/test_string.py +0 -0
  243. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_cache.py +0 -0
  244. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_catalog.py +0 -0
  245. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_cli_parsing.py +0 -0
  246. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_client.py +0 -0
  247. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_client_s3.py +0 -0
  248. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_data_storage.py +0 -0
  249. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_dataset.py +0 -0
  250. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_dispatch.py +0 -0
  251. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_fileslice.py +0 -0
  252. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_id_generator.py +0 -0
  253. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_listing.py +0 -0
  254. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_metastore.py +0 -0
  255. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_module_exports.py +0 -0
  256. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_query_metrics.py +0 -0
  257. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_query_params.py +0 -0
  258. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_serializer.py +0 -0
  259. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_session.py +0 -0
  260. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_storage.py +0 -0
  261. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_udf.py +0 -0
  262. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_utils.py +0 -0
  263. {datachain-0.3.2 → datachain-0.3.3}/tests/unit/test_warehouse.py +0 -0
  264. {datachain-0.3.2 → datachain-0.3.3}/tests/utils.py +0 -0
@@ -5,23 +5,24 @@ on:
5
5
  - cron: '0 0 * * *'
6
6
  pull_request:
7
7
  types: [opened, reopened, labeled, synchronize]
8
- workflow_dispatch: {}
8
+ workflow_dispatch:
9
9
 
10
10
  env:
11
11
  FORCE_COLOR: "1"
12
12
 
13
+ concurrency:
14
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
15
+ cancel-in-progress: true
16
+
13
17
  jobs:
14
18
  run:
15
- if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') }}
16
19
  runs-on: ubuntu-latest
17
-
18
20
  steps:
19
21
  - uses: actions/checkout@v4
20
-
21
- - name: Set up Python 3.10
22
+ - name: Set up Python 3.12
22
23
  uses: actions/setup-python@v5
23
24
  with:
24
- python-version: '3.10'
25
+ python-version: '3.12'
25
26
  cache: 'pip'
26
27
 
27
28
  - name: Upgrade nox and uv
@@ -30,5 +31,7 @@ jobs:
30
31
  nox --version
31
32
  uv --version
32
33
 
34
+ - run: uv pip install dvc[gs] --system
35
+ - run: dvc --cd tests/benchmarks/datasets pull
33
36
  - name: Run benchmarks
34
37
  run: nox -s bench
@@ -0,0 +1,103 @@
1
+ name: Studio Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ workflow_dispatch:
8
+
9
+ env:
10
+ FORCE_COLOR: "1"
11
+ BRANCH: ${{ github.head_ref || github.ref_name }}
12
+
13
+ concurrency:
14
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
15
+ cancel-in-progress: true
16
+
17
+ jobs:
18
+ studio:
19
+ if: '!github.event.pull_request.head.repo.fork'
20
+ runs-on: ubuntu-latest-16-cores
21
+ strategy:
22
+ matrix:
23
+ pyv: ['3.12']
24
+ group: [1, 2, 3, 4, 5, 6]
25
+ services:
26
+ postgres:
27
+ image: postgres:16.3
28
+ ports:
29
+ - 5432:5432
30
+ env:
31
+ POSTGRES_USER: test
32
+ POSTGRES_DB: database
33
+ POSTGRES_HOST_AUTH_METHOD: trust
34
+ clickhouse:
35
+ image: clickhouse/clickhouse-server:24
36
+ ports:
37
+ - 8123:8123
38
+ - 9010:9000
39
+ env:
40
+ CLICKHOUSE_DB: studio_local_db
41
+ CLICKHOUSE_USER: studio_local
42
+ CLICKHOUSE_PASSWORD: ch123456789!
43
+ CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
44
+ redis:
45
+ image: redis:7.2.5
46
+ ports:
47
+ - 6379:6379
48
+ steps:
49
+ - name: Studio branch name
50
+ env:
51
+ BRANCH: ${{ env.BRANCH }}
52
+ STUDIO_READ_ACCESS_TOKEN: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
53
+ run: |
54
+ echo "DataChain branch: $BRANCH"
55
+ if [[ "$BRANCH" == "main" ]]
56
+ then
57
+ STUDIO_BRANCH=develop
58
+ elif git ls-remote --heads https://"$STUDIO_READ_ACCESS_TOKEN"@github.com/iterative/studio.git "$BRANCH" | grep -F "$BRANCH" 2>&1>/dev/null
59
+ then
60
+ STUDIO_BRANCH="$BRANCH"
61
+ else
62
+ STUDIO_BRANCH=develop
63
+ fi
64
+ echo "STUDIO_BRANCH=$STUDIO_BRANCH" >> $GITHUB_ENV
65
+ echo "Studio branch: $STUDIO_BRANCH"
66
+
67
+ - name: Check out Studio
68
+ uses: actions/checkout@v4
69
+ with:
70
+ fetch-depth: 0
71
+ repository: iterative/studio
72
+ ref: ${{ env.STUDIO_BRANCH }}
73
+ token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
74
+
75
+ - name: Check out repository
76
+ uses: actions/checkout@v4
77
+ with:
78
+ path: './backend/datachain'
79
+ fetch-depth: 0
80
+
81
+ - name: Set up Python ${{ matrix.pyv }}
82
+ uses: actions/setup-python@v5
83
+ with:
84
+ python-version: ${{ matrix.pyv }}
85
+ cache: 'pip'
86
+
87
+ - name: Install uv
88
+ run: |
89
+ python -m pip install --upgrade uv
90
+ uv --version
91
+
92
+ - name: Install dependencies
93
+ run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
94
+
95
+ - name: Run tests
96
+ # Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
97
+ run: >
98
+ pytest
99
+ --config-file=pyproject.toml -rs
100
+ --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
101
+ -m 'not benchmark'
102
+ tests ../datachain/tests
103
+ working-directory: backend/datachain_server
@@ -8,7 +8,6 @@ on:
8
8
 
9
9
  env:
10
10
  FORCE_COLOR: "1"
11
- BRANCH: ${{ github.head_ref || github.ref_name }}
12
11
 
13
12
  concurrency:
14
13
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -18,7 +17,6 @@ jobs:
18
17
  lint:
19
18
  runs-on: ubuntu-latest
20
19
  steps:
21
-
22
20
  - name: Check out the repository
23
21
  uses: actions/checkout@v4
24
22
  with:
@@ -112,95 +110,6 @@ jobs:
112
110
  - name: Build docs
113
111
  run: nox -s docs
114
112
 
115
-
116
- studio:
117
- if: '!github.event.pull_request.head.repo.fork'
118
- runs-on: ubuntu-latest-16-cores
119
- strategy:
120
- matrix:
121
- pyv: ['3.12']
122
- group: [1, 2, 3, 4, 5, 6]
123
- services:
124
- postgres:
125
- image: postgres:16.3
126
- ports:
127
- - 5432:5432
128
- env:
129
- POSTGRES_USER: test
130
- POSTGRES_DB: database
131
- POSTGRES_HOST_AUTH_METHOD: trust
132
- clickhouse:
133
- image: clickhouse/clickhouse-server:24
134
- ports:
135
- - 8123:8123
136
- - 9010:9000
137
- env:
138
- CLICKHOUSE_DB: studio_local_db
139
- CLICKHOUSE_USER: studio_local
140
- CLICKHOUSE_PASSWORD: ch123456789!
141
- CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
142
- redis:
143
- image: redis:7.2.5
144
- ports:
145
- - 6379:6379
146
- steps:
147
-
148
- - name: Studio branch name
149
- env:
150
- BRANCH: ${{ env.BRANCH }}
151
- STUDIO_READ_ACCESS_TOKEN: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
152
- run: |
153
- echo "DataChain branch: $BRANCH"
154
- if [[ "$BRANCH" == "main" ]]
155
- then
156
- STUDIO_BRANCH=develop
157
- elif git ls-remote --heads https://"$STUDIO_READ_ACCESS_TOKEN"@github.com/iterative/studio.git "$BRANCH" | grep -F "$BRANCH" 2>&1>/dev/null
158
- then
159
- STUDIO_BRANCH="$BRANCH"
160
- else
161
- STUDIO_BRANCH=develop
162
- fi
163
- echo "STUDIO_BRANCH=$STUDIO_BRANCH" >> $GITHUB_ENV
164
- echo "Studio branch: $STUDIO_BRANCH"
165
-
166
- - name: Check out Studio
167
- uses: actions/checkout@v4
168
- with:
169
- fetch-depth: 0
170
- repository: iterative/studio
171
- ref: ${{ env.STUDIO_BRANCH }}
172
- token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
173
-
174
- - name: Check out repository
175
- uses: actions/checkout@v4
176
- with:
177
- path: './backend/datachain'
178
- fetch-depth: 0
179
-
180
- - name: Set up Python ${{ matrix.pyv }}
181
- uses: actions/setup-python@v5
182
- with:
183
- python-version: ${{ matrix.pyv }}
184
- cache: 'pip'
185
-
186
- - name: Install uv
187
- run: |
188
- python -m pip install --upgrade uv
189
- uv --version
190
-
191
- - name: Install dependencies
192
- run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
193
-
194
- - name: Run tests
195
- # Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
196
- run: >
197
- pytest
198
- --config-file=pyproject.toml -rs
199
- --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
200
- tests ../datachain/tests
201
- working-directory: backend/datachain_server
202
-
203
-
204
113
  examples:
205
114
  runs-on: ${{ matrix.os }}
206
115
  timeout-minutes: 60
@@ -211,7 +120,6 @@ jobs:
211
120
  pyv: ['3.9', '3.12']
212
121
  group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
213
122
  steps:
214
-
215
123
  - uses: actions/checkout@v4
216
124
 
217
125
  - name: Set up Python ${{ matrix.pyv }}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -76,7 +76,7 @@ Requires-Dist: aiotools>=1.7.0; extra == "tests"
76
76
  Requires-Dist: requests-mock; extra == "tests"
77
77
  Provides-Extra: dev
78
78
  Requires-Dist: datachain[docs,tests]; extra == "dev"
79
- Requires-Dist: mypy==1.10.1; extra == "dev"
79
+ Requires-Dist: mypy==1.11.1; extra == "dev"
80
80
  Requires-Dist: types-python-dateutil; extra == "dev"
81
81
  Requires-Dist: types-pytz; extra == "dev"
82
82
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -54,7 +54,7 @@ source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
54
54
  .filter(C("file.path").glob("*.jpg") | C("file.path").glob("*.json"))
55
55
  .agg(
56
56
  openimage_detect,
57
- partition_by=path.file_stem(path.name(C("file.path"))),
57
+ partition_by=path.file_stem(C("file.path")),
58
58
  params=["file"],
59
59
  output={"file": File, "bbox": BBox},
60
60
  )
@@ -26,8 +26,8 @@ dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
26
26
 
27
27
  (
28
28
  dc.mutate(
29
- stem=path.file_stem(path.name(C("file.path"))),
30
- ext=path.file_ext(path.name(C("file.path"))),
29
+ stem=path.file_stem(C("file.path")),
30
+ ext=path.file_ext(C("file.path")),
31
31
  )
32
32
  .select("file.path", "stem", "ext")
33
33
  .show(5)
@@ -87,7 +87,7 @@ tests = [
87
87
  ]
88
88
  dev = [
89
89
  "datachain[docs,tests]",
90
- "mypy==1.10.1",
90
+ "mypy==1.11.1",
91
91
  "types-python-dateutil",
92
92
  "types-pytz",
93
93
  "types-PyYAML",
@@ -224,3 +224,23 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
224
224
  async def _break_iteration(self) -> None:
225
225
  self.heap = []
226
226
  self._push_result(self._next_yield, None)
227
+
228
+
229
+ def iter_over_async(ait, loop):
230
+ """Wrap an asynchronous iterator into a synchronous one"""
231
+ ait = ait.__aiter__()
232
+
233
+ # helper async fn that just gets the next element from the async iterator
234
+ async def get_next():
235
+ try:
236
+ obj = await ait.__anext__()
237
+ return False, obj
238
+ except StopAsyncIteration:
239
+ return True, None
240
+
241
+ # actual sync iterator
242
+ while True:
243
+ done, obj = asyncio.run_coroutine_threadsafe(get_next(), loop).result()
244
+ if done:
245
+ break
246
+ yield obj
@@ -577,6 +577,7 @@ class Catalog:
577
577
  warehouse_ready_callback: Optional[
578
578
  Callable[["AbstractWarehouse"], None]
579
579
  ] = None,
580
+ in_memory: bool = False,
580
581
  ):
581
582
  datachain_dir = DataChainDir(cache=cache_dir, tmp=tmp_dir)
582
583
  datachain_dir.init()
@@ -590,6 +591,7 @@ class Catalog:
590
591
  "tmp_dir": tmp_dir,
591
592
  }
592
593
  self._warehouse_ready_callback = warehouse_ready_callback
594
+ self.in_memory = in_memory
593
595
 
594
596
  @cached_property
595
597
  def warehouse(self) -> "AbstractWarehouse":
@@ -28,8 +28,10 @@ WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
28
28
  DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
29
29
  DISTRIBUTED_ARG_PREFIX = "DATACHAIN_DISTRIBUTED_ARG_"
30
30
 
31
+ IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
31
32
 
32
- def get_id_generator() -> "AbstractIDGenerator":
33
+
34
+ def get_id_generator(in_memory: bool = False) -> "AbstractIDGenerator":
33
35
  id_generator_serialized = os.environ.get(ID_GENERATOR_SERIALIZED)
34
36
  if id_generator_serialized:
35
37
  id_generator_obj = deserialize(id_generator_serialized)
@@ -43,25 +45,31 @@ def get_id_generator() -> "AbstractIDGenerator":
43
45
  id_generator_import_path = os.environ.get(ID_GENERATOR_IMPORT_PATH)
44
46
  id_generator_arg_envs = get_envs_by_prefix(ID_GENERATOR_ARG_PREFIX)
45
47
  # Convert env variable names to keyword argument names by lowercasing them
46
- id_generator_args = {k.lower(): v for k, v in id_generator_arg_envs.items()}
47
-
48
- if id_generator_import_path:
49
- # ID generator paths are specified as (for example):
50
- # datachain.data_storage.SQLiteIDGenerator
51
- if "." not in id_generator_import_path:
52
- raise RuntimeError(
53
- f"Invalid {ID_GENERATOR_IMPORT_PATH} import path:"
54
- f"{id_generator_import_path}"
55
- )
56
- module_name, _, class_name = id_generator_import_path.rpartition(".")
57
- id_generator = import_module(module_name)
58
- id_generator_class = getattr(id_generator, class_name)
59
- else:
60
- id_generator_class = SQLiteIDGenerator
48
+ id_generator_args: dict[str, Any] = {
49
+ k.lower(): v for k, v in id_generator_arg_envs.items()
50
+ }
51
+
52
+ if not id_generator_import_path:
53
+ id_generator_args["in_memory"] = in_memory
54
+ return SQLiteIDGenerator(**id_generator_args)
55
+ if in_memory:
56
+ raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
57
+ # ID generator paths are specified as (for example):
58
+ # datachain.data_storage.SQLiteIDGenerator
59
+ if "." not in id_generator_import_path:
60
+ raise RuntimeError(
61
+ f"Invalid {ID_GENERATOR_IMPORT_PATH} import path:"
62
+ f"{id_generator_import_path}"
63
+ )
64
+ module_name, _, class_name = id_generator_import_path.rpartition(".")
65
+ id_generator = import_module(module_name)
66
+ id_generator_class = getattr(id_generator, class_name)
61
67
  return id_generator_class(**id_generator_args)
62
68
 
63
69
 
64
- def get_metastore(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractMetastore":
70
+ def get_metastore(
71
+ id_generator: Optional["AbstractIDGenerator"], in_memory: bool = False
72
+ ) -> "AbstractMetastore":
65
73
  metastore_serialized = os.environ.get(METASTORE_SERIALIZED)
66
74
  if metastore_serialized:
67
75
  metastore_obj = deserialize(metastore_serialized)
@@ -78,24 +86,32 @@ def get_metastore(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractMet
78
86
  metastore_import_path = os.environ.get(METASTORE_IMPORT_PATH)
79
87
  metastore_arg_envs = get_envs_by_prefix(METASTORE_ARG_PREFIX)
80
88
  # Convert env variable names to keyword argument names by lowercasing them
81
- metastore_args = {k.lower(): v for k, v in metastore_arg_envs.items()}
82
-
83
- if metastore_import_path:
84
- # Metastore paths are specified as (for example):
85
- # datachain.data_storage.SQLiteMetastore
86
- if "." not in metastore_import_path:
87
- raise RuntimeError(
88
- f"Invalid {METASTORE_IMPORT_PATH} import path: {metastore_import_path}"
89
- )
90
- module_name, _, class_name = metastore_import_path.rpartition(".")
91
- metastore = import_module(module_name)
92
- metastore_class = getattr(metastore, class_name)
93
- else:
94
- metastore_class = SQLiteMetastore
89
+ metastore_args: dict[str, Any] = {
90
+ k.lower(): v for k, v in metastore_arg_envs.items()
91
+ }
92
+
93
+ if not metastore_import_path:
94
+ if not isinstance(id_generator, SQLiteIDGenerator):
95
+ raise ValueError("SQLiteMetastore can only be used with SQLiteIDGenerator")
96
+ metastore_args["in_memory"] = in_memory
97
+ return SQLiteMetastore(id_generator, **metastore_args)
98
+ if in_memory:
99
+ raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
100
+ # Metastore paths are specified as (for example):
101
+ # datachain.data_storage.SQLiteMetastore
102
+ if "." not in metastore_import_path:
103
+ raise RuntimeError(
104
+ f"Invalid {METASTORE_IMPORT_PATH} import path: {metastore_import_path}"
105
+ )
106
+ module_name, _, class_name = metastore_import_path.rpartition(".")
107
+ metastore = import_module(module_name)
108
+ metastore_class = getattr(metastore, class_name)
95
109
  return metastore_class(id_generator, **metastore_args)
96
110
 
97
111
 
98
- def get_warehouse(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractWarehouse":
112
+ def get_warehouse(
113
+ id_generator: Optional["AbstractIDGenerator"], in_memory: bool = False
114
+ ) -> "AbstractWarehouse":
99
115
  warehouse_serialized = os.environ.get(WAREHOUSE_SERIALIZED)
100
116
  if warehouse_serialized:
101
117
  warehouse_obj = deserialize(warehouse_serialized)
@@ -112,20 +128,26 @@ def get_warehouse(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractWar
112
128
  warehouse_import_path = os.environ.get(WAREHOUSE_IMPORT_PATH)
113
129
  warehouse_arg_envs = get_envs_by_prefix(WAREHOUSE_ARG_PREFIX)
114
130
  # Convert env variable names to keyword argument names by lowercasing them
115
- warehouse_args = {k.lower(): v for k, v in warehouse_arg_envs.items()}
116
-
117
- if warehouse_import_path:
118
- # Warehouse paths are specified as (for example):
119
- # datachain.data_storage.SQLiteWarehouse
120
- if "." not in warehouse_import_path:
121
- raise RuntimeError(
122
- f"Invalid {WAREHOUSE_IMPORT_PATH} import path: {warehouse_import_path}"
123
- )
124
- module_name, _, class_name = warehouse_import_path.rpartition(".")
125
- warehouse = import_module(module_name)
126
- warehouse_class = getattr(warehouse, class_name)
127
- else:
128
- warehouse_class = SQLiteWarehouse
131
+ warehouse_args: dict[str, Any] = {
132
+ k.lower(): v for k, v in warehouse_arg_envs.items()
133
+ }
134
+
135
+ if not warehouse_import_path:
136
+ if not isinstance(id_generator, SQLiteIDGenerator):
137
+ raise ValueError("SQLiteWarehouse can only be used with SQLiteIDGenerator")
138
+ warehouse_args["in_memory"] = in_memory
139
+ return SQLiteWarehouse(id_generator, **warehouse_args)
140
+ if in_memory:
141
+ raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
142
+ # Warehouse paths are specified as (for example):
143
+ # datachain.data_storage.SQLiteWarehouse
144
+ if "." not in warehouse_import_path:
145
+ raise RuntimeError(
146
+ f"Invalid {WAREHOUSE_IMPORT_PATH} import path: {warehouse_import_path}"
147
+ )
148
+ module_name, _, class_name = warehouse_import_path.rpartition(".")
149
+ warehouse = import_module(module_name)
150
+ warehouse_class = getattr(warehouse, class_name)
129
151
  return warehouse_class(id_generator, **warehouse_args)
130
152
 
131
153
 
@@ -152,7 +174,9 @@ def get_distributed_class(**kwargs):
152
174
  return distributed_class(**distributed_args | kwargs)
153
175
 
154
176
 
155
- def get_catalog(client_config: Optional[dict[str, Any]] = None) -> Catalog:
177
+ def get_catalog(
178
+ client_config: Optional[dict[str, Any]] = None, in_memory: bool = False
179
+ ) -> Catalog:
156
180
  """
157
181
  Function that creates Catalog instance with appropriate metastore
158
182
  and warehouse classes. Metastore class can be provided with env variable
@@ -164,10 +188,11 @@ def get_catalog(client_config: Optional[dict[str, Any]] = None) -> Catalog:
164
188
  and name of variable after, e.g. if it accepts team_id as kwargs
165
189
  we can provide DATACHAIN_METASTORE_ARG_TEAM_ID=12345 env variable.
166
190
  """
167
- id_generator = get_id_generator()
191
+ id_generator = get_id_generator(in_memory=in_memory)
168
192
  return Catalog(
169
193
  id_generator=id_generator,
170
- metastore=get_metastore(id_generator),
171
- warehouse=get_warehouse(id_generator),
194
+ metastore=get_metastore(id_generator, in_memory=in_memory),
195
+ warehouse=get_warehouse(id_generator, in_memory=in_memory),
172
196
  client_config=client_config,
197
+ in_memory=in_memory,
173
198
  )
@@ -3,6 +3,7 @@ from typing import Any
3
3
  from adlfs import AzureBlobFileSystem
4
4
  from tqdm import tqdm
5
5
 
6
+ from datachain.lib.file import File
6
7
  from datachain.node import Entry
7
8
 
8
9
  from .fsspec import DELIMITER, Client, ResultQueue
@@ -24,6 +25,18 @@ class AzureClient(Client):
24
25
  size=v.get("size", ""),
25
26
  )
26
27
 
28
+ def info_to_file(self, v: dict[str, Any], path: str) -> File:
29
+ version_id = v.get("version_id")
30
+ return File(
31
+ source=self.uri,
32
+ path=path,
33
+ etag=v.get("etag", "").strip('"'),
34
+ version=version_id or "",
35
+ is_latest=version_id is None or bool(v.get("is_current_version")),
36
+ last_modified=v["last_modified"],
37
+ size=v.get("size", ""),
38
+ )
39
+
27
40
  async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
28
41
  prefix = start_prefix
29
42
  if prefix:
@@ -9,6 +9,7 @@ from dateutil.parser import isoparse
9
9
  from gcsfs import GCSFileSystem
10
10
  from tqdm import tqdm
11
11
 
12
+ from datachain.lib.file import File
12
13
  from datachain.node import Entry
13
14
 
14
15
  from .fsspec import DELIMITER, Client, ResultQueue
@@ -120,3 +121,14 @@ class GCSClient(Client):
120
121
  last_modified=self.parse_timestamp(v["updated"]),
121
122
  size=v.get("size", ""),
122
123
  )
124
+
125
+ def info_to_file(self, v: dict[str, Any], path: str) -> File:
126
+ return File(
127
+ source=self.uri,
128
+ path=path,
129
+ etag=v.get("etag", ""),
130
+ version=v.get("generation", ""),
131
+ is_latest=not v.get("timeDeleted"),
132
+ last_modified=self.parse_timestamp(v["updated"]),
133
+ size=v.get("size", ""),
134
+ )
@@ -7,6 +7,7 @@ from urllib.parse import urlparse
7
7
 
8
8
  from fsspec.implementations.local import LocalFileSystem
9
9
 
10
+ from datachain.lib.file import File
10
11
  from datachain.node import Entry
11
12
  from datachain.storage import StorageURI
12
13
 
@@ -144,6 +145,16 @@ class FileClient(Client):
144
145
  size=v.get("size", ""),
145
146
  )
146
147
 
148
+ def info_to_file(self, v: dict[str, Any], path: str) -> File:
149
+ return File(
150
+ source=self.uri,
151
+ path=path,
152
+ size=v.get("size", ""),
153
+ etag=v["mtime"].hex(),
154
+ is_latest=True,
155
+ last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
156
+ )
157
+
147
158
  def fetch_nodes(
148
159
  self,
149
160
  nodes,
@@ -5,6 +5,7 @@ from botocore.exceptions import NoCredentialsError
5
5
  from s3fs import S3FileSystem
6
6
  from tqdm import tqdm
7
7
 
8
+ from datachain.lib.file import File
8
9
  from datachain.node import Entry
9
10
 
10
11
  from .fsspec import DELIMITER, Client, ResultQueue
@@ -167,3 +168,14 @@ class ClientS3(Client):
167
168
  owner_name=v.get("Owner", {}).get("DisplayName", ""),
168
169
  owner_id=v.get("Owner", {}).get("ID", ""),
169
170
  )
171
+
172
+ def info_to_file(self, v: dict[str, Any], path: str) -> File:
173
+ return File(
174
+ source=self.uri,
175
+ path=path,
176
+ size=v["size"],
177
+ version=ClientS3.clean_s3_version(v.get("VersionId", "")),
178
+ etag=v.get("ETag", "").strip('"'),
179
+ is_latest=v.get("IsLatest", True),
180
+ last_modified=v.get("LastModified", ""),
181
+ )