datachain 0.5.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain has been flagged as potentially problematic; consult the package registry's advisory page for details.

Files changed (251):
  1. {datachain-0.5.0 → datachain-0.5.1}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.5.0/src/datachain.egg-info → datachain-0.5.1}/PKG-INFO +1 -1
  3. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/catalog/catalog.py +8 -0
  4. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/metastore.py +20 -1
  5. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/sqlite.py +24 -32
  6. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/arrow.py +64 -19
  7. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/dc.py +113 -10
  8. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/udf.py +100 -78
  9. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/udf_signature.py +8 -6
  10. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/dataset.py +6 -6
  11. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/dispatch.py +2 -2
  12. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/session.py +42 -0
  13. {datachain-0.5.0 → datachain-0.5.1/src/datachain.egg-info}/PKG-INFO +1 -1
  14. {datachain-0.5.0 → datachain-0.5.1}/src/datachain.egg-info/SOURCES.txt +2 -1
  15. datachain-0.5.1/tests/scripts/feature_class_exception.py +24 -0
  16. datachain-0.5.1/tests/test_atomicity.py +58 -0
  17. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_datachain.py +169 -0
  18. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_id_generator.py +18 -0
  19. datachain-0.5.0/src/datachain/query/udf.py +0 -126
  20. {datachain-0.5.0 → datachain-0.5.1}/.cruft.json +0 -0
  21. {datachain-0.5.0 → datachain-0.5.1}/.gitattributes +0 -0
  22. {datachain-0.5.0 → datachain-0.5.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  23. {datachain-0.5.0 → datachain-0.5.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  24. {datachain-0.5.0 → datachain-0.5.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  25. {datachain-0.5.0 → datachain-0.5.1}/.github/codecov.yaml +0 -0
  26. {datachain-0.5.0 → datachain-0.5.1}/.github/dependabot.yml +0 -0
  27. {datachain-0.5.0 → datachain-0.5.1}/.github/workflows/benchmarks.yml +0 -0
  28. {datachain-0.5.0 → datachain-0.5.1}/.github/workflows/release.yml +0 -0
  29. {datachain-0.5.0 → datachain-0.5.1}/.github/workflows/tests-studio.yml +0 -0
  30. {datachain-0.5.0 → datachain-0.5.1}/.github/workflows/tests.yml +0 -0
  31. {datachain-0.5.0 → datachain-0.5.1}/.github/workflows/update-template.yaml +0 -0
  32. {datachain-0.5.0 → datachain-0.5.1}/.gitignore +0 -0
  33. {datachain-0.5.0 → datachain-0.5.1}/CODE_OF_CONDUCT.rst +0 -0
  34. {datachain-0.5.0 → datachain-0.5.1}/CONTRIBUTING.rst +0 -0
  35. {datachain-0.5.0 → datachain-0.5.1}/LICENSE +0 -0
  36. {datachain-0.5.0 → datachain-0.5.1}/README.rst +0 -0
  37. {datachain-0.5.0 → datachain-0.5.1}/docs/assets/captioned_cartoons.png +0 -0
  38. {datachain-0.5.0 → datachain-0.5.1}/docs/assets/datachain-white.svg +0 -0
  39. {datachain-0.5.0 → datachain-0.5.1}/docs/assets/datachain.svg +0 -0
  40. {datachain-0.5.0 → datachain-0.5.1}/docs/assets/flowchart.png +0 -0
  41. {datachain-0.5.0 → datachain-0.5.1}/docs/index.md +0 -0
  42. {datachain-0.5.0 → datachain-0.5.1}/docs/references/datachain.md +0 -0
  43. {datachain-0.5.0 → datachain-0.5.1}/docs/references/datatype.md +0 -0
  44. {datachain-0.5.0 → datachain-0.5.1}/docs/references/file.md +0 -0
  45. {datachain-0.5.0 → datachain-0.5.1}/docs/references/index.md +0 -0
  46. {datachain-0.5.0 → datachain-0.5.1}/docs/references/sql.md +0 -0
  47. {datachain-0.5.0 → datachain-0.5.1}/docs/references/torch.md +0 -0
  48. {datachain-0.5.0 → datachain-0.5.1}/docs/references/udf.md +0 -0
  49. {datachain-0.5.0 → datachain-0.5.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  50. {datachain-0.5.0 → datachain-0.5.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  51. {datachain-0.5.0 → datachain-0.5.1}/examples/computer_vision/openimage-detect.py +0 -0
  52. {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/common_sql_functions.py +0 -0
  53. {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/json-csv-reader.py +0 -0
  54. {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/torch-loader.py +0 -0
  55. {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/udfs/parallel.py +0 -0
  56. {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/udfs/simple.py +0 -0
  57. {datachain-0.5.0 → datachain-0.5.1}/examples/get_started/udfs/stateful.py +0 -0
  58. {datachain-0.5.0 → datachain-0.5.1}/examples/llm_and_nlp/claude-query.py +0 -0
  59. {datachain-0.5.0 → datachain-0.5.1}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  60. {datachain-0.5.0 → datachain-0.5.1}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  61. {datachain-0.5.0 → datachain-0.5.1}/examples/multimodal/clip_inference.py +0 -0
  62. {datachain-0.5.0 → datachain-0.5.1}/examples/multimodal/hf_pipeline.py +0 -0
  63. {datachain-0.5.0 → datachain-0.5.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  64. {datachain-0.5.0 → datachain-0.5.1}/examples/multimodal/wds.py +0 -0
  65. {datachain-0.5.0 → datachain-0.5.1}/examples/multimodal/wds_filtered.py +0 -0
  66. {datachain-0.5.0 → datachain-0.5.1}/mkdocs.yml +0 -0
  67. {datachain-0.5.0 → datachain-0.5.1}/noxfile.py +0 -0
  68. {datachain-0.5.0 → datachain-0.5.1}/overrides/main.html +0 -0
  69. {datachain-0.5.0 → datachain-0.5.1}/pyproject.toml +0 -0
  70. {datachain-0.5.0 → datachain-0.5.1}/setup.cfg +0 -0
  71. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/__init__.py +0 -0
  72. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/__main__.py +0 -0
  73. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/asyn.py +0 -0
  74. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/cache.py +0 -0
  75. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/catalog/__init__.py +0 -0
  76. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/catalog/datasource.py +0 -0
  77. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/catalog/loader.py +0 -0
  78. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/cli.py +0 -0
  79. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/cli_utils.py +0 -0
  80. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/__init__.py +0 -0
  81. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/azure.py +0 -0
  82. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/fileslice.py +0 -0
  83. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/fsspec.py +0 -0
  84. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/gcs.py +0 -0
  85. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/hf.py +0 -0
  86. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/local.py +0 -0
  87. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/client/s3.py +0 -0
  88. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/config.py +0 -0
  89. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/__init__.py +0 -0
  90. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/db_engine.py +0 -0
  91. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/id_generator.py +0 -0
  92. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/job.py +0 -0
  93. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/schema.py +0 -0
  94. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/serializer.py +0 -0
  95. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/data_storage/warehouse.py +0 -0
  96. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/dataset.py +0 -0
  97. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/error.py +0 -0
  98. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/job.py +0 -0
  99. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/__init__.py +0 -0
  100. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/clip.py +0 -0
  101. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/__init__.py +0 -0
  102. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/flatten.py +0 -0
  103. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  104. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
  105. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/unflatten.py +0 -0
  106. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  107. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/data_model.py +0 -0
  108. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/dataset_info.py +0 -0
  109. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/file.py +0 -0
  110. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/hf.py +0 -0
  111. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/image.py +0 -0
  112. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/listing.py +0 -0
  113. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/listing_info.py +0 -0
  114. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/meta_formats.py +0 -0
  115. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/model_store.py +0 -0
  116. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/pytorch.py +0 -0
  117. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/settings.py +0 -0
  118. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/signal_schema.py +0 -0
  119. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/tar.py +0 -0
  120. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/text.py +0 -0
  121. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/utils.py +0 -0
  122. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/vfile.py +0 -0
  123. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/webdataset.py +0 -0
  124. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/lib/webdataset_laion.py +0 -0
  125. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/listing.py +0 -0
  126. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/node.py +0 -0
  127. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/nodes_fetcher.py +0 -0
  128. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/nodes_thread_pool.py +0 -0
  129. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/progress.py +0 -0
  130. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/py.typed +0 -0
  131. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/__init__.py +0 -0
  132. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/batch.py +0 -0
  133. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/metrics.py +0 -0
  134. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/params.py +0 -0
  135. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/queue.py +0 -0
  136. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/query/schema.py +0 -0
  137. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/remote/__init__.py +0 -0
  138. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/remote/studio.py +0 -0
  139. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/__init__.py +0 -0
  140. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/default/__init__.py +0 -0
  141. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/default/base.py +0 -0
  142. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/__init__.py +0 -0
  143. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/array.py +0 -0
  144. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/conditional.py +0 -0
  145. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/path.py +0 -0
  146. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/random.py +0 -0
  147. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/functions/string.py +0 -0
  148. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/selectable.py +0 -0
  149. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  150. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/sqlite/base.py +0 -0
  151. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/sqlite/types.py +0 -0
  152. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/sqlite/vector.py +0 -0
  153. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/types.py +0 -0
  154. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/sql/utils.py +0 -0
  155. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/storage.py +0 -0
  156. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/telemetry.py +0 -0
  157. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/torch/__init__.py +0 -0
  158. {datachain-0.5.0 → datachain-0.5.1}/src/datachain/utils.py +0 -0
  159. {datachain-0.5.0 → datachain-0.5.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  160. {datachain-0.5.0 → datachain-0.5.1}/src/datachain.egg-info/entry_points.txt +0 -0
  161. {datachain-0.5.0 → datachain-0.5.1}/src/datachain.egg-info/requires.txt +0 -0
  162. {datachain-0.5.0 → datachain-0.5.1}/src/datachain.egg-info/top_level.txt +0 -0
  163. {datachain-0.5.0 → datachain-0.5.1}/tests/__init__.py +0 -0
  164. {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/__init__.py +0 -0
  165. {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/conftest.py +0 -0
  166. {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  167. {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  168. {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/datasets/.gitignore +0 -0
  169. {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  170. {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/test_datachain.py +0 -0
  171. {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/test_ls.py +0 -0
  172. {datachain-0.5.0 → datachain-0.5.1}/tests/benchmarks/test_version.py +0 -0
  173. {datachain-0.5.0 → datachain-0.5.1}/tests/conftest.py +0 -0
  174. {datachain-0.5.0 → datachain-0.5.1}/tests/data.py +0 -0
  175. {datachain-0.5.0 → datachain-0.5.1}/tests/examples/__init__.py +0 -0
  176. {datachain-0.5.0 → datachain-0.5.1}/tests/examples/test_examples.py +0 -0
  177. {datachain-0.5.0 → datachain-0.5.1}/tests/examples/test_wds_e2e.py +0 -0
  178. {datachain-0.5.0 → datachain-0.5.1}/tests/examples/wds_data.py +0 -0
  179. {datachain-0.5.0 → datachain-0.5.1}/tests/func/__init__.py +0 -0
  180. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_catalog.py +0 -0
  181. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_client.py +0 -0
  182. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_datachain.py +0 -0
  183. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_dataset_query.py +0 -0
  184. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_datasets.py +0 -0
  185. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_feature_pickling.py +0 -0
  186. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_listing.py +0 -0
  187. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_ls.py +0 -0
  188. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_meta_formats.py +0 -0
  189. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_metrics.py +0 -0
  190. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_pull.py +0 -0
  191. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_pytorch.py +0 -0
  192. {datachain-0.5.0 → datachain-0.5.1}/tests/func/test_query.py +0 -0
  193. {datachain-0.5.0 → datachain-0.5.1}/tests/scripts/feature_class.py +0 -0
  194. {datachain-0.5.0 → datachain-0.5.1}/tests/scripts/feature_class_parallel.py +0 -0
  195. {datachain-0.5.0 → datachain-0.5.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  196. {datachain-0.5.0 → datachain-0.5.1}/tests/scripts/name_len_slow.py +0 -0
  197. {datachain-0.5.0 → datachain-0.5.1}/tests/test_cli_e2e.py +0 -0
  198. {datachain-0.5.0 → datachain-0.5.1}/tests/test_query_e2e.py +0 -0
  199. {datachain-0.5.0 → datachain-0.5.1}/tests/test_telemetry.py +0 -0
  200. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/__init__.py +0 -0
  201. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/__init__.py +0 -0
  202. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/conftest.py +0 -0
  203. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_arrow.py +0 -0
  204. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_clip.py +0 -0
  205. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  206. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  207. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_feature.py +0 -0
  208. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_feature_utils.py +0 -0
  209. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_file.py +0 -0
  210. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_hf.py +0 -0
  211. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_image.py +0 -0
  212. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_schema.py +0 -0
  213. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_signal_schema.py +0 -0
  214. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_sql_to_python.py +0 -0
  215. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_text.py +0 -0
  216. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_udf_signature.py +0 -0
  217. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_utils.py +0 -0
  218. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/lib/test_webdataset.py +0 -0
  219. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/__init__.py +0 -0
  220. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  221. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  222. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_array.py +0 -0
  223. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_conditional.py +0 -0
  224. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_path.py +0 -0
  225. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_random.py +0 -0
  226. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_selectable.py +0 -0
  227. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/sql/test_string.py +0 -0
  228. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_asyn.py +0 -0
  229. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_cache.py +0 -0
  230. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_catalog.py +0 -0
  231. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_catalog_loader.py +0 -0
  232. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_cli_parsing.py +0 -0
  233. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_client.py +0 -0
  234. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_client_s3.py +0 -0
  235. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_data_storage.py +0 -0
  236. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_database_engine.py +0 -0
  237. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_dataset.py +0 -0
  238. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_dispatch.py +0 -0
  239. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_fileslice.py +0 -0
  240. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_listing.py +0 -0
  241. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_metastore.py +0 -0
  242. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_module_exports.py +0 -0
  243. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_query.py +0 -0
  244. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_query_metrics.py +0 -0
  245. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_query_params.py +0 -0
  246. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_serializer.py +0 -0
  247. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_session.py +0 -0
  248. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_storage.py +0 -0
  249. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_utils.py +0 -0
  250. {datachain-0.5.0 → datachain-0.5.1}/tests/unit/test_warehouse.py +0 -0
  251. {datachain-0.5.0 → datachain-0.5.1}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.6.7'
27
+ rev: 'v0.6.8'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -988,6 +988,14 @@ class Catalog:
988
988
  schema = {
989
989
  c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
990
990
  }
991
+
992
+ job_id = job_id or os.getenv("DATACHAIN_JOB_ID")
993
+ if not job_id:
994
+ from datachain.query.session import Session
995
+
996
+ session = Session.get(catalog=self)
997
+ job_id = session.job_id
998
+
991
999
  dataset = self.metastore.create_dataset_version(
992
1000
  dataset,
993
1001
  version,
@@ -50,7 +50,6 @@ if TYPE_CHECKING:
50
50
  from datachain.data_storage import AbstractIDGenerator, schema
51
51
  from datachain.data_storage.db_engine import DatabaseEngine
52
52
 
53
-
54
53
  logger = logging.getLogger("datachain")
55
54
 
56
55
 
@@ -384,6 +383,11 @@ class AbstractMetastore(ABC, Serializable):
384
383
  ) -> None:
385
384
  """Set the status of the given job and dataset."""
386
385
 
386
+ @abstractmethod
387
+ def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
388
+ """Returns dataset names and versions for the job."""
389
+ raise NotImplementedError
390
+
387
391
 
388
392
  class AbstractDBMetastore(AbstractMetastore):
389
393
  """
@@ -1519,3 +1523,18 @@ class AbstractDBMetastore(AbstractMetastore):
1519
1523
  .values(status=dataset_status)
1520
1524
  )
1521
1525
  self.db.execute(query, conn=conn) # type: ignore[attr-defined]
1526
+
1527
+ def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
1528
+ """Returns dataset names and versions for the job."""
1529
+ dv = self._datasets_versions
1530
+ ds = self._datasets
1531
+
1532
+ join_condition = dv.c.dataset_id == ds.c.id
1533
+
1534
+ query = (
1535
+ self._datasets_versions_select(ds.c.name, dv.c.version)
1536
+ .select_from(dv.join(ds, join_condition))
1537
+ .where(dv.c.job_id == job_id)
1538
+ )
1539
+
1540
+ return list(self.db.execute(query))
@@ -15,6 +15,7 @@ from typing import (
15
15
  )
16
16
 
17
17
  import sqlalchemy
18
+ from packaging import version
18
19
  from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
19
20
  from sqlalchemy.dialects import sqlite
20
21
  from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
@@ -153,7 +154,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
153
154
  if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
154
155
  import sys
155
156
 
156
- db.set_trace_callback(sys.stderr.write)
157
+ db.set_trace_callback(lambda stmt: print(stmt, file=sys.stderr))
157
158
 
158
159
  load_usearch_extension(db)
159
160
 
@@ -345,45 +346,36 @@ class SQLiteIDGenerator(AbstractDBIDGenerator):
345
346
  def get_next_ids(self, uri: str, count: int) -> range:
346
347
  """Returns a range of IDs for the given URI."""
347
348
 
348
- # NOTE: we can't use RETURNING clause here because it is only available
349
- # in sqlalchemy v2, see
350
- # https://github.com/sqlalchemy/sqlalchemy/issues/6195#issuecomment-1248700677
351
- # After we upgrade to sqlalchemy v2, we can use the following code,
352
- # leaving fallback to the current implementation for older versions of SQLite,
353
- # which is still supported, for example, in Ubuntu 20.04 LTS (Focal Fossa),
354
- # where SQLite version 3.31.1 is used.
355
-
356
- # sqlite_version = version.parse(sqlite3.sqlite_version)
357
- # if sqlite_version >= version.parse("3.35.0"):
358
- # # RETURNING is supported on SQLite 3.35.0 (2021-03-12) or newer
359
- # stmt = (
360
- # sqlite.insert(self._table)
361
- # .values(uri=uri, last_id=count)
362
- # .on_conflict_do_update(
363
- # index_elements=["uri"],
364
- # set_={"last_id": self._table.c.last_id + count},
365
- # )
366
- # .returning(self._table.c.last_id)
367
- # )
368
- # last_id = self._db.execute(stmt).fetchone()[0]
369
- # else:
370
- # (fallback to the current implementation with a transaction)
371
-
372
- # Transactions ensure no concurrency conflicts
373
- with self._db.transaction() as conn:
374
- # UPSERT syntax was added to SQLite with version 3.24.0 (2018-06-04).
375
- stmt_ins = (
349
+ sqlite_version = version.parse(sqlite3.sqlite_version)
350
+ is_returning_supported = sqlite_version >= version.parse("3.35.0")
351
+ if is_returning_supported:
352
+ stmt = (
376
353
  sqlite.insert(self._table)
377
354
  .values(uri=uri, last_id=count)
378
355
  .on_conflict_do_update(
379
356
  index_elements=["uri"],
380
357
  set_={"last_id": self._table.c.last_id + count},
381
358
  )
359
+ .returning(self._table.c.last_id)
382
360
  )
383
- self._db.execute(stmt_ins, conn=conn)
361
+ last_id = self._db.execute(stmt).fetchone()[0]
362
+ else:
363
+ # Older versions of SQLite are still the default under Ubuntu LTS,
364
+ # e.g. Ubuntu 20.04 LTS (Focal Fossa) uses 3.31.1
365
+ # Transactions ensure no concurrency conflicts
366
+ with self._db.transaction() as conn:
367
+ stmt_ins = (
368
+ sqlite.insert(self._table)
369
+ .values(uri=uri, last_id=count)
370
+ .on_conflict_do_update(
371
+ index_elements=["uri"],
372
+ set_={"last_id": self._table.c.last_id + count},
373
+ )
374
+ )
375
+ self._db.execute(stmt_ins, conn=conn)
384
376
 
385
- stmt_sel = select(self._table.c.last_id).where(self._table.c.uri == uri)
386
- last_id = self._db.execute(stmt_sel, conn=conn).fetchone()[0]
377
+ stmt_sel = select(self._table.c.last_id).where(self._table.c.uri == uri)
378
+ last_id = self._db.execute(stmt_sel, conn=conn).fetchone()[0]
387
379
 
388
380
  return range(last_id - count + 1, last_id + 1)
389
381
 
@@ -1,8 +1,9 @@
1
1
  import re
2
2
  from collections.abc import Sequence
3
3
  from tempfile import NamedTemporaryFile
4
- from typing import TYPE_CHECKING, Optional
4
+ from typing import TYPE_CHECKING, Any, Optional
5
5
 
6
+ import orjson
6
7
  import pyarrow as pa
7
8
  from pyarrow.dataset import CsvFileFormat, dataset
8
9
  from tqdm import tqdm
@@ -10,6 +11,7 @@ from tqdm import tqdm
10
11
  from datachain.lib.data_model import dict_to_data_model
11
12
  from datachain.lib.file import ArrowRow, File
12
13
  from datachain.lib.model_store import ModelStore
14
+ from datachain.lib.signal_schema import SignalSchema
13
15
  from datachain.lib.udf import Generator
14
16
 
15
17
  if TYPE_CHECKING:
@@ -20,6 +22,9 @@ if TYPE_CHECKING:
20
22
  from datachain.lib.dc import DataChain
21
23
 
22
24
 
25
+ DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"
26
+
27
+
23
28
  class ArrowGenerator(Generator):
24
29
  def __init__(
25
30
  self,
@@ -61,28 +66,35 @@ class ArrowGenerator(Generator):
61
66
  path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
62
67
  )
63
68
  hf_schema = _get_hf_schema(ds.schema)
69
+ use_datachain_schema = (
70
+ bool(ds.schema.metadata)
71
+ and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in ds.schema.metadata
72
+ )
64
73
  index = 0
65
74
  with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
66
75
  for record_batch in ds.to_batches():
67
76
  for record in record_batch.to_pylist():
68
- vals = list(record.values())
69
- if self.output_schema:
70
- fields = self.output_schema.model_fields
71
- vals_dict = {}
72
- for i, ((field, field_info), val) in enumerate(
73
- zip(fields.items(), vals)
74
- ):
75
- anno = field_info.annotation
76
- if hf_schema:
77
- from datachain.lib.hf import convert_feature
78
-
79
- feat = list(hf_schema[0].values())[i]
80
- vals_dict[field] = convert_feature(val, feat, anno)
81
- elif ModelStore.is_pydantic(anno):
82
- vals_dict[field] = anno(**val) # type: ignore[misc]
83
- else:
84
- vals_dict[field] = val
85
- vals = [self.output_schema(**vals_dict)]
77
+ if use_datachain_schema and self.output_schema:
78
+ vals = [_nested_model_instantiate(record, self.output_schema)]
79
+ else:
80
+ vals = list(record.values())
81
+ if self.output_schema:
82
+ fields = self.output_schema.model_fields
83
+ vals_dict = {}
84
+ for i, ((field, field_info), val) in enumerate(
85
+ zip(fields.items(), vals)
86
+ ):
87
+ anno = field_info.annotation
88
+ if hf_schema:
89
+ from datachain.lib.hf import convert_feature
90
+
91
+ feat = list(hf_schema[0].values())[i]
92
+ vals_dict[field] = convert_feature(val, feat, anno)
93
+ elif ModelStore.is_pydantic(anno):
94
+ vals_dict[field] = anno(**val) # type: ignore[misc]
95
+ else:
96
+ vals_dict[field] = val
97
+ vals = [self.output_schema(**vals_dict)]
86
98
  if self.source:
87
99
  kwargs: dict = self.kwargs
88
100
  # Can't serialize CsvFileFormat; may lose formatting options.
@@ -113,6 +125,9 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
113
125
  )
114
126
  if not col_names:
115
127
  col_names = schema.names
128
+ signal_schema = _get_datachain_schema(schema)
129
+ if signal_schema:
130
+ return signal_schema.values
116
131
  columns = _convert_col_names(col_names) # type: ignore[arg-type]
117
132
  hf_schema = _get_hf_schema(schema)
118
133
  if hf_schema:
@@ -197,3 +212,33 @@ def _get_hf_schema(
197
212
  features = schema_from_arrow(schema)
198
213
  return features, get_output_schema(features)
199
214
  return None
215
+
216
+
217
+ def _get_datachain_schema(schema: "pa.Schema") -> Optional[SignalSchema]:
218
+ """Return a restored SignalSchema from parquet metadata, if any is found."""
219
+ if schema.metadata and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in schema.metadata:
220
+ serialized_signal_schema = orjson.loads(
221
+ schema.metadata[DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY]
222
+ )
223
+ return SignalSchema.deserialize(serialized_signal_schema)
224
+ return None
225
+
226
+
227
+ def _nested_model_instantiate(
228
+ column_values: dict[str, Any], model: type["BaseModel"], prefix: str = ""
229
+ ) -> "BaseModel":
230
+ """Instantiate the given model and all sub-models/fields based on the provided
231
+ column values."""
232
+ vals_dict = {}
233
+ for field, field_info in model.model_fields.items():
234
+ anno = field_info.annotation
235
+ cur_path = f"{prefix}.{field}" if prefix else field
236
+ if ModelStore.is_pydantic(anno):
237
+ vals_dict[field] = _nested_model_instantiate(
238
+ column_values,
239
+ anno, # type: ignore[arg-type]
240
+ prefix=cur_path,
241
+ )
242
+ elif cur_path in column_values:
243
+ vals_dict[field] = column_values[cur_path]
244
+ return model(**vals_dict)
@@ -16,6 +16,7 @@ from typing import (
16
16
  overload,
17
17
  )
18
18
 
19
+ import orjson
19
20
  import pandas as pd
20
21
  import sqlalchemy
21
22
  from pydantic import BaseModel
@@ -58,7 +59,7 @@ from datachain.query.dataset import (
58
59
  from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
59
60
  from datachain.sql.functions import path as pathfunc
60
61
  from datachain.telemetry import telemetry
61
- from datachain.utils import inside_notebook
62
+ from datachain.utils import batched_it, inside_notebook
62
63
 
63
64
  if TYPE_CHECKING:
64
65
  from typing_extensions import Concatenate, ParamSpec, Self
@@ -71,6 +72,10 @@ C = Column
71
72
 
72
73
  _T = TypeVar("_T")
73
74
  D = TypeVar("D", bound="DataChain")
75
+ UDFObjT = TypeVar("UDFObjT", bound=UDFBase)
76
+
77
+
78
+ DEFAULT_PARQUET_CHUNK_SIZE = 100_000
74
79
 
75
80
 
76
81
  def resolve_columns(
@@ -819,7 +824,7 @@ class DataChain:
819
824
 
820
825
  def gen(
821
826
  self,
822
- func: Optional[Callable] = None,
827
+ func: Optional[Union[Callable, Generator]] = None,
823
828
  params: Union[None, str, Sequence[str]] = None,
824
829
  output: OutputType = None,
825
830
  **signal_map,
@@ -931,12 +936,12 @@ class DataChain:
931
936
 
932
937
  def _udf_to_obj(
933
938
  self,
934
- target_class: type[UDFBase],
935
- func: Optional[Callable],
939
+ target_class: type[UDFObjT],
940
+ func: Optional[Union[Callable, UDFObjT]],
936
941
  params: Union[None, str, Sequence[str]],
937
942
  output: OutputType,
938
943
  signal_map,
939
- ) -> UDFBase:
944
+ ) -> UDFObjT:
940
945
  is_generator = target_class.is_output_batched
941
946
  name = self.name or ""
942
947
 
@@ -1103,6 +1108,29 @@ class DataChain:
1103
1108
  rows = (row_factory(db_signals, r) for r in rows)
1104
1109
  yield from rows
1105
1110
 
1111
+ def to_columnar_data_with_names(
1112
+ self, chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE
1113
+ ) -> tuple[list[str], Iterator[list[list[Any]]]]:
1114
+ """Returns column names and the results as an iterator that provides chunks,
1115
+ with each chunk containing a list of columns, where each column contains a
1116
+ list of the row values for that column in that chunk. Useful for columnar data
1117
+ formats, such as parquet or other OLAP databases.
1118
+ """
1119
+ headers, _ = self._effective_signals_schema.get_headers_with_length()
1120
+ column_names = [".".join(filter(None, header)) for header in headers]
1121
+
1122
+ results_iter = self.collect_flatten()
1123
+
1124
+ def column_chunks() -> Iterator[list[list[Any]]]:
1125
+ for chunk_iter in batched_it(results_iter, chunk_size):
1126
+ columns: list[list[Any]] = [[] for _ in column_names]
1127
+ for row in chunk_iter:
1128
+ for i, col in enumerate(columns):
1129
+ col.append(row[i])
1130
+ yield columns
1131
+
1132
+ return column_names, column_chunks()
1133
+
1106
1134
  @overload
1107
1135
  def results(self) -> list[tuple[Any, ...]]: ...
1108
1136
 
@@ -1808,21 +1836,96 @@ class DataChain:
1808
1836
  self,
1809
1837
  path: Union[str, os.PathLike[str], BinaryIO],
1810
1838
  partition_cols: Optional[Sequence[str]] = None,
1839
+ chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
1811
1840
  **kwargs,
1812
1841
  ) -> None:
1813
- """Save chain to parquet file.
1842
+ """Save chain to parquet file with SignalSchema metadata.
1814
1843
 
1815
1844
  Parameters:
1816
1845
  path : Path or a file-like binary object to save the file.
1817
1846
  partition_cols : Column names by which to partition the dataset.
1847
+ chunk_size : The chunk size of results to read and convert to columnar
1848
+ data, to avoid running out of memory.
1818
1849
  """
1850
+ import pyarrow as pa
1851
+ import pyarrow.parquet as pq
1852
+
1853
+ from datachain.lib.arrow import DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY
1854
+
1819
1855
  _partition_cols = list(partition_cols) if partition_cols else None
1820
- return self.to_pandas().to_parquet(
1821
- path,
1822
- partition_cols=_partition_cols,
1823
- **kwargs,
1856
+ signal_schema_metadata = orjson.dumps(
1857
+ self._effective_signals_schema.serialize()
1824
1858
  )
1825
1859
 
1860
+ column_names, column_chunks = self.to_columnar_data_with_names(chunk_size)
1861
+
1862
+ parquet_schema = None
1863
+ parquet_writer = None
1864
+ first_chunk = True
1865
+
1866
+ for chunk in column_chunks:
1867
+ # pyarrow infers the best parquet schema from the python types of
1868
+ # the input data.
1869
+ table = pa.Table.from_pydict(
1870
+ dict(zip(column_names, chunk)),
1871
+ schema=parquet_schema,
1872
+ )
1873
+
1874
+ # Preserve any existing metadata, and add the DataChain SignalSchema.
1875
+ existing_metadata = table.schema.metadata or {}
1876
+ merged_metadata = {
1877
+ **existing_metadata,
1878
+ DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY: signal_schema_metadata,
1879
+ }
1880
+ table = table.replace_schema_metadata(merged_metadata)
1881
+ parquet_schema = table.schema
1882
+
1883
+ if _partition_cols:
1884
+ # Write to a partitioned parquet dataset.
1885
+ pq.write_to_dataset(
1886
+ table,
1887
+ root_path=path,
1888
+ partition_cols=_partition_cols,
1889
+ **kwargs,
1890
+ )
1891
+ else:
1892
+ if first_chunk:
1893
+ # Write to a single parquet file.
1894
+ parquet_writer = pq.ParquetWriter(path, parquet_schema, **kwargs)
1895
+ first_chunk = False
1896
+
1897
+ assert parquet_writer
1898
+ parquet_writer.write_table(table)
1899
+
1900
+ if parquet_writer:
1901
+ parquet_writer.close()
1902
+
1903
+ def to_csv(
1904
+ self,
1905
+ path: Union[str, os.PathLike[str]],
1906
+ delimiter: str = ",",
1907
+ **kwargs,
1908
+ ) -> None:
1909
+ """Save chain to a csv (comma-separated values) file.
1910
+
1911
+ Parameters:
1912
+ path : Path to save the file.
1913
+ delimiter : Delimiter to use for the resulting file.
1914
+ """
1915
+ import csv
1916
+
1917
+ headers, _ = self._effective_signals_schema.get_headers_with_length()
1918
+ column_names = [".".join(filter(None, header)) for header in headers]
1919
+
1920
+ results_iter = self.collect_flatten()
1921
+
1922
+ with open(path, "w", newline="") as f:
1923
+ writer = csv.writer(f, delimiter=delimiter, **kwargs)
1924
+ writer.writerow(column_names)
1925
+
1926
+ for row in results_iter:
1927
+ writer.writerow(row)
1928
+
1826
1929
  @classmethod
1827
1930
  def from_records(
1828
1931
  cls,