datachain 0.5.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

Files changed (252)
  1. {datachain-0.5.0 → datachain-0.6.0}/.pre-commit-config.yaml +2 -2
  2. {datachain-0.5.0/src/datachain.egg-info → datachain-0.6.0}/PKG-INFO +1 -1
  3. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/catalog/catalog.py +8 -0
  4. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/data_storage/metastore.py +20 -1
  5. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/data_storage/sqlite.py +24 -32
  6. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/arrow.py +64 -19
  7. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/convert/values_to_tuples.py +2 -2
  8. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/data_model.py +1 -1
  9. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/dc.py +131 -12
  10. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/signal_schema.py +6 -6
  11. datachain-0.6.0/src/datachain/lib/udf.py +406 -0
  12. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/udf_signature.py +8 -6
  13. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/query/batch.py +0 -10
  14. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/query/dataset.py +7 -7
  15. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/query/dispatch.py +2 -14
  16. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/query/session.py +42 -0
  17. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/functions/string.py +12 -0
  18. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/sqlite/base.py +10 -5
  19. {datachain-0.5.0 → datachain-0.6.0/src/datachain.egg-info}/PKG-INFO +1 -1
  20. {datachain-0.5.0 → datachain-0.6.0}/src/datachain.egg-info/SOURCES.txt +2 -1
  21. datachain-0.6.0/tests/scripts/feature_class_exception.py +11 -0
  22. datachain-0.6.0/tests/test_atomicity.py +58 -0
  23. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_datachain.py +180 -0
  24. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/sql/test_string.py +15 -0
  25. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_id_generator.py +18 -0
  26. datachain-0.5.0/src/datachain/lib/udf.py +0 -358
  27. datachain-0.5.0/src/datachain/query/udf.py +0 -126
  28. {datachain-0.5.0 → datachain-0.6.0}/.cruft.json +0 -0
  29. {datachain-0.5.0 → datachain-0.6.0}/.gitattributes +0 -0
  30. {datachain-0.5.0 → datachain-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  31. {datachain-0.5.0 → datachain-0.6.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  32. {datachain-0.5.0 → datachain-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  33. {datachain-0.5.0 → datachain-0.6.0}/.github/codecov.yaml +0 -0
  34. {datachain-0.5.0 → datachain-0.6.0}/.github/dependabot.yml +0 -0
  35. {datachain-0.5.0 → datachain-0.6.0}/.github/workflows/benchmarks.yml +0 -0
  36. {datachain-0.5.0 → datachain-0.6.0}/.github/workflows/release.yml +0 -0
  37. {datachain-0.5.0 → datachain-0.6.0}/.github/workflows/tests-studio.yml +0 -0
  38. {datachain-0.5.0 → datachain-0.6.0}/.github/workflows/tests.yml +0 -0
  39. {datachain-0.5.0 → datachain-0.6.0}/.github/workflows/update-template.yaml +0 -0
  40. {datachain-0.5.0 → datachain-0.6.0}/.gitignore +0 -0
  41. {datachain-0.5.0 → datachain-0.6.0}/CODE_OF_CONDUCT.rst +0 -0
  42. {datachain-0.5.0 → datachain-0.6.0}/CONTRIBUTING.rst +0 -0
  43. {datachain-0.5.0 → datachain-0.6.0}/LICENSE +0 -0
  44. {datachain-0.5.0 → datachain-0.6.0}/README.rst +0 -0
  45. {datachain-0.5.0 → datachain-0.6.0}/docs/assets/captioned_cartoons.png +0 -0
  46. {datachain-0.5.0 → datachain-0.6.0}/docs/assets/datachain-white.svg +0 -0
  47. {datachain-0.5.0 → datachain-0.6.0}/docs/assets/datachain.svg +0 -0
  48. {datachain-0.5.0 → datachain-0.6.0}/docs/assets/flowchart.png +0 -0
  49. {datachain-0.5.0 → datachain-0.6.0}/docs/index.md +0 -0
  50. {datachain-0.5.0 → datachain-0.6.0}/docs/references/datachain.md +0 -0
  51. {datachain-0.5.0 → datachain-0.6.0}/docs/references/datatype.md +0 -0
  52. {datachain-0.5.0 → datachain-0.6.0}/docs/references/file.md +0 -0
  53. {datachain-0.5.0 → datachain-0.6.0}/docs/references/index.md +0 -0
  54. {datachain-0.5.0 → datachain-0.6.0}/docs/references/sql.md +0 -0
  55. {datachain-0.5.0 → datachain-0.6.0}/docs/references/torch.md +0 -0
  56. {datachain-0.5.0 → datachain-0.6.0}/docs/references/udf.md +0 -0
  57. {datachain-0.5.0 → datachain-0.6.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  58. {datachain-0.5.0 → datachain-0.6.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  59. {datachain-0.5.0 → datachain-0.6.0}/examples/computer_vision/openimage-detect.py +0 -0
  60. {datachain-0.5.0 → datachain-0.6.0}/examples/get_started/common_sql_functions.py +0 -0
  61. {datachain-0.5.0 → datachain-0.6.0}/examples/get_started/json-csv-reader.py +0 -0
  62. {datachain-0.5.0 → datachain-0.6.0}/examples/get_started/torch-loader.py +0 -0
  63. {datachain-0.5.0 → datachain-0.6.0}/examples/get_started/udfs/parallel.py +0 -0
  64. {datachain-0.5.0 → datachain-0.6.0}/examples/get_started/udfs/simple.py +0 -0
  65. {datachain-0.5.0 → datachain-0.6.0}/examples/get_started/udfs/stateful.py +0 -0
  66. {datachain-0.5.0 → datachain-0.6.0}/examples/llm_and_nlp/claude-query.py +0 -0
  67. {datachain-0.5.0 → datachain-0.6.0}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  68. {datachain-0.5.0 → datachain-0.6.0}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  69. {datachain-0.5.0 → datachain-0.6.0}/examples/multimodal/clip_inference.py +0 -0
  70. {datachain-0.5.0 → datachain-0.6.0}/examples/multimodal/hf_pipeline.py +0 -0
  71. {datachain-0.5.0 → datachain-0.6.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  72. {datachain-0.5.0 → datachain-0.6.0}/examples/multimodal/wds.py +0 -0
  73. {datachain-0.5.0 → datachain-0.6.0}/examples/multimodal/wds_filtered.py +0 -0
  74. {datachain-0.5.0 → datachain-0.6.0}/mkdocs.yml +0 -0
  75. {datachain-0.5.0 → datachain-0.6.0}/noxfile.py +0 -0
  76. {datachain-0.5.0 → datachain-0.6.0}/overrides/main.html +0 -0
  77. {datachain-0.5.0 → datachain-0.6.0}/pyproject.toml +0 -0
  78. {datachain-0.5.0 → datachain-0.6.0}/setup.cfg +0 -0
  79. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/__init__.py +0 -0
  80. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/__main__.py +0 -0
  81. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/asyn.py +0 -0
  82. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/cache.py +0 -0
  83. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/catalog/__init__.py +0 -0
  84. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/catalog/datasource.py +0 -0
  85. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/catalog/loader.py +0 -0
  86. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/cli.py +0 -0
  87. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/cli_utils.py +0 -0
  88. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/client/__init__.py +0 -0
  89. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/client/azure.py +0 -0
  90. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/client/fileslice.py +0 -0
  91. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/client/fsspec.py +0 -0
  92. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/client/gcs.py +0 -0
  93. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/client/hf.py +0 -0
  94. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/client/local.py +0 -0
  95. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/client/s3.py +0 -0
  96. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/config.py +0 -0
  97. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/data_storage/__init__.py +0 -0
  98. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/data_storage/db_engine.py +0 -0
  99. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/data_storage/id_generator.py +0 -0
  100. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/data_storage/job.py +0 -0
  101. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/data_storage/schema.py +0 -0
  102. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/data_storage/serializer.py +0 -0
  103. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/data_storage/warehouse.py +0 -0
  104. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/dataset.py +0 -0
  105. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/error.py +0 -0
  106. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/job.py +0 -0
  107. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/__init__.py +0 -0
  108. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/clip.py +0 -0
  109. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/convert/__init__.py +0 -0
  110. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/convert/flatten.py +0 -0
  111. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  112. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  113. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/convert/unflatten.py +0 -0
  114. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/dataset_info.py +0 -0
  115. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/file.py +0 -0
  116. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/hf.py +0 -0
  117. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/image.py +0 -0
  118. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/listing.py +0 -0
  119. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/listing_info.py +0 -0
  120. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/meta_formats.py +0 -0
  121. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/model_store.py +0 -0
  122. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/pytorch.py +0 -0
  123. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/settings.py +0 -0
  124. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/tar.py +0 -0
  125. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/text.py +0 -0
  126. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/utils.py +0 -0
  127. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/vfile.py +0 -0
  128. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/webdataset.py +0 -0
  129. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/lib/webdataset_laion.py +0 -0
  130. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/listing.py +0 -0
  131. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/node.py +0 -0
  132. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/nodes_fetcher.py +0 -0
  133. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/nodes_thread_pool.py +0 -0
  134. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/progress.py +0 -0
  135. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/py.typed +0 -0
  136. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/query/__init__.py +0 -0
  137. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/query/metrics.py +0 -0
  138. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/query/params.py +0 -0
  139. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/query/queue.py +0 -0
  140. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/query/schema.py +0 -0
  141. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/remote/__init__.py +0 -0
  142. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/remote/studio.py +0 -0
  143. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/__init__.py +0 -0
  144. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/default/__init__.py +0 -0
  145. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/default/base.py +0 -0
  146. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/functions/__init__.py +0 -0
  147. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/functions/array.py +0 -0
  148. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/functions/conditional.py +0 -0
  149. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/functions/path.py +0 -0
  150. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/functions/random.py +0 -0
  151. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/selectable.py +0 -0
  152. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  153. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/sqlite/types.py +0 -0
  154. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/sqlite/vector.py +0 -0
  155. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/types.py +0 -0
  156. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/sql/utils.py +0 -0
  157. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/storage.py +0 -0
  158. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/telemetry.py +0 -0
  159. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/torch/__init__.py +0 -0
  160. {datachain-0.5.0 → datachain-0.6.0}/src/datachain/utils.py +0 -0
  161. {datachain-0.5.0 → datachain-0.6.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  162. {datachain-0.5.0 → datachain-0.6.0}/src/datachain.egg-info/entry_points.txt +0 -0
  163. {datachain-0.5.0 → datachain-0.6.0}/src/datachain.egg-info/requires.txt +0 -0
  164. {datachain-0.5.0 → datachain-0.6.0}/src/datachain.egg-info/top_level.txt +0 -0
  165. {datachain-0.5.0 → datachain-0.6.0}/tests/__init__.py +0 -0
  166. {datachain-0.5.0 → datachain-0.6.0}/tests/benchmarks/__init__.py +0 -0
  167. {datachain-0.5.0 → datachain-0.6.0}/tests/benchmarks/conftest.py +0 -0
  168. {datachain-0.5.0 → datachain-0.6.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  169. {datachain-0.5.0 → datachain-0.6.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  170. {datachain-0.5.0 → datachain-0.6.0}/tests/benchmarks/datasets/.gitignore +0 -0
  171. {datachain-0.5.0 → datachain-0.6.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  172. {datachain-0.5.0 → datachain-0.6.0}/tests/benchmarks/test_datachain.py +0 -0
  173. {datachain-0.5.0 → datachain-0.6.0}/tests/benchmarks/test_ls.py +0 -0
  174. {datachain-0.5.0 → datachain-0.6.0}/tests/benchmarks/test_version.py +0 -0
  175. {datachain-0.5.0 → datachain-0.6.0}/tests/conftest.py +0 -0
  176. {datachain-0.5.0 → datachain-0.6.0}/tests/data.py +0 -0
  177. {datachain-0.5.0 → datachain-0.6.0}/tests/examples/__init__.py +0 -0
  178. {datachain-0.5.0 → datachain-0.6.0}/tests/examples/test_examples.py +0 -0
  179. {datachain-0.5.0 → datachain-0.6.0}/tests/examples/test_wds_e2e.py +0 -0
  180. {datachain-0.5.0 → datachain-0.6.0}/tests/examples/wds_data.py +0 -0
  181. {datachain-0.5.0 → datachain-0.6.0}/tests/func/__init__.py +0 -0
  182. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_catalog.py +0 -0
  183. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_client.py +0 -0
  184. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_datachain.py +0 -0
  185. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_dataset_query.py +0 -0
  186. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_datasets.py +0 -0
  187. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_feature_pickling.py +0 -0
  188. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_listing.py +0 -0
  189. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_ls.py +0 -0
  190. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_meta_formats.py +0 -0
  191. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_metrics.py +0 -0
  192. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_pull.py +0 -0
  193. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_pytorch.py +0 -0
  194. {datachain-0.5.0 → datachain-0.6.0}/tests/func/test_query.py +0 -0
  195. {datachain-0.5.0 → datachain-0.6.0}/tests/scripts/feature_class.py +0 -0
  196. {datachain-0.5.0 → datachain-0.6.0}/tests/scripts/feature_class_parallel.py +0 -0
  197. {datachain-0.5.0 → datachain-0.6.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  198. {datachain-0.5.0 → datachain-0.6.0}/tests/scripts/name_len_slow.py +0 -0
  199. {datachain-0.5.0 → datachain-0.6.0}/tests/test_cli_e2e.py +0 -0
  200. {datachain-0.5.0 → datachain-0.6.0}/tests/test_query_e2e.py +0 -0
  201. {datachain-0.5.0 → datachain-0.6.0}/tests/test_telemetry.py +0 -0
  202. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/__init__.py +0 -0
  203. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/__init__.py +0 -0
  204. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/conftest.py +0 -0
  205. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_arrow.py +0 -0
  206. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_clip.py +0 -0
  207. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  208. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  209. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_feature.py +0 -0
  210. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_feature_utils.py +0 -0
  211. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_file.py +0 -0
  212. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_hf.py +0 -0
  213. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_image.py +0 -0
  214. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_schema.py +0 -0
  215. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_signal_schema.py +0 -0
  216. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  217. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_text.py +0 -0
  218. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_udf_signature.py +0 -0
  219. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_utils.py +0 -0
  220. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/lib/test_webdataset.py +0 -0
  221. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/sql/__init__.py +0 -0
  222. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  223. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  224. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/sql/test_array.py +0 -0
  225. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/sql/test_conditional.py +0 -0
  226. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/sql/test_path.py +0 -0
  227. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/sql/test_random.py +0 -0
  228. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/sql/test_selectable.py +0 -0
  229. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_asyn.py +0 -0
  230. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_cache.py +0 -0
  231. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_catalog.py +0 -0
  232. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_catalog_loader.py +0 -0
  233. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_cli_parsing.py +0 -0
  234. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_client.py +0 -0
  235. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_client_s3.py +0 -0
  236. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_data_storage.py +0 -0
  237. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_database_engine.py +0 -0
  238. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_dataset.py +0 -0
  239. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_dispatch.py +0 -0
  240. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_fileslice.py +0 -0
  241. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_listing.py +0 -0
  242. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_metastore.py +0 -0
  243. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_module_exports.py +0 -0
  244. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_query.py +0 -0
  245. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_query_metrics.py +0 -0
  246. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_query_params.py +0 -0
  247. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_serializer.py +0 -0
  248. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_session.py +0 -0
  249. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_storage.py +0 -0
  250. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_utils.py +0 -0
  251. {datachain-0.5.0 → datachain-0.6.0}/tests/unit/test_warehouse.py +0 -0
  252. {datachain-0.5.0 → datachain-0.6.0}/tests/utils.py +0 -0
--- datachain-0.5.0/.pre-commit-config.yaml
+++ datachain-0.6.0/.pre-commit-config.yaml
@@ -4,7 +4,7 @@ ci:
   skip: [mypy]
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.6.0
+  rev: v5.0.0
   hooks:
   - id: check-added-large-files
     exclude: '^tests/examples/data/'
@@ -24,7 +24,7 @@ repos:
   - id: trailing-whitespace
     exclude: '^LICENSES/'
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: 'v0.6.7'
+  rev: 'v0.6.9'
   hooks:
   - id: ruff
     args: [--fix, --exit-non-zero-on-fix]
--- datachain-0.5.0/src/datachain.egg-info/PKG-INFO
+++ datachain-0.6.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.5.0
+Version: 0.6.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
--- datachain-0.5.0/src/datachain/catalog/catalog.py
+++ datachain-0.6.0/src/datachain/catalog/catalog.py
@@ -988,6 +988,14 @@ class Catalog:
         schema = {
             c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
         }
+
+        job_id = job_id or os.getenv("DATACHAIN_JOB_ID")
+        if not job_id:
+            from datachain.query.session import Session
+
+            session = Session.get(catalog=self)
+            job_id = session.job_id
+
         dataset = self.metastore.create_dataset_version(
             dataset,
             version,
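
Note: the added block resolves job_id from three sources, in order: the explicit argument, the DATACHAIN_JOB_ID environment variable, and finally the session's own job id. A minimal sketch of how a runner could use the environment-variable fallback to tag every dataset version created by a child script (the script name is illustrative):

    import os
    import subprocess
    import uuid

    # Hypothetical runner: all dataset versions created by the child process
    # get attributed to one job id via the DATACHAIN_JOB_ID fallback above.
    env = {**os.environ, "DATACHAIN_JOB_ID": str(uuid.uuid4())}
    subprocess.run(["python", "my_chain_script.py"], env=env, check=True)
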
--- datachain-0.5.0/src/datachain/data_storage/metastore.py
+++ datachain-0.6.0/src/datachain/data_storage/metastore.py
@@ -50,7 +50,6 @@ if TYPE_CHECKING:
     from datachain.data_storage import AbstractIDGenerator, schema
     from datachain.data_storage.db_engine import DatabaseEngine
 
-
 logger = logging.getLogger("datachain")
 
 
@@ -384,6 +383,11 @@ class AbstractMetastore(ABC, Serializable):
     ) -> None:
         """Set the status of the given job and dataset."""
 
+    @abstractmethod
+    def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
+        """Returns dataset names and versions for the job."""
+        raise NotImplementedError
+
 
 class AbstractDBMetastore(AbstractMetastore):
     """
@@ -1519,3 +1523,18 @@ class AbstractDBMetastore(AbstractMetastore):
             .values(status=dataset_status)
         )
         self.db.execute(query, conn=conn)  # type: ignore[attr-defined]
+
+    def get_job_dataset_versions(self, job_id: str) -> list[tuple[str, int]]:
+        """Returns dataset names and versions for the job."""
+        dv = self._datasets_versions
+        ds = self._datasets
+
+        join_condition = dv.c.dataset_id == ds.c.id
+
+        query = (
+            self._datasets_versions_select(ds.c.name, dv.c.version)
+            .select_from(dv.join(ds, join_condition))
+            .where(dv.c.job_id == job_id)
+        )
+
+        return list(self.db.execute(query))
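
Note: together with the new tests/test_atomicity.py, this accessor lets callers enumerate everything a job produced, for example to clean up partial results after a failure. A hedged usage sketch (the get_catalog import is assumed to be exposed via datachain/catalog/loader.py; the job id is illustrative):

    from datachain.catalog import get_catalog  # assumed public import

    catalog = get_catalog()
    # (dataset_name, version) pairs registered under this job id.
    for name, version in catalog.metastore.get_job_dataset_versions("my-job-id"):
        print(name, version)
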
--- datachain-0.5.0/src/datachain/data_storage/sqlite.py
+++ datachain-0.6.0/src/datachain/data_storage/sqlite.py
@@ -15,6 +15,7 @@ from typing import (
 )
 
 import sqlalchemy
+from packaging import version
 from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
@@ -153,7 +154,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
             import sys
 
-            db.set_trace_callback(sys.stderr.write)
+            db.set_trace_callback(lambda stmt: print(stmt, file=sys.stderr))
 
         load_usearch_extension(db)
 
@@ -345,45 +346,36 @@ class SQLiteIDGenerator(AbstractDBIDGenerator):
     def get_next_ids(self, uri: str, count: int) -> range:
         """Returns a range of IDs for the given URI."""
 
-        # NOTE: we can't use RETURNING clause here because it is only available
-        # in sqlalchemy v2, see
-        # https://github.com/sqlalchemy/sqlalchemy/issues/6195#issuecomment-1248700677
-        # After we upgrade to sqlalchemy v2, we can use the following code,
-        # leaving fallback to the current implementation for older versions of SQLite,
-        # which is still supported, for example, in Ubuntu 20.04 LTS (Focal Fossa),
-        # where SQLite version 3.31.1 is used.
-
-        # sqlite_version = version.parse(sqlite3.sqlite_version)
-        # if sqlite_version >= version.parse("3.35.0"):
-        #     # RETURNING is supported on SQLite 3.35.0 (2021-03-12) or newer
-        #     stmt = (
-        #         sqlite.insert(self._table)
-        #         .values(uri=uri, last_id=count)
-        #         .on_conflict_do_update(
-        #             index_elements=["uri"],
-        #             set_={"last_id": self._table.c.last_id + count},
-        #         )
-        #         .returning(self._table.c.last_id)
-        #     )
-        #     last_id = self._db.execute(stmt).fetchone()[0]
-        # else:
-        #     (fallback to the current implementation with a transaction)
-
-        # Transactions ensure no concurrency conflicts
-        with self._db.transaction() as conn:
-            # UPSERT syntax was added to SQLite with version 3.24.0 (2018-06-04).
-            stmt_ins = (
+        sqlite_version = version.parse(sqlite3.sqlite_version)
+        is_returning_supported = sqlite_version >= version.parse("3.35.0")
+        if is_returning_supported:
+            stmt = (
                 sqlite.insert(self._table)
                 .values(uri=uri, last_id=count)
                 .on_conflict_do_update(
                     index_elements=["uri"],
                     set_={"last_id": self._table.c.last_id + count},
                 )
+                .returning(self._table.c.last_id)
             )
-            self._db.execute(stmt_ins, conn=conn)
+            last_id = self._db.execute(stmt).fetchone()[0]
+        else:
+            # Older versions of SQLite are still the default under Ubuntu LTS,
+            # e.g. Ubuntu 20.04 LTS (Focal Fossa) uses 3.31.1
+            # Transactions ensure no concurrency conflicts
+            with self._db.transaction() as conn:
+                stmt_ins = (
+                    sqlite.insert(self._table)
+                    .values(uri=uri, last_id=count)
+                    .on_conflict_do_update(
+                        index_elements=["uri"],
+                        set_={"last_id": self._table.c.last_id + count},
+                    )
+                )
+                self._db.execute(stmt_ins, conn=conn)
 
-            stmt_sel = select(self._table.c.last_id).where(self._table.c.uri == uri)
-            last_id = self._db.execute(stmt_sel, conn=conn).fetchone()[0]
+                stmt_sel = select(self._table.c.last_id).where(self._table.c.uri == uri)
+                last_id = self._db.execute(stmt_sel, conn=conn).fetchone()[0]
 
         return range(last_id - count + 1, last_id + 1)
 
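
Note: the commented-out plan from 0.5.0 is now live code: RETURNING on SQLite >= 3.35.0, with the transactional UPSERT-then-SELECT kept as a fallback. The same version-gated pattern, as a standalone sketch using only the stdlib sqlite3 module (table name and URI are illustrative):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE id_gen (uri TEXT PRIMARY KEY, last_id INTEGER)")

    def next_ids(uri: str, count: int) -> range:
        if sqlite3.sqlite_version_info >= (3, 35, 0):
            # RETURNING is available since SQLite 3.35.0 (2021-03-12).
            last_id = conn.execute(
                "INSERT INTO id_gen (uri, last_id) VALUES (?, ?) "
                "ON CONFLICT (uri) DO UPDATE SET last_id = last_id + ? "
                "RETURNING last_id",
                (uri, count, count),
            ).fetchone()[0]
        else:
            # Fallback: UPSERT (SQLite 3.24.0+) then SELECT, in one transaction.
            with conn:
                conn.execute(
                    "INSERT INTO id_gen (uri, last_id) VALUES (?, ?) "
                    "ON CONFLICT (uri) DO UPDATE SET last_id = last_id + ?",
                    (uri, count, count),
                )
                last_id = conn.execute(
                    "SELECT last_id FROM id_gen WHERE uri = ?", (uri,)
                ).fetchone()[0]
        return range(last_id - count + 1, last_id + 1)

    print(list(next_ids("s3://bucket", 3)))  # [1, 2, 3] on the first call
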
--- datachain-0.5.0/src/datachain/lib/arrow.py
+++ datachain-0.6.0/src/datachain/lib/arrow.py
@@ -1,8 +1,9 @@
 import re
 from collections.abc import Sequence
 from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Any, Optional
 
+import orjson
 import pyarrow as pa
 from pyarrow.dataset import CsvFileFormat, dataset
 from tqdm import tqdm
@@ -10,6 +11,7 @@ from tqdm import tqdm
 from datachain.lib.data_model import dict_to_data_model
 from datachain.lib.file import ArrowRow, File
 from datachain.lib.model_store import ModelStore
+from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import Generator
 
 if TYPE_CHECKING:
@@ -20,6 +22,9 @@ if TYPE_CHECKING:
     from datachain.lib.dc import DataChain
 
 
+DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"
+
+
 class ArrowGenerator(Generator):
     def __init__(
         self,
@@ -61,28 +66,35 @@ class ArrowGenerator(Generator):
                 path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
             )
         hf_schema = _get_hf_schema(ds.schema)
+        use_datachain_schema = (
+            bool(ds.schema.metadata)
+            and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in ds.schema.metadata
+        )
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
             for record_batch in ds.to_batches():
                 for record in record_batch.to_pylist():
-                    vals = list(record.values())
-                    if self.output_schema:
-                        fields = self.output_schema.model_fields
-                        vals_dict = {}
-                        for i, ((field, field_info), val) in enumerate(
-                            zip(fields.items(), vals)
-                        ):
-                            anno = field_info.annotation
-                            if hf_schema:
-                                from datachain.lib.hf import convert_feature
-
-                                feat = list(hf_schema[0].values())[i]
-                                vals_dict[field] = convert_feature(val, feat, anno)
-                            elif ModelStore.is_pydantic(anno):
-                                vals_dict[field] = anno(**val)  # type: ignore[misc]
-                            else:
-                                vals_dict[field] = val
-                        vals = [self.output_schema(**vals_dict)]
+                    if use_datachain_schema and self.output_schema:
+                        vals = [_nested_model_instantiate(record, self.output_schema)]
+                    else:
+                        vals = list(record.values())
+                        if self.output_schema:
+                            fields = self.output_schema.model_fields
+                            vals_dict = {}
+                            for i, ((field, field_info), val) in enumerate(
+                                zip(fields.items(), vals)
+                            ):
+                                anno = field_info.annotation
+                                if hf_schema:
+                                    from datachain.lib.hf import convert_feature
+
+                                    feat = list(hf_schema[0].values())[i]
+                                    vals_dict[field] = convert_feature(val, feat, anno)
+                                elif ModelStore.is_pydantic(anno):
+                                    vals_dict[field] = anno(**val)  # type: ignore[misc]
+                                else:
+                                    vals_dict[field] = val
+                            vals = [self.output_schema(**vals_dict)]
                     if self.source:
                         kwargs: dict = self.kwargs
                         # Can't serialize CsvFileFormat; may lose formatting options.
@@ -113,6 +125,9 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = None
     )
     if not col_names:
        col_names = schema.names
+    signal_schema = _get_datachain_schema(schema)
+    if signal_schema:
+        return signal_schema.values
     columns = _convert_col_names(col_names)  # type: ignore[arg-type]
     hf_schema = _get_hf_schema(schema)
     if hf_schema:
@@ -197,3 +212,33 @@ def _get_hf_schema(
         features = schema_from_arrow(schema)
         return features, get_output_schema(features)
     return None
+
+
+def _get_datachain_schema(schema: "pa.Schema") -> Optional[SignalSchema]:
+    """Return a restored SignalSchema from parquet metadata, if any is found."""
+    if schema.metadata and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in schema.metadata:
+        serialized_signal_schema = orjson.loads(
+            schema.metadata[DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY]
+        )
+        return SignalSchema.deserialize(serialized_signal_schema)
+    return None
+
+
+def _nested_model_instantiate(
+    column_values: dict[str, Any], model: type["BaseModel"], prefix: str = ""
+) -> "BaseModel":
+    """Instantiate the given model and all sub-models/fields based on the provided
+    column values."""
+    vals_dict = {}
+    for field, field_info in model.model_fields.items():
+        anno = field_info.annotation
+        cur_path = f"{prefix}.{field}" if prefix else field
+        if ModelStore.is_pydantic(anno):
+            vals_dict[field] = _nested_model_instantiate(
+                column_values,
+                anno,  # type: ignore[arg-type]
+                prefix=cur_path,
+            )
+        elif cur_path in column_values:
+            vals_dict[field] = column_values[cur_path]
+    return model(**vals_dict)
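
Note: _nested_model_instantiate is the read-side counterpart of the flat, dot-delimited column names DataChain writes to parquet: it walks the model tree and picks values by dotted path. A self-contained sketch of the same idea (toy models; a plain issubclass check stands in for ModelStore.is_pydantic):

    from typing import Any
    from pydantic import BaseModel

    class GPS(BaseModel):
        lat: float
        lon: float

    class Photo(BaseModel):
        name: str
        gps: GPS

    def nested_instantiate(values: dict[str, Any], model: type[BaseModel], prefix: str = "") -> BaseModel:
        vals = {}
        for field, info in model.model_fields.items():
            path = f"{prefix}.{field}" if prefix else field
            anno = info.annotation
            if isinstance(anno, type) and issubclass(anno, BaseModel):
                # Recurse into nested models, extending the dotted prefix.
                vals[field] = nested_instantiate(values, anno, prefix=path)
            elif path in values:
                vals[field] = values[path]
        return model(**vals)

    row = {"name": "a.jpg", "gps.lat": 1.0, "gps.lon": 2.0}
    print(nested_instantiate(row, Photo))  # name='a.jpg' gps=GPS(lat=1.0, lon=2.0)
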
--- datachain-0.5.0/src/datachain/lib/convert/values_to_tuples.py
+++ datachain-0.6.0/src/datachain/lib/convert/values_to_tuples.py
@@ -4,7 +4,7 @@ from typing import Any, Union
 from datachain.lib.data_model import (
     DataType,
     DataTypeNames,
-    DataValuesType,
+    DataValue,
     is_chain_type,
 )
 from datachain.lib.utils import DataChainParamsError
@@ -20,7 +20,7 @@ class ValuesToTupleError(DataChainParamsError):
 def values_to_tuples(  # noqa: C901, PLR0912
     ds_name: str = "",
     output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
-    **fr_map: Sequence[DataValuesType],
+    **fr_map: Sequence[DataValue],
 ) -> tuple[Any, Any, Any]:
     if output:
         if not isinstance(output, (Sequence, str, dict)):
--- datachain-0.5.0/src/datachain/lib/data_model.py
+++ datachain-0.6.0/src/datachain/lib/data_model.py
@@ -18,7 +18,7 @@ StandardType = Union[
 ]
 DataType = Union[type[BaseModel], StandardType]
 DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
-DataValuesType = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
+DataValue = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
 
 
 class DataModel(BaseModel):
--- datachain-0.5.0/src/datachain/lib/dc.py
+++ datachain-0.6.0/src/datachain/lib/dc.py
@@ -16,6 +16,7 @@ from typing import (
     overload,
 )
 
+import orjson
 import pandas as pd
 import sqlalchemy
 from pydantic import BaseModel
@@ -58,9 +59,10 @@ from datachain.query.dataset import (
 from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
 from datachain.telemetry import telemetry
-from datachain.utils import inside_notebook
+from datachain.utils import batched_it, inside_notebook
 
 if TYPE_CHECKING:
+    from pyarrow import DataType as ArrowDataType
     from typing_extensions import Concatenate, ParamSpec, Self
 
     from datachain.lib.hf import HFDatasetType
@@ -71,6 +73,10 @@ C = Column
 
 _T = TypeVar("_T")
 D = TypeVar("D", bound="DataChain")
+UDFObjT = TypeVar("UDFObjT", bound=UDFBase)
+
+
+DEFAULT_PARQUET_CHUNK_SIZE = 100_000
 
 
 def resolve_columns(
@@ -819,7 +825,7 @@ class DataChain:
 
     def gen(
         self,
-        func: Optional[Callable] = None,
+        func: Optional[Union[Callable, Generator]] = None,
         params: Union[None, str, Sequence[str]] = None,
         output: OutputType = None,
         **signal_map,
@@ -931,12 +937,12 @@
 
     def _udf_to_obj(
         self,
-        target_class: type[UDFBase],
-        func: Optional[Callable],
+        target_class: type[UDFObjT],
+        func: Optional[Union[Callable, UDFObjT]],
         params: Union[None, str, Sequence[str]],
         output: OutputType,
         signal_map,
-    ) -> UDFBase:
+    ) -> UDFObjT:
        is_generator = target_class.is_output_batched
        name = self.name or ""
 
@@ -1019,7 +1025,7 @@
         The supported functions:
             Numerical: +, -, *, /, rand(), avg(), count(), func(),
                 greatest(), least(), max(), min(), sum()
-            String: length(), split()
+            String: length(), split(), replace(), regexp_replace()
             Filename: name(), parent(), file_stem(), file_ext()
             Array: length(), sip_hash_64(), euclidean_distance(),
                 cosine_distance()
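
Note: replace() and regexp_replace() in the docstring correspond to the +12 lines in src/datachain/sql/functions/string.py. A hedged example of calling them from mutate (the argument order is assumed to follow the existing string functions; the storage path is illustrative):

    from datachain.lib.dc import C, DataChain
    from datachain.sql.functions import string

    chain = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/").mutate(
        renamed=string.replace(C("file.path"), "cat", "dog"),        # assumed signature
        cleaned=string.regexp_replace(C("file.path"), r"[0-9]+", ""),  # assumed signature
    )
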
@@ -1103,6 +1109,29 @@
         rows = (row_factory(db_signals, r) for r in rows)
         yield from rows
 
+    def to_columnar_data_with_names(
+        self, chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE
+    ) -> tuple[list[str], Iterator[list[list[Any]]]]:
+        """Returns column names and the results as an iterator that provides chunks,
+        with each chunk containing a list of columns, where each column contains a
+        list of the row values for that column in that chunk. Useful for columnar data
+        formats, such as parquet or other OLAP databases.
+        """
+        headers, _ = self._effective_signals_schema.get_headers_with_length()
+        column_names = [".".join(filter(None, header)) for header in headers]
+
+        results_iter = self.collect_flatten()
+
+        def column_chunks() -> Iterator[list[list[Any]]]:
+            for chunk_iter in batched_it(results_iter, chunk_size):
+                columns: list[list[Any]] = [[] for _ in column_names]
+                for row in chunk_iter:
+                    for i, col in enumerate(columns):
+                        col.append(row[i])
+                yield columns
+
+        return column_names, column_chunks()
+
     @overload
     def results(self) -> list[tuple[Any, ...]]: ...
 
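
Note: to_columnar_data_with_names transposes the flat row iterator into column-major chunks so a whole result set never has to sit in memory at once. The transposition in isolation (a local batched() stands in for datachain.utils.batched_it):

    from itertools import islice

    def batched(rows, n):
        # Yield lists of up to n rows from an iterable.
        it = iter(rows)
        while chunk := list(islice(it, n)):
            yield chunk

    rows = [(1, "a"), (2, "b"), (3, "c")]
    for chunk in batched(rows, 2):
        columns = [list(col) for col in zip(*chunk)]  # rows -> columns
        print(columns)
    # [[1, 2], ['a', 'b']]
    # [[3], ['c']]
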
@@ -1681,6 +1710,7 @@ class DataChain:
         nrows=None,
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
+        column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from csv files.
@@ -1699,6 +1729,9 @@
             nrows : Optional row limit.
             session : Session to use for the chain.
             settings : Settings to use for the chain.
+            column_types : Dictionary of column names and their corresponding types.
+                It is passed to CSV reader and for each column specified type auto
+                inference is disabled.
 
         Example:
             Reading a csv file:
@@ -1714,6 +1747,15 @@
         from pandas.io.parsers.readers import STR_NA_VALUES
         from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
         from pyarrow.dataset import CsvFileFormat
+        from pyarrow.lib import type_for_alias
+
+        if column_types:
+            column_types = {
+                name: type_for_alias(typ) if isinstance(typ, str) else typ
+                for name, typ in column_types.items()
+            }
+        else:
+            column_types = {}
 
         chain = DataChain.from_storage(
             path, session=session, settings=settings, **kwargs
@@ -1739,7 +1781,9 @@
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
         convert_options = ConvertOptions(
-            strings_can_be_null=True, null_values=STR_NA_VALUES
+            strings_can_be_null=True,
+            null_values=STR_NA_VALUES,
+            column_types=column_types,
         )
         format = CsvFileFormat(
             parse_options=parse_options,
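
Note: with the new column_types parameter, string aliases are resolved through pyarrow's type_for_alias, and typed columns bypass CSV type inference. A usage sketch (the path and column names are illustrative):

    import pyarrow as pa
    from datachain.lib.dc import DataChain

    chain = DataChain.from_csv(
        "s3://mybucket/data.csv",
        column_types={
            "user_id": "int64",     # string alias, resolved via type_for_alias
            "score": pa.float32(),  # or a pyarrow DataType directly
        },
    )
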
@@ -1808,21 +1852,96 @@
         self,
         path: Union[str, os.PathLike[str], BinaryIO],
         partition_cols: Optional[Sequence[str]] = None,
+        chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
         **kwargs,
     ) -> None:
-        """Save chain to parquet file.
+        """Save chain to parquet file with SignalSchema metadata.
 
         Parameters:
             path : Path or a file-like binary object to save the file.
             partition_cols : Column names by which to partition the dataset.
+            chunk_size : The chunk size of results to read and convert to columnar
+                data, to avoid running out of memory.
         """
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+
+        from datachain.lib.arrow import DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY
+
         _partition_cols = list(partition_cols) if partition_cols else None
-        return self.to_pandas().to_parquet(
-            path,
-            partition_cols=_partition_cols,
-            **kwargs,
+        signal_schema_metadata = orjson.dumps(
+            self._effective_signals_schema.serialize()
         )
 
+        column_names, column_chunks = self.to_columnar_data_with_names(chunk_size)
+
+        parquet_schema = None
+        parquet_writer = None
+        first_chunk = True
+
+        for chunk in column_chunks:
+            # pyarrow infers the best parquet schema from the python types of
+            # the input data.
+            table = pa.Table.from_pydict(
+                dict(zip(column_names, chunk)),
+                schema=parquet_schema,
+            )
+
+            # Preserve any existing metadata, and add the DataChain SignalSchema.
+            existing_metadata = table.schema.metadata or {}
+            merged_metadata = {
+                **existing_metadata,
+                DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY: signal_schema_metadata,
+            }
+            table = table.replace_schema_metadata(merged_metadata)
+            parquet_schema = table.schema
+
+            if _partition_cols:
+                # Write to a partitioned parquet dataset.
+                pq.write_to_dataset(
+                    table,
+                    root_path=path,
+                    partition_cols=_partition_cols,
+                    **kwargs,
+                )
+            else:
+                if first_chunk:
+                    # Write to a single parquet file.
+                    parquet_writer = pq.ParquetWriter(path, parquet_schema, **kwargs)
+                    first_chunk = False
+
+                assert parquet_writer
+                parquet_writer.write_table(table)
+
+        if parquet_writer:
+            parquet_writer.close()
+
+    def to_csv(
+        self,
+        path: Union[str, os.PathLike[str]],
+        delimiter: str = ",",
+        **kwargs,
+    ) -> None:
+        """Save chain to a csv (comma-separated values) file.
+
+        Parameters:
+            path : Path to save the file.
+            delimiter : Delimiter to use for the resulting file.
+        """
+        import csv
+
+        headers, _ = self._effective_signals_schema.get_headers_with_length()
+        column_names = [".".join(filter(None, header)) for header in headers]
+
+        results_iter = self.collect_flatten()
+
+        with open(path, "w", newline="") as f:
+            writer = csv.writer(f, delimiter=delimiter, **kwargs)
+            writer.writerow(column_names)
+
+            for row in results_iter:
+                writer.writerow(row)
+
     @classmethod
     def from_records(
         cls,
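
Note: since to_parquet now stores the serialized SignalSchema under the b"DataChain SignalSchema" metadata key, and the arrow reader restores it via _get_datachain_schema, a write/read round trip should preserve nested signal types instead of flat auto-inferred columns. A hedged sketch (the dataset name is illustrative):

    from datachain.lib.dc import DataChain

    dc = DataChain.from_dataset("my_dataset")  # illustrative dataset name
    dc.to_parquet("out.parquet")  # embeds the SignalSchema in the file metadata

    # Reading back reconstructs the original nested models from the metadata.
    restored = DataChain.from_parquet("out.parquet")
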
--- datachain-0.5.0/src/datachain/lib/signal_schema.py
+++ datachain-0.6.0/src/datachain/lib/signal_schema.py
@@ -25,7 +25,7 @@ from typing_extensions import Literal as LiteralEx
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.sql_to_python import sql_to_python
 from datachain.lib.convert.unflatten import unflatten_to_json_pos
-from datachain.lib.data_model import DataModel, DataType
+from datachain.lib.data_model import DataModel, DataType, DataValue
 from datachain.lib.file import File
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import DataChainParamsError
@@ -110,7 +110,7 @@ class SignalSchema:
     values: dict[str, DataType]
     tree: dict[str, Any]
     setup_func: dict[str, Callable]
-    setup_values: Optional[dict[str, Callable]]
+    setup_values: Optional[dict[str, Any]]
 
     def __init__(
         self,
@@ -333,21 +333,21 @@ class SignalSchema:
             res[db_name] = python_to_sql(type_)
         return res
 
-    def row_to_objs(self, row: Sequence[Any]) -> list[DataType]:
+    def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]:
         self._init_setup_values()
 
-        objs = []
+        objs: list[DataValue] = []
         pos = 0
         for name, fr_type in self.values.items():
             if self.setup_values and (val := self.setup_values.get(name, None)):
                 objs.append(val)
             elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
                 j, pos = unflatten_to_json_pos(fr, row, pos)
-                objs.append(fr(**j))  # type: ignore[arg-type]
+                objs.append(fr(**j))
             else:
                 objs.append(row[pos])
                 pos += 1
-        return objs  # type: ignore[return-value]
+        return objs
 
     def contains_file(self) -> bool:
         for type_ in self.values.values():