datachain 0.3.4__tar.gz → 0.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (242) hide show
  1. {datachain-0.3.4 → datachain-0.3.6}/.github/workflows/tests-studio.yml +1 -1
  2. {datachain-0.3.4 → datachain-0.3.6}/.github/workflows/tests.yml +11 -3
  3. {datachain-0.3.4 → datachain-0.3.6}/.pre-commit-config.yaml +1 -1
  4. {datachain-0.3.4/src/datachain.egg-info → datachain-0.3.6}/PKG-INFO +3 -4
  5. {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/torch-loader.py +1 -1
  6. {datachain-0.3.4 → datachain-0.3.6}/pyproject.toml +4 -5
  7. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/catalog/catalog.py +15 -3
  8. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/sqlite.py +1 -0
  9. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/flatten.py +0 -28
  10. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/dc.py +49 -12
  11. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/signal_schema.py +10 -5
  12. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/dataset.py +42 -22
  13. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/types.py +14 -8
  14. {datachain-0.3.4 → datachain-0.3.6/src/datachain.egg-info}/PKG-INFO +3 -4
  15. {datachain-0.3.4 → datachain-0.3.6}/src/datachain.egg-info/requires.txt +2 -3
  16. {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_datachain.py +15 -1
  17. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_datachain.py +97 -0
  18. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_signal_schema.py +43 -2
  19. {datachain-0.3.4 → datachain-0.3.6}/.cruft.json +0 -0
  20. {datachain-0.3.4 → datachain-0.3.6}/.gitattributes +0 -0
  21. {datachain-0.3.4 → datachain-0.3.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  22. {datachain-0.3.4 → datachain-0.3.6}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  23. {datachain-0.3.4 → datachain-0.3.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  24. {datachain-0.3.4 → datachain-0.3.6}/.github/codecov.yaml +0 -0
  25. {datachain-0.3.4 → datachain-0.3.6}/.github/dependabot.yml +0 -0
  26. {datachain-0.3.4 → datachain-0.3.6}/.github/workflows/benchmarks.yml +0 -0
  27. {datachain-0.3.4 → datachain-0.3.6}/.github/workflows/release.yml +0 -0
  28. {datachain-0.3.4 → datachain-0.3.6}/.github/workflows/update-template.yaml +0 -0
  29. {datachain-0.3.4 → datachain-0.3.6}/.gitignore +0 -0
  30. {datachain-0.3.4 → datachain-0.3.6}/CODE_OF_CONDUCT.rst +0 -0
  31. {datachain-0.3.4 → datachain-0.3.6}/CONTRIBUTING.rst +0 -0
  32. {datachain-0.3.4 → datachain-0.3.6}/LICENSE +0 -0
  33. {datachain-0.3.4 → datachain-0.3.6}/README.rst +0 -0
  34. {datachain-0.3.4 → datachain-0.3.6}/docs/assets/captioned_cartoons.png +0 -0
  35. {datachain-0.3.4 → datachain-0.3.6}/docs/assets/datachain.png +0 -0
  36. {datachain-0.3.4 → datachain-0.3.6}/docs/assets/flowchart.png +0 -0
  37. {datachain-0.3.4 → datachain-0.3.6}/docs/index.md +0 -0
  38. {datachain-0.3.4 → datachain-0.3.6}/docs/references/datachain.md +0 -0
  39. {datachain-0.3.4 → datachain-0.3.6}/docs/references/datatype.md +0 -0
  40. {datachain-0.3.4 → datachain-0.3.6}/docs/references/file.md +0 -0
  41. {datachain-0.3.4 → datachain-0.3.6}/docs/references/index.md +0 -0
  42. {datachain-0.3.4 → datachain-0.3.6}/docs/references/sql.md +0 -0
  43. {datachain-0.3.4 → datachain-0.3.6}/docs/references/torch.md +0 -0
  44. {datachain-0.3.4 → datachain-0.3.6}/docs/references/udf.md +0 -0
  45. {datachain-0.3.4 → datachain-0.3.6}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
  46. {datachain-0.3.4 → datachain-0.3.6}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  47. {datachain-0.3.4 → datachain-0.3.6}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  48. {datachain-0.3.4 → datachain-0.3.6}/examples/computer_vision/openimage-detect.py +0 -0
  49. {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/common_sql_functions.py +0 -0
  50. {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/json-csv-reader.py +0 -0
  51. {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/udfs/parallel.py +0 -0
  52. {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/udfs/simple.py +0 -0
  53. {datachain-0.3.4 → datachain-0.3.6}/examples/get_started/udfs/stateful.py +0 -0
  54. {datachain-0.3.4 → datachain-0.3.6}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
  55. {datachain-0.3.4 → datachain-0.3.6}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
  56. {datachain-0.3.4 → datachain-0.3.6}/examples/llm_and_nlp/llm-claude.py +0 -0
  57. {datachain-0.3.4 → datachain-0.3.6}/examples/llm_and_nlp/unstructured-text.py +0 -0
  58. {datachain-0.3.4 → datachain-0.3.6}/examples/multimodal/clip_inference.py +0 -0
  59. {datachain-0.3.4 → datachain-0.3.6}/examples/multimodal/hf_pipeline.py +0 -0
  60. {datachain-0.3.4 → datachain-0.3.6}/examples/multimodal/openai_image_desc_lib.py +0 -0
  61. {datachain-0.3.4 → datachain-0.3.6}/examples/multimodal/wds.py +0 -0
  62. {datachain-0.3.4 → datachain-0.3.6}/examples/multimodal/wds_filtered.py +0 -0
  63. {datachain-0.3.4 → datachain-0.3.6}/mkdocs.yml +0 -0
  64. {datachain-0.3.4 → datachain-0.3.6}/noxfile.py +0 -0
  65. {datachain-0.3.4 → datachain-0.3.6}/setup.cfg +0 -0
  66. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/__init__.py +0 -0
  67. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/__main__.py +0 -0
  68. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/asyn.py +0 -0
  69. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/cache.py +0 -0
  70. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/catalog/__init__.py +0 -0
  71. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/catalog/datasource.py +0 -0
  72. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/catalog/loader.py +0 -0
  73. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/catalog/subclass.py +0 -0
  74. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/cli.py +0 -0
  75. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/cli_utils.py +0 -0
  76. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/__init__.py +0 -0
  77. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/azure.py +0 -0
  78. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/fileslice.py +0 -0
  79. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/fsspec.py +0 -0
  80. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/gcs.py +0 -0
  81. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/local.py +0 -0
  82. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/client/s3.py +0 -0
  83. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/config.py +0 -0
  84. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/__init__.py +0 -0
  85. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/db_engine.py +0 -0
  86. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/id_generator.py +0 -0
  87. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/job.py +0 -0
  88. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/metastore.py +0 -0
  89. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/schema.py +0 -0
  90. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/serializer.py +0 -0
  91. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/data_storage/warehouse.py +0 -0
  92. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/dataset.py +0 -0
  93. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/error.py +0 -0
  94. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/job.py +0 -0
  95. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/__init__.py +0 -0
  96. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/arrow.py +0 -0
  97. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/clip.py +0 -0
  98. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/__init__.py +0 -0
  99. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/python_to_sql.py +0 -0
  100. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/sql_to_python.py +0 -0
  101. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/unflatten.py +0 -0
  102. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  103. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/data_model.py +0 -0
  104. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/dataset_info.py +0 -0
  105. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/file.py +0 -0
  106. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/image.py +0 -0
  107. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/listing.py +0 -0
  108. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/meta_formats.py +0 -0
  109. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/model_store.py +0 -0
  110. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/pytorch.py +0 -0
  111. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/settings.py +0 -0
  112. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/text.py +0 -0
  113. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/udf.py +0 -0
  114. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/udf_signature.py +0 -0
  115. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/utils.py +0 -0
  116. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/vfile.py +0 -0
  117. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/webdataset.py +0 -0
  118. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/lib/webdataset_laion.py +0 -0
  119. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/listing.py +0 -0
  120. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/node.py +0 -0
  121. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/nodes_fetcher.py +0 -0
  122. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/nodes_thread_pool.py +0 -0
  123. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/progress.py +0 -0
  124. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/py.typed +0 -0
  125. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/__init__.py +0 -0
  126. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/batch.py +0 -0
  127. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/builtins.py +0 -0
  128. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/dispatch.py +0 -0
  129. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/metrics.py +0 -0
  130. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/params.py +0 -0
  131. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/queue.py +0 -0
  132. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/schema.py +0 -0
  133. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/session.py +0 -0
  134. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/query/udf.py +0 -0
  135. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/remote/__init__.py +0 -0
  136. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/remote/studio.py +0 -0
  137. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/__init__.py +0 -0
  138. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/default/__init__.py +0 -0
  139. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/default/base.py +0 -0
  140. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/__init__.py +0 -0
  141. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/array.py +0 -0
  142. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/conditional.py +0 -0
  143. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/path.py +0 -0
  144. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/random.py +0 -0
  145. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/functions/string.py +0 -0
  146. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/selectable.py +0 -0
  147. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/sqlite/__init__.py +0 -0
  148. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/sqlite/base.py +0 -0
  149. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/sqlite/types.py +0 -0
  150. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/sqlite/vector.py +0 -0
  151. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/sql/utils.py +0 -0
  152. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/storage.py +0 -0
  153. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/torch/__init__.py +0 -0
  154. {datachain-0.3.4 → datachain-0.3.6}/src/datachain/utils.py +0 -0
  155. {datachain-0.3.4 → datachain-0.3.6}/src/datachain.egg-info/SOURCES.txt +0 -0
  156. {datachain-0.3.4 → datachain-0.3.6}/src/datachain.egg-info/dependency_links.txt +0 -0
  157. {datachain-0.3.4 → datachain-0.3.6}/src/datachain.egg-info/entry_points.txt +0 -0
  158. {datachain-0.3.4 → datachain-0.3.6}/src/datachain.egg-info/top_level.txt +0 -0
  159. {datachain-0.3.4 → datachain-0.3.6}/tests/__init__.py +0 -0
  160. {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/__init__.py +0 -0
  161. {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/conftest.py +0 -0
  162. {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  163. {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/datasets/.dvc/config +0 -0
  164. {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/datasets/.gitignore +0 -0
  165. {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  166. {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/test_datachain.py +0 -0
  167. {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/test_ls.py +0 -0
  168. {datachain-0.3.4 → datachain-0.3.6}/tests/benchmarks/test_version.py +0 -0
  169. {datachain-0.3.4 → datachain-0.3.6}/tests/conftest.py +0 -0
  170. {datachain-0.3.4 → datachain-0.3.6}/tests/data.py +0 -0
  171. {datachain-0.3.4 → datachain-0.3.6}/tests/examples/__init__.py +0 -0
  172. {datachain-0.3.4 → datachain-0.3.6}/tests/examples/test_examples.py +0 -0
  173. {datachain-0.3.4 → datachain-0.3.6}/tests/examples/test_wds_e2e.py +0 -0
  174. {datachain-0.3.4 → datachain-0.3.6}/tests/examples/wds_data.py +0 -0
  175. {datachain-0.3.4 → datachain-0.3.6}/tests/func/__init__.py +0 -0
  176. {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_catalog.py +0 -0
  177. {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_client.py +0 -0
  178. {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_dataset_query.py +0 -0
  179. {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_datasets.py +0 -0
  180. {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_feature_pickling.py +0 -0
  181. {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_listing.py +0 -0
  182. {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_ls.py +0 -0
  183. {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_pull.py +0 -0
  184. {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_pytorch.py +0 -0
  185. {datachain-0.3.4 → datachain-0.3.6}/tests/func/test_query.py +0 -0
  186. {datachain-0.3.4 → datachain-0.3.6}/tests/scripts/feature_class.py +0 -0
  187. {datachain-0.3.4 → datachain-0.3.6}/tests/scripts/feature_class_parallel.py +0 -0
  188. {datachain-0.3.4 → datachain-0.3.6}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  189. {datachain-0.3.4 → datachain-0.3.6}/tests/scripts/name_len_slow.py +0 -0
  190. {datachain-0.3.4 → datachain-0.3.6}/tests/test_cli_e2e.py +0 -0
  191. {datachain-0.3.4 → datachain-0.3.6}/tests/test_query_e2e.py +0 -0
  192. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/__init__.py +0 -0
  193. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/__init__.py +0 -0
  194. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/conftest.py +0 -0
  195. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_arrow.py +0 -0
  196. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_clip.py +0 -0
  197. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  198. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_datachain_merge.py +0 -0
  199. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_feature.py +0 -0
  200. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_feature_utils.py +0 -0
  201. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_file.py +0 -0
  202. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_image.py +0 -0
  203. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_schema.py +0 -0
  204. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_sql_to_python.py +0 -0
  205. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_text.py +0 -0
  206. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_udf_signature.py +0 -0
  207. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_utils.py +0 -0
  208. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/lib/test_webdataset.py +0 -0
  209. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/__init__.py +0 -0
  210. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/sqlite/__init__.py +0 -0
  211. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/sqlite/test_utils.py +0 -0
  212. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_array.py +0 -0
  213. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_conditional.py +0 -0
  214. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_path.py +0 -0
  215. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_random.py +0 -0
  216. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_selectable.py +0 -0
  217. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/sql/test_string.py +0 -0
  218. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_asyn.py +0 -0
  219. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_cache.py +0 -0
  220. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_catalog.py +0 -0
  221. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_catalog_loader.py +0 -0
  222. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_cli_parsing.py +0 -0
  223. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_client.py +0 -0
  224. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_client_s3.py +0 -0
  225. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_data_storage.py +0 -0
  226. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_database_engine.py +0 -0
  227. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_dataset.py +0 -0
  228. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_dispatch.py +0 -0
  229. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_fileslice.py +0 -0
  230. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_id_generator.py +0 -0
  231. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_listing.py +0 -0
  232. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_metastore.py +0 -0
  233. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_module_exports.py +0 -0
  234. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_query_metrics.py +0 -0
  235. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_query_params.py +0 -0
  236. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_serializer.py +0 -0
  237. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_session.py +0 -0
  238. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_storage.py +0 -0
  239. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_udf.py +0 -0
  240. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_utils.py +0 -0
  241. {datachain-0.3.4 → datachain-0.3.6}/tests/unit/test_warehouse.py +0 -0
  242. {datachain-0.3.4 → datachain-0.3.6}/tests/utils.py +0 -0
@@ -17,7 +17,7 @@ concurrency:
17
17
  jobs:
18
18
  studio:
19
19
  if: '!github.event.pull_request.head.repo.fork'
20
- runs-on: ubuntu-latest-16-cores
20
+ runs-on: ubuntu-latest
21
21
  strategy:
22
22
  matrix:
23
23
  pyv: ['3.12']
@@ -62,9 +62,9 @@ jobs:
62
62
  pyv: '3.9'
63
63
  - os: macos-latest
64
64
  pyv: '3.12'
65
- - os: windows-latest-8-cores
65
+ - os: windows-latest
66
66
  pyv: '3.9'
67
- - os: windows-latest-8-cores
67
+ - os: windows-latest
68
68
  pyv: '3.12'
69
69
 
70
70
  steps:
@@ -116,9 +116,17 @@ jobs:
116
116
  strategy:
117
117
  fail-fast: false
118
118
  matrix:
119
- os: [ubuntu-latest-16-cores, macos-latest, windows-latest-8-cores]
119
+ os: [ubuntu-latest, macos-latest, windows-latest]
120
120
  pyv: ['3.9', '3.12']
121
121
  group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
122
+ exclude:
123
+ - {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
124
+ - {os: ubuntu-latest, pyv: '3.12', group: 'multimodal'}
125
+ include:
126
+ - {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
127
+ - {os: ubuntu-latest-4-cores, pyv: "3.12", group: multimodal}
128
+
129
+
122
130
  steps:
123
131
  - uses: actions/checkout@v4
124
132
 
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.5.7'
27
+ rev: 'v0.6.1'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.4
3
+ Version: 0.3.6
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -41,6 +41,7 @@ Requires-Dist: pydantic<3,>=2
41
41
  Requires-Dist: jmespath>=1.0
42
42
  Requires-Dist: datamodel-code-generator>=0.25
43
43
  Requires-Dist: Pillow<11,>=10.0.0
44
+ Requires-Dist: msgpack<2,>=1.0.4
44
45
  Provides-Extra: docs
45
46
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
46
47
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -54,7 +55,6 @@ Requires-Dist: torchvision; extra == "torch"
54
55
  Requires-Dist: transformers>=4.36.0; extra == "torch"
55
56
  Provides-Extra: remote
56
57
  Requires-Dist: lz4; extra == "remote"
57
- Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
58
58
  Requires-Dist: requests>=2.22.0; extra == "remote"
59
59
  Provides-Extra: vector
60
60
  Requires-Dist: usearch; extra == "vector"
@@ -87,9 +87,8 @@ Requires-Dist: numpy<2,>=1; extra == "examples"
87
87
  Requires-Dist: defusedxml; extra == "examples"
88
88
  Requires-Dist: accelerate; extra == "examples"
89
89
  Requires-Dist: unstructured[pdf]; extra == "examples"
90
- Requires-Dist: pdfplumber==0.11.3; extra == "examples"
90
+ Requires-Dist: pdfplumber==0.11.4; extra == "examples"
91
91
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
92
- Requires-Dist: nltk==3.8.1; extra == "examples"
93
92
 
94
93
  |PyPI| |Python Version| |Codecov| |Tests|
95
94
 
@@ -81,6 +81,6 @@ if __name__ == "__main__":
81
81
  loss.backward()
82
82
  optimizer.step()
83
83
 
84
- print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, loss.item()))
84
+ print(f"[{epoch + 1}, {i + 1:5d}] loss: {loss.item():.3f}")
85
85
 
86
86
  print("Finished Training")
@@ -43,7 +43,8 @@ dependencies = [
43
43
  "pydantic>=2,<3",
44
44
  "jmespath>=1.0",
45
45
  "datamodel-code-generator>=0.25",
46
- "Pillow>=10.0.0,<11"
46
+ "Pillow>=10.0.0,<11",
47
+ "msgpack>=1.0.4,<2"
47
48
  ]
48
49
 
49
50
  [project.optional-dependencies]
@@ -62,7 +63,6 @@ torch = [
62
63
  ]
63
64
  remote = [
64
65
  "lz4",
65
- "msgpack>=1.0.4,<2",
66
66
  "requests>=2.22.0"
67
67
  ]
68
68
  vector = [
@@ -99,9 +99,8 @@ examples = [
99
99
  "defusedxml",
100
100
  "accelerate",
101
101
  "unstructured[pdf]",
102
- "pdfplumber==0.11.3",
103
- "huggingface_hub[hf_transfer]",
104
- "nltk==3.8.1"
102
+ "pdfplumber==0.11.4",
103
+ "huggingface_hub[hf_transfer]"
105
104
  ]
106
105
 
107
106
  [project.urls]
@@ -120,13 +120,25 @@ def noop(_: str):
120
120
 
121
121
  @contextmanager
122
122
  def print_and_capture(
123
- stream: "IO[str]", callback: Callable[[str], None] = noop
123
+ stream: "IO[bytes]|IO[str]", callback: Callable[[str], None] = noop
124
124
  ) -> "Iterator[list[str]]":
125
125
  lines: list[str] = []
126
126
  append = lines.append
127
127
 
128
128
  def loop() -> None:
129
- for line in iter(stream.readline, ""):
129
+ buffer = b""
130
+ while byt := stream.read(1): # Read one byte at a time
131
+ buffer += byt.encode("utf-8") if isinstance(byt, str) else byt
132
+
133
+ if byt in (b"\n", b"\r"): # Check for newline or carriage return
134
+ line = buffer.decode("utf-8")
135
+ print(line, end="")
136
+ callback(line)
137
+ append(line)
138
+ buffer = b"" # Clear buffer for next line
139
+
140
+ if buffer: # Handle any remaining data in the buffer
141
+ line = buffer.decode("utf-8")
130
142
  print(line, end="")
131
143
  callback(line)
132
144
  append(line)
@@ -2128,7 +2140,7 @@ class Catalog:
2128
2140
  stdout=subprocess.PIPE if capture_output else None,
2129
2141
  stderr=subprocess.STDOUT if capture_output else None,
2130
2142
  bufsize=1,
2131
- text=True,
2143
+ text=False,
2132
2144
  **kwargs,
2133
2145
  ) as proc:
2134
2146
  os.close(w)
@@ -209,6 +209,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
209
209
  return cursor.executemany(self.compile(query).string, params)
210
210
  return self.db.executemany(self.compile(query).string, params)
211
211
 
212
+ @retry_sqlite_locks
212
213
  def execute_str(self, sql: str, parameters=None) -> sqlite3.Cursor:
213
214
  if parameters is None:
214
215
  return self.db.execute(sql)
@@ -1,34 +1,6 @@
1
- from datetime import datetime
2
-
3
1
  from pydantic import BaseModel
4
2
 
5
3
  from datachain.lib.model_store import ModelStore
6
- from datachain.sql.types import (
7
- JSON,
8
- Array,
9
- Binary,
10
- Boolean,
11
- DateTime,
12
- Float,
13
- Int,
14
- Int32,
15
- Int64,
16
- NullType,
17
- String,
18
- )
19
-
20
- DATACHAIN_TO_TYPE = {
21
- Int: int,
22
- Int32: int,
23
- Int64: int,
24
- String: str,
25
- Float: float,
26
- Boolean: bool,
27
- DateTime: datetime,
28
- Binary: bytes,
29
- Array(NullType): list,
30
- JSON: dict,
31
- }
32
4
 
33
5
 
34
6
  def flatten(obj: BaseModel):
@@ -839,6 +839,10 @@ class DataChain(DatasetQuery):
839
839
  def mutate(self, **kwargs) -> "Self":
840
840
  """Create new signals based on existing signals.
841
841
 
842
+ This method cannot modify existing columns. If you need to modify an
843
+ existing column, use a different name for the new column and then use
844
+ `select()` to choose which columns to keep.
845
+
842
846
  This method is vectorized and more efficient compared to map(), and it does not
843
847
  extract or download any data from the internal database. However, it can only
844
848
  utilize predefined built-in functions and their combinations.
@@ -859,7 +863,26 @@ class DataChain(DatasetQuery):
859
863
  dist=cosine_distance(embedding_text, embedding_image)
860
864
  )
861
865
  ```
866
+
867
+ This method can be also used to rename signals. If the Column("name") provided
868
+ as value for the new signal - the old column will be dropped. Otherwise a new
869
+ column is created.
870
+
871
+ Example:
872
+ ```py
873
+ dc.mutate(
874
+ newkey=Column("oldkey")
875
+ )
876
+ ```
862
877
  """
878
+ existing_columns = set(self.signals_schema.values.keys())
879
+ for col_name in kwargs:
880
+ if col_name in existing_columns:
881
+ raise DataChainColumnError(
882
+ col_name,
883
+ "Cannot modify existing column with mutate(). "
884
+ "Use a different name for the new column.",
885
+ )
863
886
  for col_name, expr in kwargs.items():
864
887
  if not isinstance(expr, Column) and isinstance(expr.type, NullType):
865
888
  raise DataChainColumnError(
@@ -1224,14 +1247,11 @@ class DataChain(DatasetQuery):
1224
1247
  """
1225
1248
  headers, max_length = self._effective_signals_schema.get_headers_with_length()
1226
1249
  if flatten or max_length < 2:
1227
- columns = []
1228
- if headers:
1229
- columns = [".".join(filter(None, header)) for header in headers]
1230
- return pd.DataFrame.from_records(self.to_records(), columns=columns)
1250
+ columns = [".".join(filter(None, header)) for header in headers]
1251
+ else:
1252
+ columns = pd.MultiIndex.from_tuples(map(tuple, headers))
1231
1253
 
1232
- return pd.DataFrame(
1233
- self.results(), columns=pd.MultiIndex.from_tuples(map(tuple, headers))
1234
- )
1254
+ return pd.DataFrame.from_records(self.results(), columns=columns)
1235
1255
 
1236
1256
  def show(
1237
1257
  self,
@@ -1524,6 +1544,7 @@ class DataChain(DatasetQuery):
1524
1544
  to_insert: Optional[Union[dict, list[dict]]],
1525
1545
  session: Optional[Session] = None,
1526
1546
  in_memory: bool = False,
1547
+ schema: Optional[dict[str, DataType]] = None,
1527
1548
  ) -> "DataChain":
1528
1549
  """Create a DataChain from the provided records. This method can be used for
1529
1550
  programmatically generating a chain in contrast of reading data from storages
@@ -1532,10 +1553,10 @@ class DataChain(DatasetQuery):
1532
1553
  Parameters:
1533
1554
  to_insert : records (or a single record) to insert. Each record is
1534
1555
  a dictionary of signals and theirs values.
1556
+ schema : describes chain signals and their corresponding types
1535
1557
 
1536
1558
  Example:
1537
1559
  ```py
1538
- empty = DataChain.from_records()
1539
1560
  single_record = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD)
1540
1561
  ```
1541
1562
  """
@@ -1543,11 +1564,27 @@ class DataChain(DatasetQuery):
1543
1564
  catalog = session.catalog
1544
1565
 
1545
1566
  name = session.generate_temp_dataset_name()
1546
- columns: tuple[sqlalchemy.Column[Any], ...] = tuple(
1547
- sqlalchemy.Column(name, typ)
1548
- for name, typ in File._datachain_column_types.items()
1567
+ signal_schema = None
1568
+ columns: list[sqlalchemy.Column] = []
1569
+
1570
+ if schema:
1571
+ signal_schema = SignalSchema(schema)
1572
+ columns = signal_schema.db_signals(as_columns=True) # type: ignore[assignment]
1573
+ else:
1574
+ columns = [
1575
+ sqlalchemy.Column(name, typ)
1576
+ for name, typ in File._datachain_column_types.items()
1577
+ ]
1578
+
1579
+ dsr = catalog.create_dataset(
1580
+ name,
1581
+ columns=columns,
1582
+ feature_schema=(
1583
+ signal_schema.clone_without_sys_signals().serialize()
1584
+ if signal_schema
1585
+ else None
1586
+ ),
1549
1587
  )
1550
- dsr = catalog.create_dataset(name, columns=columns)
1551
1588
 
1552
1589
  if isinstance(to_insert, dict):
1553
1590
  to_insert = [to_insert]
@@ -2,6 +2,7 @@ import copy
2
2
  from collections.abc import Iterator, Sequence
3
3
  from dataclasses import dataclass
4
4
  from datetime import datetime
5
+ from inspect import isclass
5
6
  from typing import (
6
7
  TYPE_CHECKING,
7
8
  Annotated,
@@ -14,10 +15,10 @@ from typing import (
14
15
  get_origin,
15
16
  )
16
17
 
18
+ import sqlalchemy as sa
17
19
  from pydantic import BaseModel, create_model
18
20
  from typing_extensions import Literal as LiteralEx
19
21
 
20
- from datachain.lib.convert.flatten import DATACHAIN_TO_TYPE
21
22
  from datachain.lib.convert.python_to_sql import python_to_sql
22
23
  from datachain.lib.convert.sql_to_python import sql_to_python
23
24
  from datachain.lib.convert.unflatten import unflatten_to_json_pos
@@ -26,6 +27,7 @@ from datachain.lib.file import File
26
27
  from datachain.lib.model_store import ModelStore
27
28
  from datachain.lib.utils import DataChainParamsError
28
29
  from datachain.query.schema import DEFAULT_DELIMITER, Column
30
+ from datachain.sql.types import SQLType
29
31
 
30
32
  if TYPE_CHECKING:
31
33
  from datachain.catalog import Catalog
@@ -104,12 +106,15 @@ class SignalSchema:
104
106
  def from_column_types(col_types: dict[str, Any]) -> "SignalSchema":
105
107
  signals: dict[str, DataType] = {}
106
108
  for field, col_type in col_types.items():
107
- if (py_type := DATACHAIN_TO_TYPE.get(col_type, None)) is None:
109
+ if isinstance(col_type, SQLType):
110
+ signals[field] = col_type.python_type
111
+ elif isclass(col_type) and issubclass(col_type, SQLType):
112
+ signals[field] = col_type().python_type
113
+ else:
108
114
  raise SignalSchemaError(
109
115
  f"signal schema cannot be obtained for column '{field}':"
110
- f" unsupported type '{py_type}'"
116
+ f" unsupported type '{col_type}'"
111
117
  )
112
- signals[field] = py_type
113
118
  return SignalSchema(signals)
114
119
 
115
120
  def serialize(self) -> dict[str, str]:
@@ -232,7 +237,7 @@ class SignalSchema:
232
237
  signals = [
233
238
  DEFAULT_DELIMITER.join(path)
234
239
  if not as_columns
235
- else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
240
+ else sa.Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
236
241
  for path, _type, has_subtree, _ in self.get_flat_tree()
237
242
  if not has_subtree
238
243
  ]
@@ -878,17 +878,14 @@ class SQLUnion(Step):
878
878
  temp_tables.extend(self.query1.temp_table_names)
879
879
  q2 = self.query2.apply_steps().select().subquery()
880
880
  temp_tables.extend(self.query2.temp_table_names)
881
- columns1, columns2 = fill_columns(q1.columns, q2.columns)
881
+
882
+ columns1, columns2 = _order_columns(q1.columns, q2.columns)
882
883
 
883
884
  def q(*columns):
884
885
  names = {c.name for c in columns}
885
886
  col1 = [c for c in columns1 if c.name in names]
886
887
  col2 = [c for c in columns2 if c.name in names]
887
- res = (
888
- sqlalchemy.select(*col1)
889
- .select_from(q1)
890
- .union_all(sqlalchemy.select(*col2).select_from(q2))
891
- )
888
+ res = sqlalchemy.select(*col1).union_all(sqlalchemy.select(*col2))
892
889
 
893
890
  subquery = res.subquery()
894
891
  return sqlalchemy.select(*subquery.c).select_from(subquery)
@@ -1021,23 +1018,46 @@ class GroupBy(Step):
1021
1018
  return step_result(q, grouped_query.selected_columns)
1022
1019
 
1023
1020
 
1024
- def fill_columns(
1025
- *column_iterables: Iterable[ColumnElement],
1021
+ def _validate_columns(
1022
+ left_columns: Iterable[ColumnElement], right_columns: Iterable[ColumnElement]
1023
+ ) -> set[str]:
1024
+ left_names = {c.name for c in left_columns}
1025
+ right_names = {c.name for c in right_columns}
1026
+
1027
+ if left_names == right_names:
1028
+ return left_names
1029
+
1030
+ missing_right = left_names - right_names
1031
+ missing_left = right_names - left_names
1032
+
1033
+ def _prepare_msg_part(missing_columns: set[str], side: str) -> str:
1034
+ return f"{', '.join(sorted(missing_columns))} only present in {side}"
1035
+
1036
+ msg_parts = [
1037
+ _prepare_msg_part(missing_columns, found_side)
1038
+ for missing_columns, found_side in zip(
1039
+ [
1040
+ missing_right,
1041
+ missing_left,
1042
+ ],
1043
+ ["left", "right"],
1044
+ )
1045
+ if missing_columns
1046
+ ]
1047
+ msg = f"Cannot perform union. {'. '.join(msg_parts)}"
1048
+
1049
+ raise ValueError(msg)
1050
+
1051
+
1052
+ def _order_columns(
1053
+ left_columns: Iterable[ColumnElement], right_columns: Iterable[ColumnElement]
1026
1054
  ) -> list[list[ColumnElement]]:
1027
- column_dicts = [{c.name: c for c in columns} for columns in column_iterables]
1028
- combined_columns = {n: c for col_dict in column_dicts for n, c in col_dict.items()}
1029
-
1030
- result: list[list[ColumnElement]] = [[] for _ in column_dicts]
1031
- for n in combined_columns:
1032
- col = next(col_dict[n] for col_dict in column_dicts if n in col_dict)
1033
- for col_dict, out in zip(column_dicts, result):
1034
- if n in col_dict:
1035
- out.append(col_dict[n])
1036
- else:
1037
- # Cast the NULL to ensure all columns are aware of their type
1038
- # Label it to ensure it's aware of its name
1039
- out.append(sqlalchemy.cast(sqlalchemy.null(), col.type).label(n))
1040
- return result
1055
+ column_order = _validate_columns(left_columns, right_columns)
1056
+ column_dicts = [
1057
+ {c.name: c for c in columns} for columns in [left_columns, right_columns]
1058
+ ]
1059
+
1060
+ return [[d[n] for n in column_order] for d in column_dicts]
1041
1061
 
1042
1062
 
1043
1063
  @attrs.define
@@ -20,6 +20,8 @@ from typing import Any, Union
20
20
  import sqlalchemy as sa
21
21
  from sqlalchemy import TypeDecorator, types
22
22
 
23
+ from datachain.lib.data_model import StandardType
24
+
23
25
  _registry: dict[str, "TypeConverter"] = {}
24
26
  registry = MappingProxyType(_registry)
25
27
 
@@ -91,6 +93,10 @@ class SQLType(TypeDecorator):
91
93
  impl: type[types.TypeEngine[Any]] = types.TypeEngine
92
94
  cache_ok = True
93
95
 
96
+ @property
97
+ def python_type(self) -> StandardType:
98
+ raise NotImplementedError
99
+
94
100
  def to_dict(self) -> dict[str, Any]:
95
101
  return {"type": self.__class__.__name__}
96
102
 
@@ -103,7 +109,7 @@ class String(SQLType):
103
109
  impl = types.String
104
110
 
105
111
  @property
106
- def python_type(self):
112
+ def python_type(self) -> StandardType:
107
113
  return str
108
114
 
109
115
  def load_dialect_impl(self, dialect):
@@ -125,7 +131,7 @@ class Boolean(SQLType):
125
131
  impl = types.Boolean
126
132
 
127
133
  @property
128
- def python_type(self):
134
+ def python_type(self) -> StandardType:
129
135
  return bool
130
136
 
131
137
  def load_dialect_impl(self, dialect):
@@ -147,7 +153,7 @@ class Int(SQLType):
147
153
  impl = types.INTEGER
148
154
 
149
155
  @property
150
- def python_type(self):
156
+ def python_type(self) -> StandardType:
151
157
  return int
152
158
 
153
159
  def load_dialect_impl(self, dialect):
@@ -217,7 +223,7 @@ class Float(SQLType):
217
223
  impl = types.FLOAT
218
224
 
219
225
  @property
220
- def python_type(self):
226
+ def python_type(self) -> StandardType:
221
227
  return float
222
228
 
223
229
  def load_dialect_impl(self, dialect):
@@ -271,7 +277,7 @@ class Array(SQLType):
271
277
  impl = types.ARRAY
272
278
 
273
279
  @property
274
- def python_type(self):
280
+ def python_type(self) -> StandardType:
275
281
  return list
276
282
 
277
283
  def load_dialect_impl(self, dialect):
@@ -314,7 +320,7 @@ class JSON(SQLType):
314
320
  impl = types.JSON
315
321
 
316
322
  @property
317
- def python_type(self):
323
+ def python_type(self) -> StandardType:
318
324
  return dict
319
325
 
320
326
  def load_dialect_impl(self, dialect):
@@ -336,7 +342,7 @@ class DateTime(SQLType):
336
342
  impl = types.DATETIME
337
343
 
338
344
  @property
339
- def python_type(self):
345
+ def python_type(self) -> StandardType:
340
346
  return datetime
341
347
 
342
348
  def load_dialect_impl(self, dialect):
@@ -358,7 +364,7 @@ class Binary(SQLType):
358
364
  impl = types.BINARY
359
365
 
360
366
  @property
361
- def python_type(self):
367
+ def python_type(self) -> StandardType:
362
368
  return bytes
363
369
 
364
370
  def load_dialect_impl(self, dialect):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.4
3
+ Version: 0.3.6
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -41,6 +41,7 @@ Requires-Dist: pydantic<3,>=2
41
41
  Requires-Dist: jmespath>=1.0
42
42
  Requires-Dist: datamodel-code-generator>=0.25
43
43
  Requires-Dist: Pillow<11,>=10.0.0
44
+ Requires-Dist: msgpack<2,>=1.0.4
44
45
  Provides-Extra: docs
45
46
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
46
47
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -54,7 +55,6 @@ Requires-Dist: torchvision; extra == "torch"
54
55
  Requires-Dist: transformers>=4.36.0; extra == "torch"
55
56
  Provides-Extra: remote
56
57
  Requires-Dist: lz4; extra == "remote"
57
- Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
58
58
  Requires-Dist: requests>=2.22.0; extra == "remote"
59
59
  Provides-Extra: vector
60
60
  Requires-Dist: usearch; extra == "vector"
@@ -87,9 +87,8 @@ Requires-Dist: numpy<2,>=1; extra == "examples"
87
87
  Requires-Dist: defusedxml; extra == "examples"
88
88
  Requires-Dist: accelerate; extra == "examples"
89
89
  Requires-Dist: unstructured[pdf]; extra == "examples"
90
- Requires-Dist: pdfplumber==0.11.3; extra == "examples"
90
+ Requires-Dist: pdfplumber==0.11.4; extra == "examples"
91
91
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
92
- Requires-Dist: nltk==3.8.1; extra == "examples"
93
92
 
94
93
  |PyPI| |Python Version| |Codecov| |Tests|
95
94
 
@@ -22,6 +22,7 @@ pydantic<3,>=2
22
22
  jmespath>=1.0
23
23
  datamodel-code-generator>=0.25
24
24
  Pillow<11,>=10.0.0
25
+ msgpack<2,>=1.0.4
25
26
 
26
27
  [:sys_platform == "win32"]
27
28
  numpy<2,>=1
@@ -48,13 +49,11 @@ numpy<2,>=1
48
49
  defusedxml
49
50
  accelerate
50
51
  unstructured[pdf]
51
- pdfplumber==0.11.3
52
+ pdfplumber==0.11.4
52
53
  huggingface_hub[hf_transfer]
53
- nltk==3.8.1
54
54
 
55
55
  [remote]
56
56
  lz4
57
- msgpack<2,>=1.0.4
58
57
  requests>=2.22.0
59
58
 
60
59
  [tests]
@@ -8,10 +8,11 @@ import pandas as pd
8
8
  import pytest
9
9
  import pytz
10
10
  from PIL import Image
11
+ from sqlalchemy import Column
11
12
 
12
13
  from datachain.data_storage.sqlite import SQLiteWarehouse
13
14
  from datachain.dataset import DatasetStats
14
- from datachain.lib.dc import DataChain
15
+ from datachain.lib.dc import DataChain, DataChainColumnError
15
16
  from datachain.lib.file import File, ImageFile
16
17
  from tests.utils import images_equal
17
18
 
@@ -314,3 +315,16 @@ def test_from_storage_check_rows(tmp_dir, test_session):
314
315
  location=None,
315
316
  vtype="",
316
317
  )
318
+
319
+
320
+ def test_mutate_existing_column(catalog):
321
+ ds = DataChain.from_values(ids=[1, 2, 3])
322
+
323
+ with pytest.raises(DataChainColumnError) as excinfo:
324
+ ds.mutate(ids=Column("ids") + 1)
325
+
326
+ assert (
327
+ str(excinfo.value)
328
+ == "Error for column ids: Cannot modify existing column with mutate()."
329
+ " Use a different name for the new column."
330
+ )