datachain 0.12.0__tar.gz → 0.13.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (341)
  1. {datachain-0.12.0 → datachain-0.13.0}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.12.0 → datachain-0.13.0}/PKG-INFO +1 -1
  3. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/catalog/catalog.py +6 -2
  4. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/ls.py +8 -6
  5. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/show.py +7 -0
  6. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/gcs.py +1 -1
  7. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/s3.py +1 -1
  8. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/metastore.py +6 -0
  9. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/dc.py +36 -7
  10. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/file.py +8 -1
  11. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/meta_formats.py +2 -2
  12. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/signal_schema.py +65 -18
  13. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/udf.py +3 -0
  14. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/udf_signature.py +17 -9
  15. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/dataset.py +4 -0
  16. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/sqlite/base.py +2 -2
  17. {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/PKG-INFO +1 -1
  18. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_datachain.py +16 -1
  19. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_datachain.py +49 -0
  20. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_datachain_bootstrap.py +2 -2
  21. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_signal_schema.py +209 -26
  22. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_udf_signature.py +17 -7
  23. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_array.py +7 -2
  24. {datachain-0.12.0 → datachain-0.13.0}/.cruft.json +0 -0
  25. {datachain-0.12.0 → datachain-0.13.0}/.gitattributes +0 -0
  26. {datachain-0.12.0 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  27. {datachain-0.12.0 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  28. {datachain-0.12.0 → datachain-0.13.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  29. {datachain-0.12.0 → datachain-0.13.0}/.github/codecov.yaml +0 -0
  30. {datachain-0.12.0 → datachain-0.13.0}/.github/dependabot.yml +0 -0
  31. {datachain-0.12.0 → datachain-0.13.0}/.github/workflows/benchmarks.yml +0 -0
  32. {datachain-0.12.0 → datachain-0.13.0}/.github/workflows/release.yml +0 -0
  33. {datachain-0.12.0 → datachain-0.13.0}/.github/workflows/tests-studio.yml +0 -0
  34. {datachain-0.12.0 → datachain-0.13.0}/.github/workflows/tests.yml +0 -0
  35. {datachain-0.12.0 → datachain-0.13.0}/.github/workflows/update-template.yaml +0 -0
  36. {datachain-0.12.0 → datachain-0.13.0}/.gitignore +0 -0
  37. {datachain-0.12.0 → datachain-0.13.0}/CODE_OF_CONDUCT.rst +0 -0
  38. {datachain-0.12.0 → datachain-0.13.0}/LICENSE +0 -0
  39. {datachain-0.12.0 → datachain-0.13.0}/README.rst +0 -0
  40. {datachain-0.12.0 → datachain-0.13.0}/docs/assets/captioned_cartoons.png +0 -0
  41. {datachain-0.12.0 → datachain-0.13.0}/docs/assets/datachain-white.svg +0 -0
  42. {datachain-0.12.0 → datachain-0.13.0}/docs/assets/datachain.svg +0 -0
  43. {datachain-0.12.0 → datachain-0.13.0}/docs/contributing.md +0 -0
  44. {datachain-0.12.0 → datachain-0.13.0}/docs/css/github-permalink-style.css +0 -0
  45. {datachain-0.12.0 → datachain-0.13.0}/docs/examples.md +0 -0
  46. {datachain-0.12.0 → datachain-0.13.0}/docs/index.md +0 -0
  47. {datachain-0.12.0 → datachain-0.13.0}/docs/overrides/main.html +0 -0
  48. {datachain-0.12.0 → datachain-0.13.0}/docs/quick-start.md +0 -0
  49. {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/arrowrow.md +0 -0
  50. {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/bbox.md +0 -0
  51. {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/file.md +0 -0
  52. {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/imagefile.md +0 -0
  53. {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/index.md +0 -0
  54. {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/pose.md +0 -0
  55. {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/segment.md +0 -0
  56. {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/tarvfile.md +0 -0
  57. {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/textfile.md +0 -0
  58. {datachain-0.12.0 → datachain-0.13.0}/docs/references/data-types/videofile.md +0 -0
  59. {datachain-0.12.0 → datachain-0.13.0}/docs/references/datachain.md +0 -0
  60. {datachain-0.12.0 → datachain-0.13.0}/docs/references/func.md +0 -0
  61. {datachain-0.12.0 → datachain-0.13.0}/docs/references/index.md +0 -0
  62. {datachain-0.12.0 → datachain-0.13.0}/docs/references/remotes.md +0 -0
  63. {datachain-0.12.0 → datachain-0.13.0}/docs/references/toolkit.md +0 -0
  64. {datachain-0.12.0 → datachain-0.13.0}/docs/references/torch.md +0 -0
  65. {datachain-0.12.0 → datachain-0.13.0}/docs/references/udf.md +0 -0
  66. {datachain-0.12.0 → datachain-0.13.0}/docs/tutorials.md +0 -0
  67. {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  68. {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  69. {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/openimage-detect.py +0 -0
  70. {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  71. {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  72. {datachain-0.12.0 → datachain-0.13.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  73. {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/common_sql_functions.py +0 -0
  74. {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/json-csv-reader.py +0 -0
  75. {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/torch-loader.py +0 -0
  76. {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/udfs/parallel.py +0 -0
  77. {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/udfs/simple.py +0 -0
  78. {datachain-0.12.0 → datachain-0.13.0}/examples/get_started/udfs/stateful.py +0 -0
  79. {datachain-0.12.0 → datachain-0.13.0}/examples/llm_and_nlp/claude-query.py +0 -0
  80. {datachain-0.12.0 → datachain-0.13.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  81. {datachain-0.12.0 → datachain-0.13.0}/examples/multimodal/clip_inference.py +0 -0
  82. {datachain-0.12.0 → datachain-0.13.0}/examples/multimodal/hf_pipeline.py +0 -0
  83. {datachain-0.12.0 → datachain-0.13.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  84. {datachain-0.12.0 → datachain-0.13.0}/examples/multimodal/wds.py +0 -0
  85. {datachain-0.12.0 → datachain-0.13.0}/examples/multimodal/wds_filtered.py +0 -0
  86. {datachain-0.12.0 → datachain-0.13.0}/mkdocs.yml +0 -0
  87. {datachain-0.12.0 → datachain-0.13.0}/noxfile.py +0 -0
  88. {datachain-0.12.0 → datachain-0.13.0}/pyproject.toml +0 -0
  89. {datachain-0.12.0 → datachain-0.13.0}/setup.cfg +0 -0
  90. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/__init__.py +0 -0
  91. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/__main__.py +0 -0
  92. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/asyn.py +0 -0
  93. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cache.py +0 -0
  94. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/catalog/__init__.py +0 -0
  95. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/catalog/datasource.py +0 -0
  96. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/catalog/loader.py +0 -0
  97. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/__init__.py +0 -0
  98. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/__init__.py +0 -0
  99. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/datasets.py +0 -0
  100. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/du.py +0 -0
  101. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/index.py +0 -0
  102. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/misc.py +0 -0
  103. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/commands/query.py +0 -0
  104. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/parser/__init__.py +0 -0
  105. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/parser/job.py +0 -0
  106. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/parser/studio.py +0 -0
  107. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/parser/utils.py +0 -0
  108. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/cli/utils.py +0 -0
  109. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/__init__.py +0 -0
  110. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/azure.py +0 -0
  111. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/fileslice.py +0 -0
  112. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/fsspec.py +0 -0
  113. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/hf.py +0 -0
  114. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/client/local.py +0 -0
  115. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/config.py +0 -0
  116. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/__init__.py +0 -0
  117. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/db_engine.py +0 -0
  118. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/job.py +0 -0
  119. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/schema.py +0 -0
  120. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/serializer.py +0 -0
  121. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/sqlite.py +0 -0
  122. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/data_storage/warehouse.py +0 -0
  123. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/dataset.py +0 -0
  124. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/diff/__init__.py +0 -0
  125. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/error.py +0 -0
  126. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/fs/__init__.py +0 -0
  127. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/fs/reference.py +0 -0
  128. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/fs/utils.py +0 -0
  129. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/__init__.py +0 -0
  130. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/aggregate.py +0 -0
  131. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/array.py +0 -0
  132. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/base.py +0 -0
  133. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/conditional.py +0 -0
  134. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/func.py +0 -0
  135. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/numeric.py +0 -0
  136. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/path.py +0 -0
  137. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/random.py +0 -0
  138. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/string.py +0 -0
  139. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/func/window.py +0 -0
  140. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/job.py +0 -0
  141. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/__init__.py +0 -0
  142. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/arrow.py +0 -0
  143. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/clip.py +0 -0
  144. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/__init__.py +0 -0
  145. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/flatten.py +0 -0
  146. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  147. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  148. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/unflatten.py +0 -0
  149. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  150. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/data_model.py +0 -0
  151. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/dataset_info.py +0 -0
  152. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/hf.py +0 -0
  153. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/image.py +0 -0
  154. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/listing.py +0 -0
  155. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/listing_info.py +0 -0
  156. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/model_store.py +0 -0
  157. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/pytorch.py +0 -0
  158. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/settings.py +0 -0
  159. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/tar.py +0 -0
  160. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/text.py +0 -0
  161. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/utils.py +0 -0
  162. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/video.py +0 -0
  163. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/webdataset.py +0 -0
  164. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/lib/webdataset_laion.py +0 -0
  165. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/listing.py +0 -0
  166. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/__init__.py +0 -0
  167. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/bbox.py +0 -0
  168. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/pose.py +0 -0
  169. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/segment.py +0 -0
  170. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  171. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  172. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/ultralytics/pose.py +0 -0
  173. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/ultralytics/segment.py +0 -0
  174. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/model/utils.py +0 -0
  175. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/node.py +0 -0
  176. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/nodes_fetcher.py +0 -0
  177. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/nodes_thread_pool.py +0 -0
  178. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/progress.py +0 -0
  179. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/py.typed +0 -0
  180. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/__init__.py +0 -0
  181. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/batch.py +0 -0
  182. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/dispatch.py +0 -0
  183. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/metrics.py +0 -0
  184. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/params.py +0 -0
  185. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/queue.py +0 -0
  186. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/schema.py +0 -0
  187. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/session.py +0 -0
  188. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/udf.py +0 -0
  189. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/query/utils.py +0 -0
  190. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/remote/__init__.py +0 -0
  191. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/remote/studio.py +0 -0
  192. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/script_meta.py +0 -0
  193. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/__init__.py +0 -0
  194. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/default/__init__.py +0 -0
  195. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/default/base.py +0 -0
  196. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/__init__.py +0 -0
  197. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/aggregate.py +0 -0
  198. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/array.py +0 -0
  199. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/conditional.py +0 -0
  200. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/numeric.py +0 -0
  201. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/path.py +0 -0
  202. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/random.py +0 -0
  203. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/functions/string.py +0 -0
  204. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/selectable.py +0 -0
  205. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  206. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/sqlite/types.py +0 -0
  207. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/sqlite/vector.py +0 -0
  208. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/types.py +0 -0
  209. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/sql/utils.py +0 -0
  210. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/studio.py +0 -0
  211. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/telemetry.py +0 -0
  212. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/toolkit/__init__.py +0 -0
  213. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/toolkit/split.py +0 -0
  214. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/torch/__init__.py +0 -0
  215. {datachain-0.12.0 → datachain-0.13.0}/src/datachain/utils.py +0 -0
  216. {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/SOURCES.txt +0 -0
  217. {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  218. {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/entry_points.txt +0 -0
  219. {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/requires.txt +0 -0
  220. {datachain-0.12.0 → datachain-0.13.0}/src/datachain.egg-info/top_level.txt +0 -0
  221. {datachain-0.12.0 → datachain-0.13.0}/tests/__init__.py +0 -0
  222. {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/__init__.py +0 -0
  223. {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/conftest.py +0 -0
  224. {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  225. {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  226. {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/datasets/.gitignore +0 -0
  227. {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  228. {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/test_datachain.py +0 -0
  229. {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/test_ls.py +0 -0
  230. {datachain-0.12.0 → datachain-0.13.0}/tests/benchmarks/test_version.py +0 -0
  231. {datachain-0.12.0 → datachain-0.13.0}/tests/conftest.py +0 -0
  232. {datachain-0.12.0 → datachain-0.13.0}/tests/data.py +0 -0
  233. {datachain-0.12.0 → datachain-0.13.0}/tests/examples/__init__.py +0 -0
  234. {datachain-0.12.0 → datachain-0.13.0}/tests/examples/test_examples.py +0 -0
  235. {datachain-0.12.0 → datachain-0.13.0}/tests/examples/test_wds_e2e.py +0 -0
  236. {datachain-0.12.0 → datachain-0.13.0}/tests/examples/wds_data.py +0 -0
  237. {datachain-0.12.0 → datachain-0.13.0}/tests/func/__init__.py +0 -0
  238. {datachain-0.12.0 → datachain-0.13.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  239. {datachain-0.12.0 → datachain-0.13.0}/tests/func/data/lena.jpg +0 -0
  240. {datachain-0.12.0 → datachain-0.13.0}/tests/func/fake-service-account-credentials.json +0 -0
  241. {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/__init__.py +0 -0
  242. {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/data/running-mask0.png +0 -0
  243. {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/data/running-mask1.png +0 -0
  244. {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/data/running.jpg +0 -0
  245. {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/data/ships.jpg +0 -0
  246. {datachain-0.12.0 → datachain-0.13.0}/tests/func/model/test_yolo.py +0 -0
  247. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_catalog.py +0 -0
  248. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_client.py +0 -0
  249. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_cloud_transfer.py +0 -0
  250. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_data_storage.py +0 -0
  251. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_datachain_merge.py +0 -0
  252. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_dataset_query.py +0 -0
  253. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_datasets.py +0 -0
  254. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_feature_pickling.py +0 -0
  255. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_file.py +0 -0
  256. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_hf.py +0 -0
  257. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_hidden_field.py +0 -0
  258. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_image.py +0 -0
  259. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_listing.py +0 -0
  260. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_ls.py +0 -0
  261. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_meta_formats.py +0 -0
  262. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_metrics.py +0 -0
  263. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_pull.py +0 -0
  264. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_pytorch.py +0 -0
  265. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_query.py +0 -0
  266. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_session.py +0 -0
  267. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_toolkit.py +0 -0
  268. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_video.py +0 -0
  269. {datachain-0.12.0 → datachain-0.13.0}/tests/func/test_warehouse.py +0 -0
  270. {datachain-0.12.0 → datachain-0.13.0}/tests/scripts/feature_class.py +0 -0
  271. {datachain-0.12.0 → datachain-0.13.0}/tests/scripts/feature_class_exception.py +0 -0
  272. {datachain-0.12.0 → datachain-0.13.0}/tests/scripts/feature_class_parallel.py +0 -0
  273. {datachain-0.12.0 → datachain-0.13.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  274. {datachain-0.12.0 → datachain-0.13.0}/tests/scripts/name_len_slow.py +0 -0
  275. {datachain-0.12.0 → datachain-0.13.0}/tests/test_atomicity.py +0 -0
  276. {datachain-0.12.0 → datachain-0.13.0}/tests/test_cli_e2e.py +0 -0
  277. {datachain-0.12.0 → datachain-0.13.0}/tests/test_cli_studio.py +0 -0
  278. {datachain-0.12.0 → datachain-0.13.0}/tests/test_import_time.py +0 -0
  279. {datachain-0.12.0 → datachain-0.13.0}/tests/test_query_e2e.py +0 -0
  280. {datachain-0.12.0 → datachain-0.13.0}/tests/test_telemetry.py +0 -0
  281. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/__init__.py +0 -0
  282. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/__init__.py +0 -0
  283. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/conftest.py +0 -0
  284. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_arrow.py +0 -0
  285. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_clip.py +0 -0
  286. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  287. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_diff.py +0 -0
  288. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_feature.py +0 -0
  289. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_feature_utils.py +0 -0
  290. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_file.py +0 -0
  291. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_hf.py +0 -0
  292. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_image.py +0 -0
  293. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_listing_info.py +0 -0
  294. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  295. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_schema.py +0 -0
  296. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  297. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_text.py +0 -0
  298. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_utils.py +0 -0
  299. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/lib/test_webdataset.py +0 -0
  300. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/model/__init__.py +0 -0
  301. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/model/test_bbox.py +0 -0
  302. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/model/test_pose.py +0 -0
  303. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/model/test_segment.py +0 -0
  304. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/model/test_utils.py +0 -0
  305. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/__init__.py +0 -0
  306. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  307. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  308. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  309. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_conditional.py +0 -0
  310. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_path.py +0 -0
  311. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_random.py +0 -0
  312. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_selectable.py +0 -0
  313. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/sql/test_string.py +0 -0
  314. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_asyn.py +0 -0
  315. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_cache.py +0 -0
  316. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_catalog.py +0 -0
  317. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_catalog_loader.py +0 -0
  318. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_cli_parsing.py +0 -0
  319. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_client.py +0 -0
  320. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_client_gcs.py +0 -0
  321. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_client_s3.py +0 -0
  322. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_config.py +0 -0
  323. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_data_storage.py +0 -0
  324. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_database_engine.py +0 -0
  325. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_dataset.py +0 -0
  326. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_dispatch.py +0 -0
  327. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_fileslice.py +0 -0
  328. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_func.py +0 -0
  329. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_listing.py +0 -0
  330. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_metastore.py +0 -0
  331. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_module_exports.py +0 -0
  332. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_pytorch.py +0 -0
  333. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_query.py +0 -0
  334. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_query_metrics.py +0 -0
  335. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_query_params.py +0 -0
  336. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_script_meta.py +0 -0
  337. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_serializer.py +0 -0
  338. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_session.py +0 -0
  339. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_utils.py +0 -0
  340. {datachain-0.12.0 → datachain-0.13.0}/tests/unit/test_warehouse.py +0 -0
  341. {datachain-0.12.0 → datachain-0.13.0}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.9.10'
27
+ rev: 'v0.11.0'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.12.0
3
+ Version: 0.13.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -777,6 +777,8 @@ class Catalog:
777
777
  validate_version: Optional[bool] = True,
778
778
  listing: Optional[bool] = False,
779
779
  uuid: Optional[str] = None,
780
+ description: Optional[str] = None,
781
+ labels: Optional[list[str]] = None,
780
782
  ) -> "DatasetRecord":
781
783
  """
782
784
  Creates new dataset of a specific version.
@@ -803,6 +805,8 @@ class Catalog:
803
805
  query_script=query_script,
804
806
  schema=schema,
805
807
  ignore_if_exists=True,
808
+ description=description,
809
+ labels=labels,
806
810
  )
807
811
 
808
812
  version = version or default_version
@@ -1608,7 +1612,7 @@ class Catalog:
1608
1612
  except TerminationSignal as exc:
1609
1613
  signal.signal(signal.SIGTERM, orig_sigterm_handler)
1610
1614
  signal.signal(signal.SIGINT, orig_sigint_handler)
1611
- logging.info("Shutting down process %s, received %r", proc.pid, exc)
1615
+ logger.info("Shutting down process %s, received %r", proc.pid, exc)
1612
1616
  # Rather than forwarding the signal to the child, we try to shut it down
1613
1617
  # gracefully. This is because we consider the script to be interactive
1614
1618
  # and special, so we give it time to cleanup before exiting.
@@ -1623,7 +1627,7 @@ class Catalog:
1623
1627
  if thread:
1624
1628
  thread.join() # wait for the reader thread
1625
1629
 
1626
- logging.info("Process %s exited with return code %s", proc.pid, proc.returncode)
1630
+ logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
1627
1631
  if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
1628
1632
  raise QueryScriptCancelError(
1629
1633
  "Query script was canceled by user",
@@ -38,11 +38,12 @@ def ls_local(
38
38
  ):
39
39
  from datachain import DataChain
40
40
 
41
- if catalog is None:
42
- from datachain.catalog import get_catalog
43
-
44
- catalog = get_catalog(client_config=client_config)
45
41
  if sources:
42
+ if catalog is None:
43
+ from datachain.catalog import get_catalog
44
+
45
+ catalog = get_catalog(client_config=client_config)
46
+
46
47
  actual_sources = list(ls_urls(sources, catalog=catalog, long=long, **kwargs))
47
48
  if len(actual_sources) == 1:
48
49
  for _, entries in actual_sources:
@@ -61,8 +62,9 @@ def ls_local(
61
62
  for entry in entries:
62
63
  print(format_ls_entry(entry))
63
64
  else:
64
- chain = DataChain.listings()
65
- for ls in chain.collect("listing"):
65
+ # Collect results in a list here to prevent interference from `tqdm` and `print`
66
+ listing = list(DataChain.listings().collect("listing"))
67
+ for ls in listing:
66
68
  print(format_ls_entry(f"{ls.uri}@v{ls.version}")) # type: ignore[union-attr]
67
69
 
68
70
 
@@ -40,6 +40,13 @@ def show(
40
40
  .offset(offset)
41
41
  )
42
42
  records = query.to_db_records()
43
+ print("Name: ", name)
44
+ if dataset.description:
45
+ print("Description: ", dataset.description)
46
+ if dataset.labels:
47
+ print("Labels: ", ",".join(dataset.labels))
48
+ print("\n")
49
+
43
50
  show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
44
51
 
45
52
  if schema and dataset_version.feature_schema:
@@ -30,7 +30,7 @@ class GCSClient(Client):
30
30
  if kwargs.pop("anon", False):
31
31
  kwargs["token"] = "anon" # noqa: S105
32
32
 
33
- return cast(GCSFileSystem, super().create_fs(**kwargs))
33
+ return cast("GCSFileSystem", super().create_fs(**kwargs))
34
34
 
35
35
  def url(self, path: str, expires: int = 3600, **kwargs) -> str:
36
36
  """
@@ -55,7 +55,7 @@ class ClientS3(Client):
55
55
  except NotImplementedError:
56
56
  pass
57
57
 
58
- return cast(S3FileSystem, super().create_fs(**kwargs))
58
+ return cast("S3FileSystem", super().create_fs(**kwargs))
59
59
 
60
60
  def url(self, path: str, expires: int = 3600, **kwargs) -> str:
61
61
  """
@@ -119,6 +119,8 @@ class AbstractMetastore(ABC, Serializable):
119
119
  query_script: str = "",
120
120
  schema: Optional[dict[str, Any]] = None,
121
121
  ignore_if_exists: bool = False,
122
+ description: Optional[str] = None,
123
+ labels: Optional[list[str]] = None,
122
124
  ) -> DatasetRecord:
123
125
  """Creates new dataset."""
124
126
 
@@ -518,6 +520,8 @@ class AbstractDBMetastore(AbstractMetastore):
518
520
  query_script: str = "",
519
521
  schema: Optional[dict[str, Any]] = None,
520
522
  ignore_if_exists: bool = False,
523
+ description: Optional[str] = None,
524
+ labels: Optional[list[str]] = None,
521
525
  **kwargs, # TODO registered = True / False
522
526
  ) -> DatasetRecord:
523
527
  """Creates new dataset."""
@@ -533,6 +537,8 @@ class AbstractDBMetastore(AbstractMetastore):
533
537
  sources="\n".join(sources) if sources else "",
534
538
  query_script=query_script,
535
539
  schema=json.dumps(schema or {}),
540
+ description=description,
541
+ labels=json.dumps(labels or []),
536
542
  )
537
543
  if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
538
544
  # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
@@ -6,6 +6,7 @@ import sys
6
6
  from collections.abc import Iterator, Sequence
7
7
  from functools import wraps
8
8
  from typing import (
9
+ IO,
9
10
  TYPE_CHECKING,
10
11
  Any,
11
12
  BinaryIO,
@@ -270,6 +271,18 @@ class DataChain:
270
271
  self._setup: dict = setup or {}
271
272
  self._sys = _sys
272
273
 
274
+ def __repr__(self) -> str:
275
+ """Return a string representation of the chain."""
276
+ classname = self.__class__.__name__
277
+ if not self._effective_signals_schema.values:
278
+ return f"Empty {classname}"
279
+
280
+ import io
281
+
282
+ file = io.StringIO()
283
+ self.print_schema(file=file)
284
+ return file.getvalue()
285
+
273
286
  @property
274
287
  def schema(self) -> dict[str, DataType]:
275
288
  """Get schema of the chain."""
@@ -323,9 +336,9 @@ class DataChain:
323
336
  """Return `self.union(other)`."""
324
337
  return self.union(other)
325
338
 
326
- def print_schema(self) -> None:
339
+ def print_schema(self, file: Optional[IO] = None) -> None:
327
340
  """Print schema of the chain."""
328
- self._effective_signals_schema.print_tree()
341
+ self._effective_signals_schema.print_tree(file=file)
329
342
 
330
343
  def clone(self) -> "Self":
331
344
  """Make a copy of the chain in a new table."""
@@ -629,7 +642,8 @@ class DataChain:
629
642
  model_name=model_name,
630
643
  jmespath=jmespath,
631
644
  nrows=nrows,
632
- )
645
+ ),
646
+ "params": {"file": File},
633
647
  }
634
648
  # disable prefetch if nrows is set
635
649
  settings = {"prefetch": 0} if nrows else {}
@@ -773,7 +787,12 @@ class DataChain:
773
787
  )
774
788
 
775
789
  def save( # type: ignore[override]
776
- self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
790
+ self,
791
+ name: Optional[str] = None,
792
+ version: Optional[int] = None,
793
+ description: Optional[str] = None,
794
+ labels: Optional[list[str]] = None,
795
+ **kwargs,
777
796
  ) -> "Self":
778
797
  """Save to a Dataset. It returns the chain itself.
779
798
 
@@ -781,11 +800,18 @@ class DataChain:
781
800
  name : dataset name. Empty name saves to a temporary dataset that will be
782
801
  removed after process ends. Temp dataset are useful for optimization.
783
802
  version : version of a dataset. Default - the last version that exist.
803
+ description : description of a dataset.
804
+ labels : labels of a dataset.
784
805
  """
785
806
  schema = self.signals_schema.clone_without_sys_signals().serialize()
786
807
  return self._evolve(
787
808
  query=self._query.save(
788
- name=name, version=version, feature_schema=schema, **kwargs
809
+ name=name,
810
+ version=version,
811
+ description=description,
812
+ labels=labels,
813
+ feature_schema=schema,
814
+ **kwargs,
789
815
  )
790
816
  )
791
817
 
@@ -1003,8 +1029,9 @@ class DataChain:
1003
1029
  func: Optional[Union[Callable, UDFObjT]],
1004
1030
  params: Union[None, str, Sequence[str]],
1005
1031
  output: OutputType,
1006
- signal_map,
1032
+ signal_map: dict[str, Callable],
1007
1033
  ) -> UDFObjT:
1034
+ is_batch = target_class.is_input_batched
1008
1035
  is_generator = target_class.is_output_batched
1009
1036
  name = self.name or ""
1010
1037
 
@@ -1015,7 +1042,9 @@ class DataChain:
1015
1042
  if self._sys:
1016
1043
  signals_schema = SignalSchema({"sys": Sys}) | signals_schema
1017
1044
 
1018
- params_schema = signals_schema.slice(sign.params, self._setup)
1045
+ params_schema = signals_schema.slice(
1046
+ sign.params, self._setup, is_batch=is_batch
1047
+ )
1019
1048
 
1020
1049
  return target_class._create(sign, params_schema)
1021
1050
 
@@ -193,7 +193,14 @@ class File(DataModel):
193
193
  "last_modified": DateTime,
194
194
  "location": JSON,
195
195
  }
196
- _hidden_fields: ClassVar[list[str]] = ["version", "source"]
196
+ _hidden_fields: ClassVar[list[str]] = [
197
+ "source",
198
+ "version",
199
+ "etag",
200
+ "is_latest",
201
+ "last_modified",
202
+ "location",
203
+ ]
197
204
 
198
205
  _unique_id_keys: ClassVar[list[str]] = [
199
206
  "source",
@@ -10,7 +10,7 @@ import jmespath as jsp
10
10
  from pydantic import BaseModel, ConfigDict, Field, ValidationError # noqa: F401
11
11
 
12
12
  from datachain.lib.data_model import DataModel # noqa: F401
13
- from datachain.lib.file import File
13
+ from datachain.lib.file import TextFile
14
14
 
15
15
 
16
16
  class UserModel(BaseModel):
@@ -130,7 +130,7 @@ def read_meta( # noqa: C901
130
130
  #
131
131
 
132
132
  def parse_data(
133
- file: File,
133
+ file: TextFile,
134
134
  data_model=spec,
135
135
  format=format,
136
136
  jmespath=jmespath,
@@ -5,6 +5,7 @@ from dataclasses import dataclass
5
5
  from datetime import datetime
6
6
  from inspect import isclass
7
7
  from typing import ( # noqa: UP035
8
+ IO,
8
9
  TYPE_CHECKING,
9
10
  Annotated,
10
11
  Any,
@@ -154,9 +155,9 @@ class SignalSchema:
154
155
  if not callable(func):
155
156
  raise SetupError(key, "value must be function or callable class")
156
157
 
157
- def _init_setup_values(self):
158
+ def _init_setup_values(self) -> None:
158
159
  if self.setup_values is not None:
159
- return self.setup_values
160
+ return
160
161
 
161
162
  res = {}
162
163
  for key, func in self.setup_func.items():
@@ -398,7 +399,7 @@ class SignalSchema:
398
399
  return SignalSchema(signals)
399
400
 
400
401
  @staticmethod
401
- def get_flatten_hidden_fields(schema):
402
+ def get_flatten_hidden_fields(schema: dict):
402
403
  custom_types = schema.get("_custom_types", {})
403
404
  if not custom_types:
404
405
  return []
@@ -464,19 +465,61 @@ class SignalSchema:
464
465
  return False
465
466
 
466
467
  def slice(
467
- self, keys: Sequence[str], setup: Optional[dict[str, Callable]] = None
468
+ self,
469
+ params: dict[str, Union[DataType, Any]],
470
+ setup: Optional[dict[str, Callable]] = None,
471
+ is_batch: bool = False,
468
472
  ) -> "SignalSchema":
469
- # Make new schema that combines current schema and setup signals
470
- setup = setup or {}
471
- setup_no_types = dict.fromkeys(setup.keys(), str)
472
- union = SignalSchema(self.values | setup_no_types)
473
- # Slice combined schema by keys
474
- schema = {}
475
- for k in keys:
476
- try:
477
- schema[k] = union._find_in_tree(k.split("."))
478
- except SignalResolvingError:
479
- pass
473
+ """
474
+ Returns new schema that combines current schema and setup signals.
475
+ """
476
+ setup_params = setup.keys() if setup else []
477
+ schema: dict[str, DataType] = {}
478
+
479
+ for param, param_type in params.items():
480
+ # This is special case for setup params, they are always treated as strings
481
+ if param in setup_params:
482
+ schema[param] = str
483
+ continue
484
+
485
+ schema_type = self._find_in_tree(param.split("."))
486
+
487
+ if param_type is Any:
488
+ schema[param] = schema_type
489
+ continue
490
+
491
+ schema_origin = get_origin(schema_type)
492
+ param_origin = get_origin(param_type)
493
+
494
+ if schema_origin is Union and type(None) in get_args(schema_type):
495
+ schema_type = get_args(schema_type)[0]
496
+ if param_origin is Union and type(None) in get_args(param_type):
497
+ param_type = get_args(param_type)[0]
498
+
499
+ if is_batch:
500
+ if param_type is list:
501
+ schema[param] = schema_type
502
+ continue
503
+
504
+ if param_origin is not list:
505
+ raise SignalResolvingError(param.split("."), "is not a list")
506
+
507
+ param_type = get_args(param_type)[0]
508
+
509
+ if param_type == schema_type or (
510
+ isclass(param_type)
511
+ and isclass(schema_type)
512
+ and issubclass(param_type, File)
513
+ and issubclass(schema_type, File)
514
+ ):
515
+ schema[param] = schema_type
516
+ continue
517
+
518
+ raise SignalResolvingError(
519
+ param.split("."),
520
+ f"types mismatch: {param_type} != {schema_type}",
521
+ )
522
+
480
523
  return SignalSchema(schema, setup)
481
524
 
482
525
  def row_to_features(
@@ -696,16 +739,20 @@ class SignalSchema:
696
739
  substree, new_prefix, depth + 1, include_hidden
697
740
  )
698
741
 
699
- def print_tree(self, indent: int = 4, start_at: int = 0):
742
+ def print_tree(self, indent: int = 2, start_at: int = 0, file: Optional[IO] = None):
700
743
  for path, type_, _, depth in self.get_flat_tree():
701
744
  total_indent = start_at + depth * indent
702
- print(" " * total_indent, f"{path[-1]}:", SignalSchema._type_to_str(type_))
745
+ col_name = " " * total_indent + path[-1]
746
+ col_type = SignalSchema._type_to_str(type_)
747
+ print(col_name, col_type, sep=": ", file=file)
703
748
 
704
749
  if get_origin(type_) is list:
705
750
  args = get_args(type_)
706
751
  if len(args) > 0 and ModelStore.is_pydantic(args[0]):
707
752
  sub_schema = SignalSchema({"* list of": args[0]})
708
- sub_schema.print_tree(indent=indent, start_at=total_indent + indent)
753
+ sub_schema.print_tree(
754
+ indent=indent, start_at=total_indent + indent, file=file
755
+ )
709
756
 
710
757
  def get_headers_with_length(self, include_hidden: bool = True):
711
758
  paths = [
@@ -159,6 +159,7 @@ class UDFBase(AbstractUDF):
159
159
  ```
160
160
  """
161
161
 
162
+ is_input_batched = False
162
163
  is_output_batched = False
163
164
  prefetch: int = 0
164
165
 
@@ -395,6 +396,7 @@ class Mapper(UDFBase):
395
396
  class BatchMapper(UDFBase):
396
397
  """Inherit from this class to pass to `DataChain.batch_map()`."""
397
398
 
399
+ is_input_batched = True
398
400
  is_output_batched = True
399
401
 
400
402
  def run(
@@ -481,6 +483,7 @@ class Generator(UDFBase):
481
483
  class Aggregator(UDFBase):
482
484
  """Inherit from this class to pass to `DataChain.agg()`."""
483
485
 
486
+ is_input_batched = True
484
487
  is_output_batched = True
485
488
 
486
489
  def run(
@@ -1,7 +1,7 @@
1
1
  import inspect
2
2
  from collections.abc import Generator, Iterator, Sequence
3
3
  from dataclasses import dataclass
4
- from typing import Callable, Union, get_args, get_origin
4
+ from typing import Any, Callable, Union, get_args, get_origin
5
5
 
6
6
  from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
7
7
  from datachain.lib.signal_schema import SignalSchema
@@ -18,7 +18,7 @@ class UdfSignatureError(DataChainParamsError):
18
18
  @dataclass
19
19
  class UdfSignature:
20
20
  func: Union[Callable, UDFBase]
21
- params: Sequence[str]
21
+ params: dict[str, Union[DataType, Any]]
22
22
  output_schema: SignalSchema
23
23
 
24
24
  DEFAULT_RETURN_TYPE = str
@@ -58,15 +58,23 @@ class UdfSignature:
58
58
  if not isinstance(udf_func, UDFBase) and not callable(udf_func):
59
59
  raise UdfSignatureError(chain, f"UDF '{udf_func}' is not callable")
60
60
 
61
- func_params_map_sign, func_outs_sign, is_iterator = (
62
- UdfSignature._func_signature(chain, udf_func)
61
+ func_params_map_sign, func_outs_sign, is_iterator = cls._func_signature(
62
+ chain, udf_func
63
63
  )
64
+
65
+ udf_params: dict[str, Union[DataType, Any]] = {}
64
66
  if params:
65
- udf_params = [params] if isinstance(params, str) else params
66
- elif not func_params_map_sign:
67
- udf_params = []
68
- else:
69
- udf_params = list(func_params_map_sign.keys())
67
+ udf_params = (
68
+ {params: Any} if isinstance(params, str) else dict.fromkeys(params, Any)
69
+ )
70
+ elif func_params_map_sign:
71
+ udf_params = {
72
+ param: (
73
+ param_type if param_type is not inspect.Parameter.empty else Any
74
+ )
75
+ for param, param_type in func_params_map_sign.items()
76
+ }
77
+
70
78
  if output:
71
79
  udf_output_map = UdfSignature._validate_output(
72
80
  chain, signal_name, func, func_outs_sign, output
@@ -1646,6 +1646,8 @@ class DatasetQuery:
1646
1646
  name: Optional[str] = None,
1647
1647
  version: Optional[int] = None,
1648
1648
  feature_schema: Optional[dict] = None,
1649
+ description: Optional[str] = None,
1650
+ labels: Optional[list[str]] = None,
1649
1651
  **kwargs,
1650
1652
  ) -> "Self":
1651
1653
  """Save the query as a dataset."""
@@ -1678,6 +1680,8 @@ class DatasetQuery:
1678
1680
  version=version,
1679
1681
  feature_schema=feature_schema,
1680
1682
  columns=columns,
1683
+ description=description,
1684
+ labels=labels,
1681
1685
  **kwargs,
1682
1686
  )
1683
1687
  version = version or dataset.latest_version
@@ -290,9 +290,9 @@ def adapt_datetime(val: datetime) -> str:
290
290
  val = val.astimezone(timezone.utc)
291
291
  except (OverflowError, ValueError, OSError):
292
292
  if val.year == MAXYEAR:
293
- val = datetime.max
293
+ val = datetime.max.replace(tzinfo=timezone.utc)
294
294
  elif val.year == MINYEAR:
295
- val = datetime.min
295
+ val = datetime.min.replace(tzinfo=timezone.utc)
296
296
  else:
297
297
  raise
298
298
  return val.replace(tzinfo=None).isoformat(" ")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.12.0
3
+ Version: 0.13.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -447,6 +447,21 @@ def test_show(capsys, test_session):
447
447
  assert f"{i} {first_name[i]}" in normalized_output
448
448
 
449
449
 
450
+ def test_save(test_session):
451
+ chain = DataChain.from_values(key=["a", "b", "c"])
452
+ chain.save(
453
+ name="new_name",
454
+ version=1,
455
+ description="new description",
456
+ labels=["new_label", "old_label"],
457
+ )
458
+
459
+ ds = test_session.catalog.get_dataset("new_name")
460
+ assert ds.name == "new_name"
461
+ assert ds.description == "new description"
462
+ assert ds.labels == ["new_label", "old_label"]
463
+
464
+
450
465
  def test_show_nested_empty(capsys, test_session):
451
466
  files = [File(size=s, path=p) for p, s in zip(list("abcde"), range(5))]
452
467
  DataChain.from_values(file=files, session=test_session).limit(0).show()
@@ -707,7 +722,7 @@ def test_udf_parallel_boostrap(test_session_tmpfile):
707
722
  self.value = MyMapper.DEFAULT_VALUE
708
723
  self._had_teardown = False
709
724
 
710
- def process(self, *args) -> int:
725
+ def process(self, key) -> int:
711
726
  return self.value
712
727
 
713
728
  def setup(self):
@@ -79,6 +79,55 @@ def sort_files(files):
79
79
  return sorted(files, key=lambda f: (f.path, f.size))
80
80
 
81
81
 
82
+ def test_repr(test_session):
83
+ dc = DataChain.from_values(
84
+ sign1=features_nested, col1=["a", "b", "c"], session=test_session
85
+ )
86
+ assert (
87
+ repr(dc)
88
+ == """\
89
+ sign1: MyNested
90
+ label: str
91
+ fr: MyFr
92
+ nnn: str
93
+ count: int
94
+ col1: str
95
+ """
96
+ )
97
+
98
+ # datachain without any columns
99
+ assert repr(dc.select_except("col1", "sign1")) == "Empty DataChain"
100
+
101
+ dc = dc.map(col2=lambda col1: col1 * 2)
102
+ assert (
103
+ repr(dc)
104
+ == """\
105
+ sign1: MyNested
106
+ label: str
107
+ fr: MyFr
108
+ nnn: str
109
+ count: int
110
+ col1: str
111
+ col2: str
112
+ """
113
+ )
114
+
115
+ dc = dc.mutate(countplusone=dc.column("sign1.fr.count") + 1)
116
+ assert (
117
+ repr(dc)
118
+ == """\
119
+ sign1: MyNested
120
+ label: str
121
+ fr: MyFr
122
+ nnn: str
123
+ count: int
124
+ col1: str
125
+ col2: str
126
+ countplusone: int
127
+ """
128
+ )
129
+
130
+
82
131
  def test_pandas_conversion(test_session):
83
132
  df = pd.DataFrame(DF_DATA)
84
133
  df1 = DataChain.from_pandas(df, session=test_session)
@@ -15,7 +15,7 @@ def test_udf():
15
15
  self.value = MyMapper.DEFAULT_VALUE
16
16
  self._had_teardown = False
17
17
 
18
- def process(self, *args) -> int:
18
+ def process(self, key) -> int:
19
19
  return self.value
20
20
 
21
21
  def setup(self):
@@ -40,7 +40,7 @@ def test_no_bootstrap_for_callable():
40
40
  self._had_bootstrap = False
41
41
  self._had_teardown = False
42
42
 
43
- def __call__(self, *args):
43
+ def __call__(self, key):
44
44
  return None
45
45
 
46
46
  def bootstrap(self):