datachain 0.16.0__tar.gz → 0.16.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (355) hide show
  1. {datachain-0.16.0/src/datachain.egg-info → datachain-0.16.1}/PKG-INFO +1 -1
  2. {datachain-0.16.0 → datachain-0.16.1}/docs/examples.md +5 -5
  3. {datachain-0.16.0 → datachain-0.16.1}/docs/quick-start.md +3 -3
  4. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/aggregate.py +3 -3
  5. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/values_to_tuples.py +6 -8
  6. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/datachain.py +16 -10
  7. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/records.py +16 -10
  8. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/utils.py +2 -2
  9. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/signal_schema.py +1 -10
  10. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/dataset.py +13 -6
  11. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/schema.py +1 -4
  12. {datachain-0.16.0 → datachain-0.16.1/src/datachain.egg-info}/PKG-INFO +1 -1
  13. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_read_database.py +31 -17
  14. {datachain-0.16.0 → datachain-0.16.1}/.cruft.json +0 -0
  15. {datachain-0.16.0 → datachain-0.16.1}/.gitattributes +0 -0
  16. {datachain-0.16.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  17. {datachain-0.16.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  18. {datachain-0.16.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  19. {datachain-0.16.0 → datachain-0.16.1}/.github/codecov.yaml +0 -0
  20. {datachain-0.16.0 → datachain-0.16.1}/.github/dependabot.yml +0 -0
  21. {datachain-0.16.0 → datachain-0.16.1}/.github/workflows/benchmarks.yml +0 -0
  22. {datachain-0.16.0 → datachain-0.16.1}/.github/workflows/release.yml +0 -0
  23. {datachain-0.16.0 → datachain-0.16.1}/.github/workflows/tests-studio.yml +0 -0
  24. {datachain-0.16.0 → datachain-0.16.1}/.github/workflows/tests.yml +0 -0
  25. {datachain-0.16.0 → datachain-0.16.1}/.github/workflows/update-template.yaml +0 -0
  26. {datachain-0.16.0 → datachain-0.16.1}/.gitignore +0 -0
  27. {datachain-0.16.0 → datachain-0.16.1}/.pre-commit-config.yaml +0 -0
  28. {datachain-0.16.0 → datachain-0.16.1}/CODE_OF_CONDUCT.rst +0 -0
  29. {datachain-0.16.0 → datachain-0.16.1}/LICENSE +0 -0
  30. {datachain-0.16.0 → datachain-0.16.1}/README.rst +0 -0
  31. {datachain-0.16.0 → datachain-0.16.1}/docs/assets/captioned_cartoons.png +0 -0
  32. {datachain-0.16.0 → datachain-0.16.1}/docs/assets/datachain-white.svg +0 -0
  33. {datachain-0.16.0 → datachain-0.16.1}/docs/assets/datachain.svg +0 -0
  34. {datachain-0.16.0 → datachain-0.16.1}/docs/contributing.md +0 -0
  35. {datachain-0.16.0 → datachain-0.16.1}/docs/css/github-permalink-style.css +0 -0
  36. {datachain-0.16.0 → datachain-0.16.1}/docs/index.md +0 -0
  37. {datachain-0.16.0 → datachain-0.16.1}/docs/overrides/main.html +0 -0
  38. {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/arrowrow.md +0 -0
  39. {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/bbox.md +0 -0
  40. {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/file.md +0 -0
  41. {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/imagefile.md +0 -0
  42. {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/index.md +0 -0
  43. {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/pose.md +0 -0
  44. {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/segment.md +0 -0
  45. {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/tarvfile.md +0 -0
  46. {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/textfile.md +0 -0
  47. {datachain-0.16.0 → datachain-0.16.1}/docs/references/data-types/videofile.md +0 -0
  48. {datachain-0.16.0 → datachain-0.16.1}/docs/references/datachain.md +0 -0
  49. {datachain-0.16.0 → datachain-0.16.1}/docs/references/func.md +0 -0
  50. {datachain-0.16.0 → datachain-0.16.1}/docs/references/index.md +0 -0
  51. {datachain-0.16.0 → datachain-0.16.1}/docs/references/remotes.md +0 -0
  52. {datachain-0.16.0 → datachain-0.16.1}/docs/references/toolkit.md +0 -0
  53. {datachain-0.16.0 → datachain-0.16.1}/docs/references/torch.md +0 -0
  54. {datachain-0.16.0 → datachain-0.16.1}/docs/references/udf.md +0 -0
  55. {datachain-0.16.0 → datachain-0.16.1}/docs/tutorials.md +0 -0
  56. {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  57. {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  58. {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/openimage-detect.py +0 -0
  59. {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
  60. {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-pose.py +0 -0
  61. {datachain-0.16.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-segment.py +0 -0
  62. {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/common_sql_functions.py +0 -0
  63. {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/json-csv-reader.py +0 -0
  64. {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/torch-loader.py +0 -0
  65. {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/udfs/parallel.py +0 -0
  66. {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/udfs/simple.py +0 -0
  67. {datachain-0.16.0 → datachain-0.16.1}/examples/get_started/udfs/stateful.py +0 -0
  68. {datachain-0.16.0 → datachain-0.16.1}/examples/llm_and_nlp/claude-query.py +0 -0
  69. {datachain-0.16.0 → datachain-0.16.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  70. {datachain-0.16.0 → datachain-0.16.1}/examples/multimodal/clip_inference.py +0 -0
  71. {datachain-0.16.0 → datachain-0.16.1}/examples/multimodal/hf_pipeline.py +0 -0
  72. {datachain-0.16.0 → datachain-0.16.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  73. {datachain-0.16.0 → datachain-0.16.1}/examples/multimodal/wds.py +0 -0
  74. {datachain-0.16.0 → datachain-0.16.1}/examples/multimodal/wds_filtered.py +0 -0
  75. {datachain-0.16.0 → datachain-0.16.1}/mkdocs.yml +0 -0
  76. {datachain-0.16.0 → datachain-0.16.1}/noxfile.py +0 -0
  77. {datachain-0.16.0 → datachain-0.16.1}/pyproject.toml +0 -0
  78. {datachain-0.16.0 → datachain-0.16.1}/setup.cfg +0 -0
  79. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/__init__.py +0 -0
  80. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/__main__.py +0 -0
  81. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/asyn.py +0 -0
  82. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cache.py +0 -0
  83. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/catalog/__init__.py +0 -0
  84. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/catalog/catalog.py +0 -0
  85. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/catalog/datasource.py +0 -0
  86. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/catalog/loader.py +0 -0
  87. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/__init__.py +0 -0
  88. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/__init__.py +0 -0
  89. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/datasets.py +0 -0
  90. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/du.py +0 -0
  91. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/index.py +0 -0
  92. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/ls.py +0 -0
  93. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/misc.py +0 -0
  94. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/query.py +0 -0
  95. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/commands/show.py +0 -0
  96. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/parser/__init__.py +0 -0
  97. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/parser/job.py +0 -0
  98. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/parser/studio.py +0 -0
  99. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/parser/utils.py +0 -0
  100. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/cli/utils.py +0 -0
  101. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/__init__.py +0 -0
  102. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/azure.py +0 -0
  103. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/fileslice.py +0 -0
  104. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/fsspec.py +0 -0
  105. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/gcs.py +0 -0
  106. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/hf.py +0 -0
  107. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/local.py +0 -0
  108. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/client/s3.py +0 -0
  109. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/config.py +0 -0
  110. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/__init__.py +0 -0
  111. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/db_engine.py +0 -0
  112. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/job.py +0 -0
  113. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/metastore.py +0 -0
  114. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/schema.py +0 -0
  115. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/serializer.py +0 -0
  116. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/sqlite.py +0 -0
  117. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/data_storage/warehouse.py +0 -0
  118. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/dataset.py +0 -0
  119. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/diff/__init__.py +0 -0
  120. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/error.py +0 -0
  121. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/fs/__init__.py +0 -0
  122. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/fs/reference.py +0 -0
  123. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/fs/utils.py +0 -0
  124. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/__init__.py +0 -0
  125. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/array.py +0 -0
  126. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/base.py +0 -0
  127. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/conditional.py +0 -0
  128. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/func.py +0 -0
  129. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/numeric.py +0 -0
  130. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/path.py +0 -0
  131. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/random.py +0 -0
  132. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/string.py +0 -0
  133. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/func/window.py +0 -0
  134. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/job.py +0 -0
  135. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/__init__.py +0 -0
  136. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/arrow.py +0 -0
  137. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/clip.py +0 -0
  138. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/__init__.py +0 -0
  139. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/flatten.py +0 -0
  140. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  141. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
  142. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/convert/unflatten.py +0 -0
  143. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/data_model.py +0 -0
  144. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dataset_info.py +0 -0
  145. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/__init__.py +0 -0
  146. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/csv.py +0 -0
  147. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/database.py +0 -0
  148. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/datasets.py +0 -0
  149. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/hf.py +0 -0
  150. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/json.py +0 -0
  151. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/listings.py +0 -0
  152. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/pandas.py +0 -0
  153. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/parquet.py +0 -0
  154. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/storage.py +0 -0
  155. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/dc/values.py +0 -0
  156. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/file.py +0 -0
  157. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/hf.py +0 -0
  158. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/image.py +0 -0
  159. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/listing.py +0 -0
  160. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/listing_info.py +0 -0
  161. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/meta_formats.py +0 -0
  162. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/model_store.py +0 -0
  163. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/pytorch.py +0 -0
  164. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/settings.py +0 -0
  165. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/tar.py +0 -0
  166. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/text.py +0 -0
  167. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/udf.py +0 -0
  168. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/udf_signature.py +0 -0
  169. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/utils.py +0 -0
  170. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/video.py +0 -0
  171. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/webdataset.py +0 -0
  172. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/lib/webdataset_laion.py +0 -0
  173. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/listing.py +0 -0
  174. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/__init__.py +0 -0
  175. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/bbox.py +0 -0
  176. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/pose.py +0 -0
  177. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/segment.py +0 -0
  178. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/ultralytics/__init__.py +0 -0
  179. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/ultralytics/bbox.py +0 -0
  180. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/ultralytics/pose.py +0 -0
  181. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/ultralytics/segment.py +0 -0
  182. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/model/utils.py +0 -0
  183. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/node.py +0 -0
  184. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/nodes_fetcher.py +0 -0
  185. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/nodes_thread_pool.py +0 -0
  186. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/progress.py +0 -0
  187. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/py.typed +0 -0
  188. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/__init__.py +0 -0
  189. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/batch.py +0 -0
  190. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/dispatch.py +0 -0
  191. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/metrics.py +0 -0
  192. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/params.py +0 -0
  193. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/queue.py +0 -0
  194. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/session.py +0 -0
  195. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/udf.py +0 -0
  196. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/query/utils.py +0 -0
  197. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/remote/__init__.py +0 -0
  198. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/remote/studio.py +0 -0
  199. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/script_meta.py +0 -0
  200. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/__init__.py +0 -0
  201. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/default/__init__.py +0 -0
  202. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/default/base.py +0 -0
  203. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/__init__.py +0 -0
  204. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/aggregate.py +0 -0
  205. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/array.py +0 -0
  206. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/conditional.py +0 -0
  207. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/numeric.py +0 -0
  208. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/path.py +0 -0
  209. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/random.py +0 -0
  210. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/functions/string.py +0 -0
  211. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/selectable.py +0 -0
  212. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  213. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/sqlite/base.py +0 -0
  214. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/sqlite/types.py +0 -0
  215. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/sqlite/vector.py +0 -0
  216. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/types.py +0 -0
  217. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/sql/utils.py +0 -0
  218. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/studio.py +0 -0
  219. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/telemetry.py +0 -0
  220. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/toolkit/__init__.py +0 -0
  221. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/toolkit/split.py +0 -0
  222. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/torch/__init__.py +0 -0
  223. {datachain-0.16.0 → datachain-0.16.1}/src/datachain/utils.py +0 -0
  224. {datachain-0.16.0 → datachain-0.16.1}/src/datachain.egg-info/SOURCES.txt +0 -0
  225. {datachain-0.16.0 → datachain-0.16.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  226. {datachain-0.16.0 → datachain-0.16.1}/src/datachain.egg-info/entry_points.txt +0 -0
  227. {datachain-0.16.0 → datachain-0.16.1}/src/datachain.egg-info/requires.txt +0 -0
  228. {datachain-0.16.0 → datachain-0.16.1}/src/datachain.egg-info/top_level.txt +0 -0
  229. {datachain-0.16.0 → datachain-0.16.1}/tests/__init__.py +0 -0
  230. {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/__init__.py +0 -0
  231. {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/conftest.py +0 -0
  232. {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  233. {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  234. {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/datasets/.gitignore +0 -0
  235. {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  236. {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/test_datachain.py +0 -0
  237. {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/test_ls.py +0 -0
  238. {datachain-0.16.0 → datachain-0.16.1}/tests/benchmarks/test_version.py +0 -0
  239. {datachain-0.16.0 → datachain-0.16.1}/tests/conftest.py +0 -0
  240. {datachain-0.16.0 → datachain-0.16.1}/tests/data.py +0 -0
  241. {datachain-0.16.0 → datachain-0.16.1}/tests/examples/__init__.py +0 -0
  242. {datachain-0.16.0 → datachain-0.16.1}/tests/examples/test_examples.py +0 -0
  243. {datachain-0.16.0 → datachain-0.16.1}/tests/examples/test_wds_e2e.py +0 -0
  244. {datachain-0.16.0 → datachain-0.16.1}/tests/examples/wds_data.py +0 -0
  245. {datachain-0.16.0 → datachain-0.16.1}/tests/func/__init__.py +0 -0
  246. {datachain-0.16.0 → datachain-0.16.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  247. {datachain-0.16.0 → datachain-0.16.1}/tests/func/data/lena.jpg +0 -0
  248. {datachain-0.16.0 → datachain-0.16.1}/tests/func/fake-service-account-credentials.json +0 -0
  249. {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/__init__.py +0 -0
  250. {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/data/running-mask0.png +0 -0
  251. {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/data/running-mask1.png +0 -0
  252. {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/data/running.jpg +0 -0
  253. {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/data/ships.jpg +0 -0
  254. {datachain-0.16.0 → datachain-0.16.1}/tests/func/model/test_yolo.py +0 -0
  255. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_catalog.py +0 -0
  256. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_client.py +0 -0
  257. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_cloud_transfer.py +0 -0
  258. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_data_storage.py +0 -0
  259. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_datachain.py +0 -0
  260. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_datachain_merge.py +0 -0
  261. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_dataset_query.py +0 -0
  262. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_datasets.py +0 -0
  263. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_feature_pickling.py +0 -0
  264. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_file.py +0 -0
  265. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_hf.py +0 -0
  266. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_hidden_field.py +0 -0
  267. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_image.py +0 -0
  268. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_listing.py +0 -0
  269. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_ls.py +0 -0
  270. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_meta_formats.py +0 -0
  271. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_metrics.py +0 -0
  272. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_pull.py +0 -0
  273. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_pytorch.py +0 -0
  274. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_query.py +0 -0
  275. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_session.py +0 -0
  276. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_toolkit.py +0 -0
  277. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_video.py +0 -0
  278. {datachain-0.16.0 → datachain-0.16.1}/tests/func/test_warehouse.py +0 -0
  279. {datachain-0.16.0 → datachain-0.16.1}/tests/scripts/feature_class.py +0 -0
  280. {datachain-0.16.0 → datachain-0.16.1}/tests/scripts/feature_class_exception.py +0 -0
  281. {datachain-0.16.0 → datachain-0.16.1}/tests/scripts/feature_class_parallel.py +0 -0
  282. {datachain-0.16.0 → datachain-0.16.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  283. {datachain-0.16.0 → datachain-0.16.1}/tests/scripts/name_len_slow.py +0 -0
  284. {datachain-0.16.0 → datachain-0.16.1}/tests/test_atomicity.py +0 -0
  285. {datachain-0.16.0 → datachain-0.16.1}/tests/test_cli_e2e.py +0 -0
  286. {datachain-0.16.0 → datachain-0.16.1}/tests/test_cli_studio.py +0 -0
  287. {datachain-0.16.0 → datachain-0.16.1}/tests/test_import_time.py +0 -0
  288. {datachain-0.16.0 → datachain-0.16.1}/tests/test_query_e2e.py +0 -0
  289. {datachain-0.16.0 → datachain-0.16.1}/tests/test_telemetry.py +0 -0
  290. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/__init__.py +0 -0
  291. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/__init__.py +0 -0
  292. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/conftest.py +0 -0
  293. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_arrow.py +0 -0
  294. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_clip.py +0 -0
  295. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_datachain.py +0 -0
  296. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  297. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  298. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_diff.py +0 -0
  299. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_feature.py +0 -0
  300. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_feature_utils.py +0 -0
  301. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_file.py +0 -0
  302. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_hf.py +0 -0
  303. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_image.py +0 -0
  304. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_listing_info.py +0 -0
  305. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_python_to_sql.py +0 -0
  306. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_schema.py +0 -0
  307. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_signal_schema.py +0 -0
  308. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_sql_to_python.py +0 -0
  309. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_text.py +0 -0
  310. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_udf_signature.py +0 -0
  311. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_utils.py +0 -0
  312. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/lib/test_webdataset.py +0 -0
  313. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/model/__init__.py +0 -0
  314. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/model/test_bbox.py +0 -0
  315. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/model/test_pose.py +0 -0
  316. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/model/test_segment.py +0 -0
  317. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/model/test_utils.py +0 -0
  318. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/__init__.py +0 -0
  319. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  320. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/sqlite/test_types.py +0 -0
  321. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  322. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_array.py +0 -0
  323. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_conditional.py +0 -0
  324. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_path.py +0 -0
  325. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_random.py +0 -0
  326. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_selectable.py +0 -0
  327. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/sql/test_string.py +0 -0
  328. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_asyn.py +0 -0
  329. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_cache.py +0 -0
  330. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_catalog.py +0 -0
  331. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_catalog_loader.py +0 -0
  332. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_cli_parsing.py +0 -0
  333. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_client.py +0 -0
  334. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_client_gcs.py +0 -0
  335. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_client_s3.py +0 -0
  336. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_config.py +0 -0
  337. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_data_storage.py +0 -0
  338. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_database_engine.py +0 -0
  339. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_dataset.py +0 -0
  340. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_dispatch.py +0 -0
  341. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_fileslice.py +0 -0
  342. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_func.py +0 -0
  343. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_listing.py +0 -0
  344. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_metastore.py +0 -0
  345. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_module_exports.py +0 -0
  346. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_pytorch.py +0 -0
  347. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_query.py +0 -0
  348. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_query_metrics.py +0 -0
  349. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_query_params.py +0 -0
  350. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_script_meta.py +0 -0
  351. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_serializer.py +0 -0
  352. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_session.py +0 -0
  353. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_utils.py +0 -0
  354. {datachain-0.16.0 → datachain-0.16.1}/tests/unit/test_warehouse.py +0 -0
  355. {datachain-0.16.0 → datachain-0.16.1}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.16.0
3
+ Version: 0.16.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -94,7 +94,7 @@ dc.DataModel.register(MistralModel)
94
94
  chain = (
95
95
  dc
96
96
  .read_storage("gs://datachain-demo/chatbot-KiT/", type="text")
97
- .filter(dc.Column("file.name").glob("*.txt"))
97
+ .filter(dc.Column("file.path").glob("*.txt"))
98
98
  .limit(5)
99
99
  .settings(parallel=4, cache=True)
100
100
  .map(
@@ -228,7 +228,7 @@ Here is an example from MS COCO “captions” JSON which employs separate secti
228
228
 
229
229
  Note how complicated the setup is. Every image is referenced by name, and the metadata for this file is keyed by the “id” field. This same field is referenced later in the “annotations” array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
230
230
 
231
- However, Datachain can easily parse the entire COCO structure via several reading and merging operators:
231
+ However, DataChain can easily parse the entire COCO structure via several reading and merging operators:
232
232
 
233
233
  ```python
234
234
  import datachain as dc
@@ -240,7 +240,7 @@ images = dc.read_storage(images_uri)
240
240
  meta = dc.read_json(captions_uri, jmespath="images")
241
241
  captions = dc.read_json(captions_uri, jmespath="annotations")
242
242
 
243
- images_meta = images.merge(meta, on="file.name", right_on="images.file_name")
243
+ images_meta = images.merge(meta, on="file.path", right_on="images.file_name")
244
244
  captioned_images = images_meta.merge(captions, on="images.id", right_on="annotations.image_id")
245
245
  ```
246
246
 
@@ -248,12 +248,12 @@ The resulting dataset has image entries as files decorated with all the metadata
248
248
 
249
249
  ```python
250
250
  images_with_dogs = captioned_images.filter(dc.Column("annotations.caption").glob("*dog*"))
251
- images_with_dogs.select("annotations", "file.name").show()
251
+ images_with_dogs.select("annotations", "file.path").show()
252
252
  ```
253
253
 
254
254
  ```
255
255
  captions captions captions file
256
- image_id id caption name
256
+ image_id id caption path
257
257
  0 17029 778902 a dog jumping to catch a frisbee in a yard 000000017029.jpg
258
258
  1 17029 779838 A dog jumping to catch a red frisbee in a garden 000000017029.jpg
259
259
  2 17029 781941 The dog is catching the Frisbee in mid air in ... 000000017029.jpg
@@ -184,7 +184,7 @@ chain = (
184
184
  .save("response")
185
185
  )
186
186
 
187
- chain.select("file.name", "status", "response.usage").show(5)
187
+ chain.select("file.path", "status", "response.usage").show(5)
188
188
 
189
189
  success_rate = chain.filter(dc.Column("status") == "success").count() / chain.count()
190
190
  print(f"{100*success_rate:.1f}% dialogs were successful")
@@ -194,7 +194,7 @@ Output:
194
194
 
195
195
  ``` shell
196
196
  file status response response response
197
- name usage usage usage
197
+ path usage usage usage
198
198
  prompt_tokens total_tokens completion_tokens
199
199
  0 1.txt success 547 548 1
200
200
  1 10.txt failure 3576 3578 2
@@ -277,7 +277,7 @@ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
277
277
 
278
278
  chain = (
279
279
  dc.read_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
280
- .map(label=lambda name: name.split(".")[0], params=["file.name"])
280
+ .map(label=lambda name: name.split(".")[0], params=["file.path"])
281
281
  .select("file", "label").to_pytorch(
282
282
  transform=processor.image_processor,
283
283
  tokenizer=processor.tokenizer,
@@ -165,7 +165,7 @@ def any_value(col: str) -> Func:
165
165
  Example:
166
166
  ```py
167
167
  dc.group_by(
168
- file_example=func.any_value("file.name"),
168
+ file_example=func.any_value("file.path"),
169
169
  partition_by="signal.category",
170
170
  )
171
171
  ```
@@ -227,7 +227,7 @@ def concat(col: str, separator="") -> Func:
227
227
  Example:
228
228
  ```py
229
229
  dc.group_by(
230
- files=func.concat("file.name", separator=", "),
230
+ files=func.concat("file.path", separator=", "),
231
231
  partition_by="signal.category",
232
232
  )
233
233
  ```
@@ -343,7 +343,7 @@ def first(col: str) -> Func:
343
343
  ```py
344
344
  window = func.window(partition_by="signal.category", order_by="created_at")
345
345
  dc.mutate(
346
- first_file=func.first("file.name").over(window),
346
+ first_file=func.first("file.path").over(window),
347
347
  )
348
348
  ```
349
349
 
@@ -1,6 +1,6 @@
1
1
  import itertools
2
2
  from collections.abc import Sequence
3
- from typing import Any, Optional, Union
3
+ from typing import Any, Union
4
4
 
5
5
  from datachain.lib.data_model import (
6
6
  DataType,
@@ -71,14 +71,13 @@ def values_to_tuples( # noqa: C901, PLR0912
71
71
  # If a non-None value appears early, it won't check the remaining items for
72
72
  # `None` values.
73
73
  try:
74
- pos, first_not_none_element = next(
75
- itertools.dropwhile(lambda pair: pair[1] is None, enumerate(v))
74
+ first_not_none_element = next(
75
+ itertools.dropwhile(lambda i: i is None, v)
76
76
  )
77
77
  except StopIteration:
78
- typ = str # default to str if all values are None or has length 0
79
- nullable = True
78
+ # set default type to `str` if column is empty or all values are `None`
79
+ typ = str
80
80
  else:
81
- nullable = pos > 0
82
81
  typ = type(first_not_none_element) # type: ignore[assignment]
83
82
  if not is_chain_type(typ):
84
83
  raise ValuesToTupleError(
@@ -88,8 +87,7 @@ def values_to_tuples( # noqa: C901, PLR0912
88
87
  )
89
88
  if isinstance(first_not_none_element, list):
90
89
  typ = list[type(first_not_none_element[0])] # type: ignore[assignment, misc]
91
-
92
- types_map[k] = Optional[typ] if nullable else typ # type: ignore[assignment]
90
+ types_map[k] = typ
93
91
 
94
92
  if length < 0:
95
93
  length = len_
@@ -756,7 +756,7 @@ class DataChain:
756
756
 
757
757
  Example:
758
758
  ```py
759
- dc.distinct("file.parent", "file.name")
759
+ dc.distinct("file.path")
760
760
  ```
761
761
  """
762
762
  return self._evolve(
@@ -882,7 +882,7 @@ class DataChain:
882
882
  ```py
883
883
  dc.mutate(
884
884
  area=Column("image.height") * Column("image.width"),
885
- extension=file_ext(Column("file.name")),
885
+ extension=file_ext(Column("file.path")),
886
886
  dist=cosine_distance(embedding_text, embedding_image)
887
887
  )
888
888
  ```
@@ -1071,13 +1071,13 @@ class DataChain:
1071
1071
 
1072
1072
  Iterating over all rows with selected columns:
1073
1073
  ```py
1074
- for name, size in dc.collect("file.name", "file.size"):
1074
+ for name, size in dc.collect("file.path", "file.size"):
1075
1075
  print(name, size)
1076
1076
  ```
1077
1077
 
1078
1078
  Iterating over a single column:
1079
1079
  ```py
1080
- for file in dc.collect("file.name"):
1080
+ for file in dc.collect("file.path"):
1081
1081
  print(file)
1082
1082
  ```
1083
1083
  """
@@ -1630,7 +1630,7 @@ class DataChain:
1630
1630
  import datachain as dc
1631
1631
 
1632
1632
  chain = dc.read_storage("s3://mybucket")
1633
- chain = chain.filter(dc.C("file.name").glob("*.jsonl"))
1633
+ chain = chain.filter(dc.C("file.path").glob("*.jsonl"))
1634
1634
  chain = chain.parse_tabular(format="json")
1635
1635
  ```
1636
1636
  """
@@ -2089,25 +2089,31 @@ class DataChain:
2089
2089
 
2090
2090
  Using glob to match patterns
2091
2091
  ```py
2092
- dc.filter(C("file.name").glob("*.jpg"))
2092
+ dc.filter(C("file.path").glob("*.jpg"))
2093
+ ```
2094
+
2095
+ Using in to match lists
2096
+ ```py
2097
+ ids = [1,2,3]
2098
+ dc.filter(C("experiment_id").in_(ids))
2093
2099
  ```
2094
2100
 
2095
2101
  Using `datachain.func`
2096
2102
  ```py
2097
2103
  from datachain.func import string
2098
- dc.filter(string.length(C("file.name")) > 5)
2104
+ dc.filter(string.length(C("file.path")) > 5)
2099
2105
  ```
2100
2106
 
2101
2107
  Combining filters with "or"
2102
2108
  ```py
2103
- dc.filter(C("file.name").glob("cat*") | C("file.name").glob("dog*))
2109
+ dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*))
2104
2110
  ```
2105
2111
 
2106
2112
  Combining filters with "and"
2107
2113
  ```py
2108
2114
  dc.filter(
2109
- C("file.name").glob("*.jpg) &
2110
- (string.length(C("file.name")) > 5)
2115
+ C("file.path").glob("*.jpg) &
2116
+ (string.length(C("file.path")) > 5)
2111
2117
  )
2112
2118
  ```
2113
2119
  """
@@ -4,12 +4,9 @@ from typing import TYPE_CHECKING, Optional, Union
4
4
  import sqlalchemy
5
5
 
6
6
  from datachain.lib.data_model import DataType
7
- from datachain.lib.file import (
8
- File,
9
- )
7
+ from datachain.lib.file import File
10
8
  from datachain.lib.signal_schema import SignalSchema
11
9
  from datachain.query import Session
12
- from datachain.query.schema import Column
13
10
 
14
11
  if TYPE_CHECKING:
15
12
  from typing_extensions import ParamSpec
@@ -41,6 +38,9 @@ def read_records(
41
38
  single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
42
39
  ```
43
40
  """
41
+ from datachain.query.dataset import adjust_outputs, get_col_types
42
+ from datachain.sql.types import SQLType
43
+
44
44
  from .datasets import read_dataset
45
45
 
46
46
  session = Session.get(session, in_memory=in_memory)
@@ -52,11 +52,10 @@ def read_records(
52
52
 
53
53
  if schema:
54
54
  signal_schema = SignalSchema(schema)
55
- columns = []
56
- for c in signal_schema.db_signals(as_columns=True):
57
- assert isinstance(c, Column)
58
- kw = {"nullable": c.nullable} if c.nullable is not None else {}
59
- columns.append(sqlalchemy.Column(c.name, c.type, **kw))
55
+ columns = [
56
+ sqlalchemy.Column(c.name, c.type) # type: ignore[union-attr]
57
+ for c in signal_schema.db_signals(as_columns=True)
58
+ ]
60
59
  else:
61
60
  columns = [
62
61
  sqlalchemy.Column(name, typ)
@@ -83,6 +82,13 @@ def read_records(
83
82
  warehouse = catalog.warehouse
84
83
  dr = warehouse.dataset_rows(dsr)
85
84
  table = dr.get_table()
86
- warehouse.insert_rows(table, to_insert)
85
+
86
+ # Optimization: Compute row types once, rather than for every row.
87
+ col_types = get_col_types(
88
+ warehouse,
89
+ {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
90
+ )
91
+ records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
92
+ warehouse.insert_rows(table, records)
87
93
  warehouse.insert_rows_done(table)
88
94
  return read_dataset(name=dsr.name, session=session, settings=settings)
@@ -31,8 +31,8 @@ def resolve_columns(
31
31
  ) -> "Callable[Concatenate[D, P], D]":
32
32
  """Decorator that resolves input column names to their actual DB names. This is
33
33
  especially important for nested columns as user works with them by using dot
34
- notation e.g (file.name) but are actually defined with default delimiter
35
- in DB, e.g file__name.
34
+ notation e.g (file.path) but are actually defined with default delimiter
35
+ in DB, e.g file__path.
36
36
  If there are any sql functions in arguments, they will just be transferred as is
37
37
  to a method.
38
38
  """
@@ -581,11 +581,7 @@ class SignalSchema:
581
581
  signals = [
582
582
  DEFAULT_DELIMITER.join(path)
583
583
  if not as_columns
584
- else Column(
585
- DEFAULT_DELIMITER.join(path),
586
- python_to_sql(_type),
587
- nullable=is_optional(_type),
588
- )
584
+ else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
589
585
  for path, _type, has_subtree, _ in self.get_flat_tree(
590
586
  include_hidden=include_hidden
591
587
  )
@@ -994,8 +990,3 @@ class SignalSchema:
994
990
  }
995
991
 
996
992
  return SignalSchema.deserialize(schema)
997
-
998
-
999
- def is_optional(type_: Any) -> bool:
1000
- """Check if a type is Optional."""
1001
- return get_origin(type_) is Union and type(None) in get_args(type_)
@@ -57,6 +57,7 @@ from datachain.query.schema import C, UDFParamSpec, normalize_param
57
57
  from datachain.query.session import Session
58
58
  from datachain.query.udf import UdfInfo
59
59
  from datachain.sql.functions.random import rand
60
+ from datachain.sql.types import SQLType
60
61
  from datachain.utils import (
61
62
  batched,
62
63
  determine_processes,
@@ -67,6 +68,8 @@ from datachain.utils import (
67
68
  )
68
69
 
69
70
  if TYPE_CHECKING:
71
+ from collections.abc import Mapping
72
+
70
73
  from sqlalchemy.sql.elements import ClauseElement
71
74
  from sqlalchemy.sql.schema import Table
72
75
  from sqlalchemy.sql.selectable import GenerativeSelect
@@ -273,7 +276,9 @@ class Subtract(DatasetDiffOperation):
273
276
 
274
277
 
275
278
  def adjust_outputs(
276
- warehouse: "AbstractWarehouse", row: dict[str, Any], udf_col_types: list[tuple]
279
+ warehouse: "AbstractWarehouse",
280
+ row: dict[str, Any],
281
+ col_types: list[tuple[str, SQLType, type, str, Any]],
277
282
  ) -> dict[str, Any]:
278
283
  """
279
284
  This function does a couple of things to prepare a row for inserting into the db:
@@ -289,7 +294,7 @@ def adjust_outputs(
289
294
  col_python_type,
290
295
  col_type_name,
291
296
  default_value,
292
- ) in udf_col_types:
297
+ ) in col_types:
293
298
  row_val = row.get(col_name)
294
299
 
295
300
  # Fill None or missing values with defaults (get returns None if not in the row)
@@ -304,8 +309,10 @@ def adjust_outputs(
304
309
  return row
305
310
 
306
311
 
307
- def get_udf_col_types(warehouse: "AbstractWarehouse", udf: "UDFAdapter") -> list[tuple]:
308
- """Optimization: Precompute UDF column types so these don't have to be computed
312
+ def get_col_types(
313
+ warehouse: "AbstractWarehouse", output: "Mapping[str, Any]"
314
+ ) -> list[tuple]:
315
+ """Optimization: Precompute column types so these don't have to be computed
309
316
  in the convert_type function for each row in a loop."""
310
317
  dialect = warehouse.db.dialect
311
318
  return [
@@ -317,7 +324,7 @@ def get_udf_col_types(warehouse: "AbstractWarehouse", udf: "UDFAdapter") -> list
317
324
  type(col_type_inst).__name__,
318
325
  col_type.default_value(dialect),
319
326
  )
320
- for col_name, col_type in udf.output.items()
327
+ for col_name, col_type in output.items()
321
328
  ]
322
329
 
323
330
 
@@ -333,7 +340,7 @@ def process_udf_outputs(
333
340
 
334
341
  rows: list[UDFResult] = []
335
342
  # Optimization: Compute row types once, rather than for every row.
336
- udf_col_types = get_udf_col_types(warehouse, udf)
343
+ udf_col_types = get_col_types(warehouse, udf.output)
337
344
 
338
345
  for udf_output in udf_results:
339
346
  if not udf_output:
@@ -40,15 +40,12 @@ class ColumnMeta(type):
40
40
  class Column(sa.ColumnClause, metaclass=ColumnMeta):
41
41
  inherit_cache: Optional[bool] = True
42
42
 
43
- def __init__(
44
- self, text, type_=None, is_literal=False, nullable=None, _selectable=None
45
- ):
43
+ def __init__(self, text, type_=None, is_literal=False, _selectable=None):
46
44
  """Dataset column."""
47
45
  self.name = ColumnMeta.to_db_name(text)
48
46
  super().__init__(
49
47
  self.name, type_=type_, is_literal=is_literal, _selectable=_selectable
50
48
  )
51
- self.nullable = nullable
52
49
 
53
50
  def __getattr__(self, name: str):
54
51
  return Column(self.name + DEFAULT_DELIMITER + name)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.16.0
3
+ Version: 0.16.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -1,15 +1,15 @@
1
+ import json
1
2
  import os
2
3
  import sqlite3
3
4
  from contextlib import closing
4
- from typing import Optional
5
5
 
6
6
  import pytest
7
7
  import sqlalchemy
8
8
  from sqlalchemy.orm import Session
9
9
 
10
10
  from datachain import read_database
11
+ from datachain.data_storage.sqlite import SQLiteWarehouse
11
12
  from datachain.lib.dc import database
12
- from tests.utils import skip_if_not_sqlite
13
13
 
14
14
 
15
15
  @pytest.fixture
@@ -81,14 +81,7 @@ def test(sqlite3_connection, connection, test_session):
81
81
  ]
82
82
 
83
83
 
84
- # FIXME: `clickhouse` requires wrapping column types in `Nullable` to make the column
85
- # nullable, setting `nullable=True` is not enough.
86
- # https://github.com/xzkostyan/clickhouse-sqlalchemy/issues/189#issuecomment-1274736713
87
- # Also, was not able to figure out how to read nullable columns back from clickhouse.
88
-
89
-
90
- @skip_if_not_sqlite
91
- def test_nullable(sqlite3_connection, test_session):
84
+ def test_nullable(sqlite3_connection, test_session, warehouse):
92
85
  """
93
86
  Verify that a column containing a sequence of NULL values is handled correctly
94
87
  when the number of leading NULLs is less than `infer_schema_length`.
@@ -101,14 +94,14 @@ def test_nullable(sqlite3_connection, test_session):
101
94
  sqlite3_connection.commit()
102
95
 
103
96
  chain = read_database("select * from tbl", sqlite3_connection, session=test_session)
104
- assert chain.schema == {"id": int, "value": Optional[str]}
97
+ assert chain.schema == {"id": int, "value": str}
98
+ default_value = None if isinstance(warehouse, SQLiteWarehouse) else ""
105
99
  assert sorted(chain.to_records(), key=lambda r: r["id"]) == [
106
- {"id": i, "value": None if i < 50 else str(i)} for i in range(1, 1000)
100
+ {"id": i, "value": default_value if i < 50 else str(i)} for i in range(1, 1000)
107
101
  ]
108
102
 
109
103
 
110
- @skip_if_not_sqlite
111
- def test_all_null_values(sqlite3_connection, test_session):
104
+ def test_all_null_values(sqlite3_connection, test_session, warehouse):
112
105
  sqlite3_connection.execute("CREATE TABLE tbl (id INTEGER PRIMARY KEY, num INTEGER)")
113
106
  sqlite3_connection.executemany(
114
107
  "INSERT INTO tbl(num) VALUES(?)", [(None,) for _ in range(1, 1000)]
@@ -117,9 +110,10 @@ def test_all_null_values(sqlite3_connection, test_session):
117
110
 
118
111
  chain = read_database("select * from tbl", sqlite3_connection, session=test_session)
119
112
  # if all values are null, the column type defaults to str
120
- assert chain.schema == {"id": int, "num": Optional[str]}
113
+ assert chain.schema == {"id": int, "num": str}
114
+ default_value = None if isinstance(warehouse, SQLiteWarehouse) else ""
121
115
  assert sorted(chain.to_records(), key=lambda r: r["id"]) == [
122
- {"id": i, "num": None} for i in range(1, 1000)
116
+ {"id": i, "num": default_value} for i in range(1, 1000)
123
117
  ]
124
118
 
125
119
 
@@ -128,7 +122,7 @@ def test_empty(sqlite3_connection, test_session):
128
122
 
129
123
  chain = read_database("select * from tbl", sqlite3_connection, session=test_session)
130
124
  # if the table is empty, the column type defaults to str
131
- assert chain.schema == {"id": Optional[str], "value": Optional[str]}
125
+ assert chain.schema == {"id": str, "value": str}
132
126
  assert chain.to_records() == []
133
127
 
134
128
 
@@ -173,3 +167,23 @@ def test_schema_is_not_inferred_when_all_types_are_provided(
173
167
  )
174
168
  spy.assert_called_once_with(mocker.ANY, [], 100)
175
169
  assert chain.schema == {"id": int, "value": int}
170
+
171
+
172
+ def test_json_type(sqlite3_connection, test_session):
173
+ sqlite3_connection.execute("CREATE TABLE tbl (id INTEGER PRIMARY KEY, value TEXT)")
174
+ sqlite3_connection.executemany(
175
+ "INSERT INTO tbl(value) VALUES(?)",
176
+ [(json.dumps({"i": i}),) for i in range(1, 10)],
177
+ )
178
+ sqlite3_connection.commit()
179
+
180
+ chain = read_database(
181
+ "select * from tbl",
182
+ sqlite3_connection,
183
+ output={"value": dict},
184
+ session=test_session,
185
+ )
186
+ assert chain.schema == {"id": int, "value": dict}
187
+ assert sorted(chain.to_records(), key=lambda r: r["id"]) == [
188
+ {"id": i, "value": {"i": i}} for i in range(1, 10)
189
+ ]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes