datachain 0.26.1__tar.gz → 0.26.3__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (404):
  1. {datachain-0.26.1 → datachain-0.26.3}/PKG-INFO +2 -2
  2. {datachain-0.26.1 → datachain-0.26.3}/docs/tutorials.md +1 -0
  3. {datachain-0.26.1 → datachain-0.26.3}/pyproject.toml +1 -1
  4. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/arrow.py +1 -1
  5. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/convert/flatten.py +5 -3
  6. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/data_model.py +11 -1
  7. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/hf.py +4 -2
  8. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/hf.py +31 -10
  9. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/signal_schema.py +1 -1
  10. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/udf.py +1 -1
  11. {datachain-0.26.1 → datachain-0.26.3}/src/datachain.egg-info/PKG-INFO +2 -2
  12. {datachain-0.26.1 → datachain-0.26.3}/src/datachain.egg-info/requires.txt +1 -1
  13. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_hf.py +6 -4
  14. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_hf.py +23 -17
  15. {datachain-0.26.1 → datachain-0.26.3}/.cruft.json +0 -0
  16. {datachain-0.26.1 → datachain-0.26.3}/.gitattributes +0 -0
  17. {datachain-0.26.1 → datachain-0.26.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  18. {datachain-0.26.1 → datachain-0.26.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  19. {datachain-0.26.1 → datachain-0.26.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  20. {datachain-0.26.1 → datachain-0.26.3}/.github/codecov.yaml +0 -0
  21. {datachain-0.26.1 → datachain-0.26.3}/.github/dependabot.yml +0 -0
  22. {datachain-0.26.1 → datachain-0.26.3}/.github/workflows/benchmarks.yml +0 -0
  23. {datachain-0.26.1 → datachain-0.26.3}/.github/workflows/release.yml +0 -0
  24. {datachain-0.26.1 → datachain-0.26.3}/.github/workflows/tests-studio.yml +0 -0
  25. {datachain-0.26.1 → datachain-0.26.3}/.github/workflows/tests.yml +0 -0
  26. {datachain-0.26.1 → datachain-0.26.3}/.github/workflows/update-template.yaml +0 -0
  27. {datachain-0.26.1 → datachain-0.26.3}/.gitignore +0 -0
  28. {datachain-0.26.1 → datachain-0.26.3}/.pre-commit-config.yaml +0 -0
  29. {datachain-0.26.1 → datachain-0.26.3}/CODE_OF_CONDUCT.rst +0 -0
  30. {datachain-0.26.1 → datachain-0.26.3}/LICENSE +0 -0
  31. {datachain-0.26.1 → datachain-0.26.3}/README.rst +0 -0
  32. {datachain-0.26.1 → datachain-0.26.3}/docs/assets/captioned_cartoons.png +0 -0
  33. {datachain-0.26.1 → datachain-0.26.3}/docs/assets/datachain-white.svg +0 -0
  34. {datachain-0.26.1 → datachain-0.26.3}/docs/assets/datachain.svg +0 -0
  35. {datachain-0.26.1 → datachain-0.26.3}/docs/commands/auth/login.md +0 -0
  36. {datachain-0.26.1 → datachain-0.26.3}/docs/commands/auth/logout.md +0 -0
  37. {datachain-0.26.1 → datachain-0.26.3}/docs/commands/auth/team.md +0 -0
  38. {datachain-0.26.1 → datachain-0.26.3}/docs/commands/auth/token.md +0 -0
  39. {datachain-0.26.1 → datachain-0.26.3}/docs/commands/index.md +0 -0
  40. {datachain-0.26.1 → datachain-0.26.3}/docs/commands/job/cancel.md +0 -0
  41. {datachain-0.26.1 → datachain-0.26.3}/docs/commands/job/clusters.md +0 -0
  42. {datachain-0.26.1 → datachain-0.26.3}/docs/commands/job/logs.md +0 -0
  43. {datachain-0.26.1 → datachain-0.26.3}/docs/commands/job/ls.md +0 -0
  44. {datachain-0.26.1 → datachain-0.26.3}/docs/commands/job/run.md +0 -0
  45. {datachain-0.26.1 → datachain-0.26.3}/docs/contributing.md +0 -0
  46. {datachain-0.26.1 → datachain-0.26.3}/docs/css/github-permalink-style.css +0 -0
  47. {datachain-0.26.1 → datachain-0.26.3}/docs/examples.md +0 -0
  48. {datachain-0.26.1 → datachain-0.26.3}/docs/guide/db_migrations.md +0 -0
  49. {datachain-0.26.1 → datachain-0.26.3}/docs/guide/delta.md +0 -0
  50. {datachain-0.26.1 → datachain-0.26.3}/docs/guide/env.md +0 -0
  51. {datachain-0.26.1 → datachain-0.26.3}/docs/guide/index.md +0 -0
  52. {datachain-0.26.1 → datachain-0.26.3}/docs/guide/namespaces.md +0 -0
  53. {datachain-0.26.1 → datachain-0.26.3}/docs/guide/processing.md +0 -0
  54. {datachain-0.26.1 → datachain-0.26.3}/docs/guide/remotes.md +0 -0
  55. {datachain-0.26.1 → datachain-0.26.3}/docs/guide/retry.md +0 -0
  56. {datachain-0.26.1 → datachain-0.26.3}/docs/index.md +0 -0
  57. {datachain-0.26.1 → datachain-0.26.3}/docs/overrides/main.html +0 -0
  58. {datachain-0.26.1 → datachain-0.26.3}/docs/quick-start.md +0 -0
  59. {datachain-0.26.1 → datachain-0.26.3}/docs/references/data-types/arrowrow.md +0 -0
  60. {datachain-0.26.1 → datachain-0.26.3}/docs/references/data-types/bbox.md +0 -0
  61. {datachain-0.26.1 → datachain-0.26.3}/docs/references/data-types/file.md +0 -0
  62. {datachain-0.26.1 → datachain-0.26.3}/docs/references/data-types/imagefile.md +0 -0
  63. {datachain-0.26.1 → datachain-0.26.3}/docs/references/data-types/index.md +0 -0
  64. {datachain-0.26.1 → datachain-0.26.3}/docs/references/data-types/pose.md +0 -0
  65. {datachain-0.26.1 → datachain-0.26.3}/docs/references/data-types/segment.md +0 -0
  66. {datachain-0.26.1 → datachain-0.26.3}/docs/references/data-types/tarvfile.md +0 -0
  67. {datachain-0.26.1 → datachain-0.26.3}/docs/references/data-types/textfile.md +0 -0
  68. {datachain-0.26.1 → datachain-0.26.3}/docs/references/data-types/videofile.md +0 -0
  69. {datachain-0.26.1 → datachain-0.26.3}/docs/references/datachain.md +0 -0
  70. {datachain-0.26.1 → datachain-0.26.3}/docs/references/func.md +0 -0
  71. {datachain-0.26.1 → datachain-0.26.3}/docs/references/index.md +0 -0
  72. {datachain-0.26.1 → datachain-0.26.3}/docs/references/toolkit.md +0 -0
  73. {datachain-0.26.1 → datachain-0.26.3}/docs/references/torch.md +0 -0
  74. {datachain-0.26.1 → datachain-0.26.3}/docs/references/udf.md +0 -0
  75. {datachain-0.26.1 → datachain-0.26.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  76. {datachain-0.26.1 → datachain-0.26.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  77. {datachain-0.26.1 → datachain-0.26.3}/examples/computer_vision/openimage-detect.py +0 -0
  78. {datachain-0.26.1 → datachain-0.26.3}/examples/computer_vision/ultralytics-bbox.py +0 -0
  79. {datachain-0.26.1 → datachain-0.26.3}/examples/computer_vision/ultralytics-pose.py +0 -0
  80. {datachain-0.26.1 → datachain-0.26.3}/examples/computer_vision/ultralytics-segment.py +0 -0
  81. {datachain-0.26.1 → datachain-0.26.3}/examples/get_started/common_sql_functions.py +0 -0
  82. {datachain-0.26.1 → datachain-0.26.3}/examples/get_started/json-csv-reader.py +0 -0
  83. {datachain-0.26.1 → datachain-0.26.3}/examples/get_started/torch-loader.py +0 -0
  84. {datachain-0.26.1 → datachain-0.26.3}/examples/get_started/udfs/parallel.py +0 -0
  85. {datachain-0.26.1 → datachain-0.26.3}/examples/get_started/udfs/simple.py +0 -0
  86. {datachain-0.26.1 → datachain-0.26.3}/examples/get_started/udfs/stateful.py +0 -0
  87. {datachain-0.26.1 → datachain-0.26.3}/examples/incremental_processing/delta.py +0 -0
  88. {datachain-0.26.1 → datachain-0.26.3}/examples/incremental_processing/retry.py +0 -0
  89. {datachain-0.26.1 → datachain-0.26.3}/examples/incremental_processing/utils.py +0 -0
  90. {datachain-0.26.1 → datachain-0.26.3}/examples/llm_and_nlp/claude-query.py +0 -0
  91. {datachain-0.26.1 → datachain-0.26.3}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  92. {datachain-0.26.1 → datachain-0.26.3}/examples/multimodal/audio-to-text.py +0 -0
  93. {datachain-0.26.1 → datachain-0.26.3}/examples/multimodal/clip_inference.py +0 -0
  94. {datachain-0.26.1 → datachain-0.26.3}/examples/multimodal/hf_pipeline.py +0 -0
  95. {datachain-0.26.1 → datachain-0.26.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
  96. {datachain-0.26.1 → datachain-0.26.3}/examples/multimodal/wds.py +0 -0
  97. {datachain-0.26.1 → datachain-0.26.3}/examples/multimodal/wds_filtered.py +0 -0
  98. {datachain-0.26.1 → datachain-0.26.3}/mkdocs.yml +0 -0
  99. {datachain-0.26.1 → datachain-0.26.3}/noxfile.py +0 -0
  100. {datachain-0.26.1 → datachain-0.26.3}/setup.cfg +0 -0
  101. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/__init__.py +0 -0
  102. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/__main__.py +0 -0
  103. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/asyn.py +0 -0
  104. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cache.py +0 -0
  105. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/catalog/__init__.py +0 -0
  106. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/catalog/catalog.py +0 -0
  107. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/catalog/datasource.py +0 -0
  108. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/catalog/loader.py +0 -0
  109. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/__init__.py +0 -0
  110. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/commands/__init__.py +0 -0
  111. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/commands/datasets.py +0 -0
  112. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/commands/du.py +0 -0
  113. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/commands/index.py +0 -0
  114. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/commands/ls.py +0 -0
  115. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/commands/misc.py +0 -0
  116. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/commands/query.py +0 -0
  117. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/commands/show.py +0 -0
  118. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/parser/__init__.py +0 -0
  119. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/parser/job.py +0 -0
  120. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/parser/studio.py +0 -0
  121. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/parser/utils.py +0 -0
  122. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/cli/utils.py +0 -0
  123. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/client/__init__.py +0 -0
  124. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/client/azure.py +0 -0
  125. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/client/fileslice.py +0 -0
  126. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/client/fsspec.py +0 -0
  127. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/client/gcs.py +0 -0
  128. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/client/hf.py +0 -0
  129. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/client/local.py +0 -0
  130. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/client/s3.py +0 -0
  131. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/config.py +0 -0
  132. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/data_storage/__init__.py +0 -0
  133. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/data_storage/db_engine.py +0 -0
  134. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/data_storage/job.py +0 -0
  135. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/data_storage/metastore.py +0 -0
  136. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/data_storage/schema.py +0 -0
  137. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/data_storage/serializer.py +0 -0
  138. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/data_storage/sqlite.py +0 -0
  139. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/data_storage/warehouse.py +0 -0
  140. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/dataset.py +0 -0
  141. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/delta.py +0 -0
  142. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/diff/__init__.py +0 -0
  143. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/error.py +0 -0
  144. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/fs/__init__.py +0 -0
  145. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/fs/reference.py +0 -0
  146. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/fs/utils.py +0 -0
  147. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/func/__init__.py +0 -0
  148. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/func/aggregate.py +0 -0
  149. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/func/array.py +0 -0
  150. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/func/base.py +0 -0
  151. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/func/conditional.py +0 -0
  152. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/func/func.py +0 -0
  153. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/func/numeric.py +0 -0
  154. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/func/path.py +0 -0
  155. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/func/random.py +0 -0
  156. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/func/string.py +0 -0
  157. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/func/window.py +0 -0
  158. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/job.py +0 -0
  159. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/__init__.py +0 -0
  160. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/audio.py +0 -0
  161. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/clip.py +0 -0
  162. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/convert/__init__.py +0 -0
  163. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
  164. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
  165. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/convert/unflatten.py +0 -0
  166. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  167. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dataset_info.py +0 -0
  168. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/__init__.py +0 -0
  169. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/csv.py +0 -0
  170. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/database.py +0 -0
  171. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/datachain.py +0 -0
  172. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/datasets.py +0 -0
  173. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/json.py +0 -0
  174. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/listings.py +0 -0
  175. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/pandas.py +0 -0
  176. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/parquet.py +0 -0
  177. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/records.py +0 -0
  178. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/storage.py +0 -0
  179. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/utils.py +0 -0
  180. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/dc/values.py +0 -0
  181. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/file.py +0 -0
  182. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/image.py +0 -0
  183. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/listing.py +0 -0
  184. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/listing_info.py +0 -0
  185. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/meta_formats.py +0 -0
  186. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/model_store.py +0 -0
  187. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/namespaces.py +0 -0
  188. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/projects.py +0 -0
  189. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/pytorch.py +0 -0
  190. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/settings.py +0 -0
  191. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/tar.py +0 -0
  192. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/text.py +0 -0
  193. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/udf_signature.py +0 -0
  194. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/utils.py +0 -0
  195. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/video.py +0 -0
  196. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/webdataset.py +0 -0
  197. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/lib/webdataset_laion.py +0 -0
  198. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/listing.py +0 -0
  199. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/model/__init__.py +0 -0
  200. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/model/bbox.py +0 -0
  201. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/model/pose.py +0 -0
  202. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/model/segment.py +0 -0
  203. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/model/ultralytics/__init__.py +0 -0
  204. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/model/ultralytics/bbox.py +0 -0
  205. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/model/ultralytics/pose.py +0 -0
  206. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/model/ultralytics/segment.py +0 -0
  207. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/model/utils.py +0 -0
  208. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/namespace.py +0 -0
  209. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/node.py +0 -0
  210. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/nodes_fetcher.py +0 -0
  211. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/nodes_thread_pool.py +0 -0
  212. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/progress.py +0 -0
  213. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/project.py +0 -0
  214. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/py.typed +0 -0
  215. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/query/__init__.py +0 -0
  216. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/query/batch.py +0 -0
  217. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/query/dataset.py +0 -0
  218. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/query/dispatch.py +0 -0
  219. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/query/metrics.py +0 -0
  220. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/query/params.py +0 -0
  221. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/query/queue.py +0 -0
  222. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/query/schema.py +0 -0
  223. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/query/session.py +0 -0
  224. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/query/udf.py +0 -0
  225. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/query/utils.py +0 -0
  226. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/remote/__init__.py +0 -0
  227. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/remote/studio.py +0 -0
  228. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/script_meta.py +0 -0
  229. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/semver.py +0 -0
  230. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/__init__.py +0 -0
  231. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/default/__init__.py +0 -0
  232. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/default/base.py +0 -0
  233. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/functions/__init__.py +0 -0
  234. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/functions/aggregate.py +0 -0
  235. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/functions/array.py +0 -0
  236. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/functions/conditional.py +0 -0
  237. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/functions/numeric.py +0 -0
  238. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/functions/path.py +0 -0
  239. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/functions/random.py +0 -0
  240. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/functions/string.py +0 -0
  241. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/selectable.py +0 -0
  242. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/sqlite/__init__.py +0 -0
  243. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/sqlite/base.py +0 -0
  244. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/sqlite/types.py +0 -0
  245. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/sqlite/vector.py +0 -0
  246. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/types.py +0 -0
  247. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/sql/utils.py +0 -0
  248. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/studio.py +0 -0
  249. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/telemetry.py +0 -0
  250. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/toolkit/__init__.py +0 -0
  251. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/toolkit/split.py +0 -0
  252. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/torch/__init__.py +0 -0
  253. {datachain-0.26.1 → datachain-0.26.3}/src/datachain/utils.py +0 -0
  254. {datachain-0.26.1 → datachain-0.26.3}/src/datachain.egg-info/SOURCES.txt +0 -0
  255. {datachain-0.26.1 → datachain-0.26.3}/src/datachain.egg-info/dependency_links.txt +0 -0
  256. {datachain-0.26.1 → datachain-0.26.3}/src/datachain.egg-info/entry_points.txt +0 -0
  257. {datachain-0.26.1 → datachain-0.26.3}/src/datachain.egg-info/top_level.txt +0 -0
  258. {datachain-0.26.1 → datachain-0.26.3}/tests/__init__.py +0 -0
  259. {datachain-0.26.1 → datachain-0.26.3}/tests/benchmarks/__init__.py +0 -0
  260. {datachain-0.26.1 → datachain-0.26.3}/tests/benchmarks/conftest.py +0 -0
  261. {datachain-0.26.1 → datachain-0.26.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  262. {datachain-0.26.1 → datachain-0.26.3}/tests/benchmarks/datasets/.dvc/config +0 -0
  263. {datachain-0.26.1 → datachain-0.26.3}/tests/benchmarks/datasets/.gitignore +0 -0
  264. {datachain-0.26.1 → datachain-0.26.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  265. {datachain-0.26.1 → datachain-0.26.3}/tests/benchmarks/test_datachain.py +0 -0
  266. {datachain-0.26.1 → datachain-0.26.3}/tests/benchmarks/test_ls.py +0 -0
  267. {datachain-0.26.1 → datachain-0.26.3}/tests/benchmarks/test_version.py +0 -0
  268. {datachain-0.26.1 → datachain-0.26.3}/tests/conftest.py +0 -0
  269. {datachain-0.26.1 → datachain-0.26.3}/tests/data.py +0 -0
  270. {datachain-0.26.1 → datachain-0.26.3}/tests/examples/__init__.py +0 -0
  271. {datachain-0.26.1 → datachain-0.26.3}/tests/examples/test_examples.py +0 -0
  272. {datachain-0.26.1 → datachain-0.26.3}/tests/examples/test_wds_e2e.py +0 -0
  273. {datachain-0.26.1 → datachain-0.26.3}/tests/examples/wds_data.py +0 -0
  274. {datachain-0.26.1 → datachain-0.26.3}/tests/func/__init__.py +0 -0
  275. {datachain-0.26.1 → datachain-0.26.3}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  276. {datachain-0.26.1 → datachain-0.26.3}/tests/func/data/lena.jpg +0 -0
  277. {datachain-0.26.1 → datachain-0.26.3}/tests/func/fake-service-account-credentials.json +0 -0
  278. {datachain-0.26.1 → datachain-0.26.3}/tests/func/functions/__init__.py +0 -0
  279. {datachain-0.26.1 → datachain-0.26.3}/tests/func/functions/test_aggregate.py +0 -0
  280. {datachain-0.26.1 → datachain-0.26.3}/tests/func/functions/test_array.py +0 -0
  281. {datachain-0.26.1 → datachain-0.26.3}/tests/func/functions/test_conditional.py +0 -0
  282. {datachain-0.26.1 → datachain-0.26.3}/tests/func/functions/test_numeric.py +0 -0
  283. {datachain-0.26.1 → datachain-0.26.3}/tests/func/functions/test_path.py +0 -0
  284. {datachain-0.26.1 → datachain-0.26.3}/tests/func/functions/test_random.py +0 -0
  285. {datachain-0.26.1 → datachain-0.26.3}/tests/func/functions/test_string.py +0 -0
  286. {datachain-0.26.1 → datachain-0.26.3}/tests/func/model/__init__.py +0 -0
  287. {datachain-0.26.1 → datachain-0.26.3}/tests/func/model/data/running-mask0.png +0 -0
  288. {datachain-0.26.1 → datachain-0.26.3}/tests/func/model/data/running-mask1.png +0 -0
  289. {datachain-0.26.1 → datachain-0.26.3}/tests/func/model/data/running.jpg +0 -0
  290. {datachain-0.26.1 → datachain-0.26.3}/tests/func/model/data/ships.jpg +0 -0
  291. {datachain-0.26.1 → datachain-0.26.3}/tests/func/model/test_yolo.py +0 -0
  292. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_audio.py +0 -0
  293. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_batching.py +0 -0
  294. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_catalog.py +0 -0
  295. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_client.py +0 -0
  296. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_cloud_transfer.py +0 -0
  297. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_data_storage.py +0 -0
  298. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_datachain.py +0 -0
  299. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_datachain_merge.py +0 -0
  300. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_dataset_query.py +0 -0
  301. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_datasets.py +0 -0
  302. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_delta.py +0 -0
  303. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_feature_pickling.py +0 -0
  304. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_file.py +0 -0
  305. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_hidden_field.py +0 -0
  306. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_image.py +0 -0
  307. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_listing.py +0 -0
  308. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_ls.py +0 -0
  309. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_meta_formats.py +0 -0
  310. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_metastore.py +0 -0
  311. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_metrics.py +0 -0
  312. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_pull.py +0 -0
  313. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_pytorch.py +0 -0
  314. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_query.py +0 -0
  315. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_read_database.py +0 -0
  316. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_read_dataset_remote.py +0 -0
  317. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  318. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_retry.py +0 -0
  319. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_session.py +0 -0
  320. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_toolkit.py +0 -0
  321. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_video.py +0 -0
  322. {datachain-0.26.1 → datachain-0.26.3}/tests/func/test_warehouse.py +0 -0
  323. {datachain-0.26.1 → datachain-0.26.3}/tests/scripts/feature_class.py +0 -0
  324. {datachain-0.26.1 → datachain-0.26.3}/tests/scripts/feature_class_exception.py +0 -0
  325. {datachain-0.26.1 → datachain-0.26.3}/tests/scripts/feature_class_parallel.py +0 -0
  326. {datachain-0.26.1 → datachain-0.26.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  327. {datachain-0.26.1 → datachain-0.26.3}/tests/scripts/name_len_slow.py +0 -0
  328. {datachain-0.26.1 → datachain-0.26.3}/tests/test_atomicity.py +0 -0
  329. {datachain-0.26.1 → datachain-0.26.3}/tests/test_cli_e2e.py +0 -0
  330. {datachain-0.26.1 → datachain-0.26.3}/tests/test_cli_studio.py +0 -0
  331. {datachain-0.26.1 → datachain-0.26.3}/tests/test_import_time.py +0 -0
  332. {datachain-0.26.1 → datachain-0.26.3}/tests/test_query_e2e.py +0 -0
  333. {datachain-0.26.1 → datachain-0.26.3}/tests/test_telemetry.py +0 -0
  334. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/__init__.py +0 -0
  335. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/__init__.py +0 -0
  336. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/conftest.py +0 -0
  337. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_arrow.py +0 -0
  338. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_audio.py +0 -0
  339. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_clip.py +0 -0
  340. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_datachain.py +0 -0
  341. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  342. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_datachain_merge.py +0 -0
  343. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_diff.py +0 -0
  344. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_feature.py +0 -0
  345. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_feature_utils.py +0 -0
  346. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_file.py +0 -0
  347. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_image.py +0 -0
  348. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_listing_info.py +0 -0
  349. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_namespace.py +0 -0
  350. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_partition_by.py +0 -0
  351. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_project.py +0 -0
  352. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_python_to_sql.py +0 -0
  353. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_schema.py +0 -0
  354. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_signal_schema.py +0 -0
  355. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_sql_to_python.py +0 -0
  356. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_text.py +0 -0
  357. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_udf.py +0 -0
  358. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_udf_signature.py +0 -0
  359. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_utils.py +0 -0
  360. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/lib/test_webdataset.py +0 -0
  361. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/model/__init__.py +0 -0
  362. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/model/test_bbox.py +0 -0
  363. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/model/test_pose.py +0 -0
  364. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/model/test_segment.py +0 -0
  365. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/model/test_utils.py +0 -0
  366. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/sql/__init__.py +0 -0
  367. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/sql/sqlite/__init__.py +0 -0
  368. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/sql/sqlite/test_types.py +0 -0
  369. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
  370. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/sql/test_array.py +0 -0
  371. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/sql/test_conditional.py +0 -0
  372. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/sql/test_path.py +0 -0
  373. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/sql/test_random.py +0 -0
  374. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/sql/test_selectable.py +0 -0
  375. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/sql/test_string.py +0 -0
  376. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_asyn.py +0 -0
  377. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_cache.py +0 -0
  378. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_catalog.py +0 -0
  379. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_catalog_loader.py +0 -0
  380. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_cli_parsing.py +0 -0
  381. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_client.py +0 -0
  382. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_client_gcs.py +0 -0
  383. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_client_s3.py +0 -0
  384. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_config.py +0 -0
  385. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_data_storage.py +0 -0
  386. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_database_engine.py +0 -0
  387. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_dataset.py +0 -0
  388. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_dispatch.py +0 -0
  389. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_fileslice.py +0 -0
  390. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_func.py +0 -0
  391. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_listing.py +0 -0
  392. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_metastore.py +0 -0
  393. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_module_exports.py +0 -0
  394. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_pytorch.py +0 -0
  395. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_query.py +0 -0
  396. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_query_metrics.py +0 -0
  397. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_query_params.py +0 -0
  398. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_script_meta.py +0 -0
  399. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_semver.py +0 -0
  400. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_serializer.py +0 -0
  401. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_session.py +0 -0
  402. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_utils.py +0 -0
  403. {datachain-0.26.1 → datachain-0.26.3}/tests/unit/test_warehouse.py +0 -0
  404. {datachain-0.26.1 → datachain-0.26.3}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.26.1
3
+ Version: 0.26.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -98,7 +98,7 @@ Requires-Dist: scipy; extra == "tests"
98
98
  Requires-Dist: ultralytics; extra == "tests"
99
99
  Provides-Extra: dev
100
100
  Requires-Dist: datachain[docs,tests]; extra == "dev"
101
- Requires-Dist: mypy==1.16.1; extra == "dev"
101
+ Requires-Dist: mypy==1.17.0; extra == "dev"
102
102
  Requires-Dist: types-python-dateutil; extra == "dev"
103
103
  Requires-Dist: types-pytz; extra == "dev"
104
104
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -7,3 +7,4 @@ title: Tutorials
7
7
  * Multimodal: [GitHub](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb)
8
8
  * LLM evaluations: [GitHub](https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb)
9
9
  * Reading JSON metadata: [GitHub](https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb)
10
+ * Processing video data: [GitHub](https://github.com/iterative/datachain-examples/blob/main/computer_vision/video_pose_detection_yolo/video-pose-detection-yolov11.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/computer_vision/video_pose_detection_yolo/video-pose-detection-yolov11.ipynb)
@@ -114,7 +114,7 @@ tests = [
114
114
  ]
115
115
  dev = [
116
116
  "datachain[docs,tests]",
117
- "mypy==1.16.1",
117
+ "mypy==1.17.0",
118
118
  "types-python-dateutil",
119
119
  "types-pytz",
120
120
  "types-PyYAML",
@@ -262,7 +262,7 @@ def _get_hf_schema(
262
262
  from datachain.lib.hf import get_output_schema, schema_from_arrow
263
263
 
264
264
  features = schema_from_arrow(schema)
265
- return features, get_output_schema(features)
265
+ return features, get_output_schema(features)[0]
266
266
  return None
267
267
 
268
268
 
@@ -6,12 +6,14 @@ from datachain.lib.model_store import ModelStore
6
6
 
7
7
 
8
8
  def flatten(obj: BaseModel) -> tuple:
9
- return tuple(_flatten_fields_values(obj.model_fields, obj))
9
+ return tuple(_flatten_fields_values(type(obj).model_fields, obj))
10
10
 
11
11
 
12
12
  def flatten_list(obj_list: list[BaseModel]) -> tuple:
13
13
  return tuple(
14
- val for obj in obj_list for val in _flatten_fields_values(obj.model_fields, obj)
14
+ val
15
+ for obj in obj_list
16
+ for val in _flatten_fields_values(type(obj).model_fields, obj)
15
17
  )
16
18
 
17
19
 
@@ -43,4 +45,4 @@ def _flatten_fields_values(fields: dict, obj: BaseModel) -> Generator:
43
45
 
44
46
 
45
47
  def _flatten(obj: BaseModel) -> tuple:
46
- return tuple(_flatten_fields_values(obj.model_fields, obj))
48
+ return tuple(_flatten_fields_values(type(obj).model_fields, obj))
@@ -3,6 +3,7 @@ from datetime import datetime
3
3
  from typing import ClassVar, Optional, Union, get_args, get_origin
4
4
 
5
5
  from pydantic import AliasChoices, BaseModel, Field, create_model
6
+ from pydantic.fields import FieldInfo
6
7
 
7
8
  from datachain.lib.model_store import ModelStore
8
9
  from datachain.lib.utils import normalize_col_names
@@ -89,7 +90,16 @@ def dict_to_data_model(
89
90
  }
90
91
 
91
92
  class _DataModelStrict(BaseModel, extra="forbid"):
92
- pass
93
+ @classmethod
94
+ def _model_fields_by_aliases(cls) -> dict[str, tuple[str, FieldInfo]]:
95
+ """Returns a map of aliases to original field names and info."""
96
+ field_info = {}
97
+ for _name, field in cls.model_fields.items():
98
+ assert isinstance(field.validation_alias, AliasChoices)
99
+ # Add mapping for all aliases (both normalized and original names)
100
+ for alias in field.validation_alias.choices:
101
+ field_info[str(alias)] = (_name, field)
102
+ return field_info
93
103
 
94
104
  return create_model(
95
105
  name,
@@ -32,6 +32,7 @@ def read_hf(
32
32
  Parameters:
33
33
  dataset : Path or name of the dataset to read from Hugging Face Hub,
34
34
  or an instance of `datasets.Dataset`-like object.
35
+ args : Additional positional arguments to pass to datasets.load_dataset.
35
36
  session : Session to use for the chain.
36
37
  settings : Settings to use for the chain.
37
38
  column : Generated object column name.
@@ -64,8 +65,9 @@ def read_hf(
64
65
 
65
66
  model_name = model_name or column or ""
66
67
  hf_features = next(iter(ds_dict.values())).features
67
- output = output | get_output_schema(hf_features)
68
- model = dict_to_data_model(model_name, output)
68
+ hf_output, normalized_names = get_output_schema(hf_features, list(output.keys()))
69
+ output = output | hf_output
70
+ model = dict_to_data_model(model_name, output, list(normalized_names.values()))
69
71
  if column:
70
72
  output = {column: model}
71
73
 
@@ -26,7 +26,7 @@ except ImportError as exc:
26
26
  ) from exc
27
27
 
28
28
  from io import BytesIO
29
- from typing import TYPE_CHECKING, Any, Union
29
+ from typing import TYPE_CHECKING, Any, Optional, Union
30
30
 
31
31
  import PIL
32
32
  from tqdm.auto import tqdm
@@ -34,6 +34,7 @@ from tqdm.auto import tqdm
34
34
  from datachain.lib.arrow import arrow_type_mapper
35
35
  from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
36
36
  from datachain.lib.udf import Generator
37
+ from datachain.lib.utils import normalize_col_names
37
38
 
38
39
  if TYPE_CHECKING:
39
40
  import pyarrow as pa
@@ -94,14 +95,18 @@ class HFGenerator(Generator):
94
95
  ds = self.ds_dict[split]
95
96
  if split:
96
97
  desc += f" split '{split}'"
98
+ model_fields = self.output_schema._model_fields_by_aliases() # type: ignore[attr-defined]
97
99
  with tqdm(desc=desc, unit=" rows", leave=False) as pbar:
98
100
  for row in ds:
99
101
  output_dict = {}
100
102
  if split and "split" in self.output_schema.model_fields:
101
103
  output_dict["split"] = split
102
104
  for name, feat in ds.features.items():
103
- anno = self.output_schema.model_fields[name].annotation
104
- output_dict[name] = convert_feature(row[name], feat, anno)
105
+ normalized_name, info = model_fields[name]
106
+ anno = info.annotation
107
+ output_dict[normalized_name] = convert_feature(
108
+ row[name], feat, anno
109
+ )
105
110
  yield self.output_schema(**output_dict)
106
111
  pbar.update(1)
107
112
 
@@ -122,10 +127,12 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
122
127
  return HFClassLabel(string=feat.names[val], integer=val)
123
128
  if isinstance(feat, dict):
124
129
  sdict = {}
130
+ model_fields = anno._model_fields_by_aliases() # type: ignore[attr-defined]
125
131
  for sname in val:
126
132
  sfeat = feat[sname]
127
- sanno = anno.model_fields[sname].annotation
128
- sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
133
+ norm_name, info = model_fields[sname]
134
+ sanno = info.annotation
135
+ sdict[norm_name] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
129
136
  return anno(**sdict)
130
137
  if isinstance(feat, Image):
131
138
  if isinstance(val, dict):
@@ -135,12 +142,26 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
135
142
  return HFAudio(array=val["array"], sampling_rate=val["sampling_rate"])
136
143
 
137
144
 
138
- def get_output_schema(features: Features) -> dict[str, DataType]:
139
- """Generate UDF output schema from huggingface datasets features."""
145
+ def get_output_schema(
146
+ features: Features, existing_column_names: Optional[list[str]] = None
147
+ ) -> tuple[dict[str, DataType], dict[str, str]]:
148
+ """
149
+ Generate UDF output schema from Hugging Face datasets features. It normalizes the
150
+ column names and returns a mapping of normalized names to original names along with
151
+ the data types. `existing_column_names` is the list of column names that already
152
+ exist in the dataset (to avoid name collisions due to normalization).
153
+ """
154
+ existing_column_names = existing_column_names or []
140
155
  fields_dict = {}
141
- for name, val in features.items():
142
- fields_dict[name] = _feature_to_chain_type(name, val)
143
- return fields_dict
156
+ normalized_names = normalize_col_names(
157
+ existing_column_names + list(features.keys())
158
+ )
159
+ # List of tuple(str, str) for HF dataset feature names, (normalized, original)
160
+ new_feature_names = list(normalized_names.items())[len(existing_column_names) :]
161
+ for idx, feat in enumerate(features.items()):
162
+ name, val = feat
163
+ fields_dict[new_feature_names[idx][0]] = _feature_to_chain_type(name, val)
164
+ return fields_dict, normalized_names
144
165
 
145
166
 
146
167
  def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
@@ -550,7 +550,7 @@ class SignalSchema:
550
550
  ) -> None:
551
551
  if isinstance(obj, File):
552
552
  obj._set_stream(catalog, caching_enabled=cache)
553
- for field, finfo in obj.model_fields.items():
553
+ for field, finfo in type(obj).model_fields.items():
554
554
  if ModelStore.is_pydantic(finfo.annotation):
555
555
  SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
556
556
 
@@ -282,7 +282,7 @@ class UDFBase(AbstractUDF):
282
282
 
283
283
  # Check all fields for nested File objects, but only for DataModel objects
284
284
  if isinstance(obj, DataModel):
285
- for field_name in obj.model_fields:
285
+ for field_name in type(obj).model_fields:
286
286
  field_value = getattr(obj, field_name, None)
287
287
  if isinstance(field_value, DataModel):
288
288
  self._set_stream_recursive(field_value, catalog, cache, download_cb)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.26.1
3
+ Version: 0.26.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -98,7 +98,7 @@ Requires-Dist: scipy; extra == "tests"
98
98
  Requires-Dist: ultralytics; extra == "tests"
99
99
  Provides-Extra: dev
100
100
  Requires-Dist: datachain[docs,tests]; extra == "dev"
101
- Requires-Dist: mypy==1.16.1; extra == "dev"
101
+ Requires-Dist: mypy==1.17.0; extra == "dev"
102
102
  Requires-Dist: types-python-dateutil; extra == "dev"
103
103
  Requires-Dist: types-pytz; extra == "dev"
104
104
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -41,7 +41,7 @@ soundfile
41
41
 
42
42
  [dev]
43
43
  datachain[docs,tests]
44
- mypy==1.16.1
44
+ mypy==1.17.0
45
45
  types-python-dateutil
46
46
  types-pytz
47
47
  types-PyYAML
@@ -34,10 +34,11 @@ def test_hf_image(tmp_path):
34
34
  img.save(train_dir / "img1.png")
35
35
 
36
36
  ds = load_dataset("imagefolder", data_dir=tmp_path)
37
- schema = {"split": str} | get_output_schema(ds["train"].features)
37
+ hf_schema, norm_names = get_output_schema(ds["train"].features, ["split"])
38
+ schema = {"split": str} | hf_schema
38
39
  assert schema["image"] is HFImage
39
40
 
40
- gen = HFGenerator(ds, dict_to_data_model("", schema))
41
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
41
42
  gen.setup()
42
43
  row = next(iter(gen.process("train")))
43
44
  assert row.image.img == image_to_bytes(img)
@@ -56,9 +57,10 @@ def test_hf_audio(tmp_path):
56
57
  write(train_dir / "example.wav", samplerate, data.astype(np.int16))
57
58
 
58
59
  ds = load_dataset("audiofolder", data_dir=tmp_path)
59
- schema = {"split": str} | get_output_schema(ds["train"].features)
60
+ hf_schema, norm_names = get_output_schema(ds["train"].features, ["split"])
61
+ schema = {"split": str} | hf_schema
60
62
 
61
- gen = HFGenerator(ds, dict_to_data_model("", schema))
63
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
62
64
  gen.setup()
63
65
  row = next(iter(gen.process("train")))
64
66
  assert np.allclose(row.audio.array, data / amplitude, atol=1e-4)
@@ -11,37 +11,41 @@ from datachain.lib.hf import (
11
11
 
12
12
  def test_hf():
13
13
  ds = Dataset.from_dict({"pokemon": ["bulbasaur", "squirtle"]})
14
- schema = get_output_schema(ds.features)
14
+ schema, norm_names = get_output_schema(ds.features)
15
15
  assert schema["pokemon"] is str
16
16
 
17
- gen = HFGenerator(ds, dict_to_data_model("", schema))
17
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
18
18
  gen.setup()
19
19
  row = next(iter(gen.process()))
20
20
  assert row.pokemon == "bulbasaur"
21
21
 
22
22
 
23
23
  def test_hf_split():
24
- ds_train = Dataset.from_dict({"pokemon": ["bulbasaur", "squirtle"]})
25
- ds_test = Dataset.from_dict({"pokemon": ["charizard", "pikachu"]})
24
+ # Space in the column name should be normalized
25
+ ds_train = Dataset.from_dict({"pok emon": ["bulbasaur", "squirtle"]})
26
+ ds_test = Dataset.from_dict({"pok emon": ["charizard", "pikachu"]})
26
27
  ds_dict = DatasetDict({"train": ds_train, "test": ds_test})
27
28
  ds_dict = stream_splits(ds_dict)
28
- schema = {"split": str} | get_output_schema(ds_dict["train"].features)
29
+ hf_schema, norm_names = get_output_schema(ds_dict["train"].features, ["split"])
30
+ schema = {"split": str} | hf_schema
29
31
 
30
- gen = HFGenerator(ds_dict, dict_to_data_model("", schema))
32
+ gen = HFGenerator(
33
+ ds_dict, dict_to_data_model("", schema, list(norm_names.values()))
34
+ )
31
35
  gen.setup()
32
36
  row = next(iter(gen.process("train")))
33
37
 
34
38
  assert row.split == "train"
35
- assert row.pokemon == "bulbasaur"
39
+ assert row.pok_emon == "bulbasaur"
36
40
 
37
41
 
38
42
  def test_hf_class_label():
39
43
  ds = Dataset.from_dict({"pokemon": ["bulbasaur", "squirtle"]})
40
44
  ds = ds.class_encode_column("pokemon")
41
- schema = get_output_schema(ds.features)
45
+ schema, norm_names = get_output_schema(ds.features)
42
46
  assert schema["pokemon"] is HFClassLabel
43
47
 
44
- gen = HFGenerator(ds, dict_to_data_model("", schema))
48
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
45
49
  gen.setup()
46
50
  row = next(iter(gen.process()))
47
51
  assert row.pokemon.string == "bulbasaur"
@@ -50,26 +54,28 @@ def test_hf_class_label():
50
54
 
51
55
  def test_hf_sequence_list():
52
56
  ds = Dataset.from_dict({"seq": [[0, 1], [2, 3]]})
53
- schema = get_output_schema(ds.features)
57
+ schema, norm_names = get_output_schema(ds.features)
54
58
  assert schema["seq"] == list[int]
55
59
 
56
- gen = HFGenerator(ds, dict_to_data_model("", schema))
60
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
57
61
  gen.setup()
58
62
  row = next(iter(gen.process()))
59
63
  assert row.seq == [0, 1]
60
64
 
61
65
 
62
66
  def test_hf_sequence_dict():
67
+ # ? in the column name should be normalized
68
+ # Check if even nested names are not normalized we handle it correctly
63
69
  ds = Dataset.from_dict(
64
- {"pokemon": [{"name": ["bulbasaur"]}, {"name": ["squirtle"]}]}
70
+ {"pokemon": [{"name?": ["bulbasaur"]}, {"name?": ["squirtle"]}]}
65
71
  )
66
72
  new_features = ds.features.copy()
67
- new_features["pokemon"] = Sequence(feature={"name": Value(dtype="string")})
73
+ new_features["pokemon"] = Sequence(feature={"name?": Value(dtype="string")})
68
74
  ds = ds.cast(new_features)
69
- schema = get_output_schema(ds.features)
75
+ schema, norm_names = get_output_schema(ds.features)
70
76
  assert schema["pokemon"].model_fields["name"].annotation == list[str]
71
77
 
72
- gen = HFGenerator(ds, dict_to_data_model("", schema))
78
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
73
79
  gen.setup()
74
80
  row = next(iter(gen.process()))
75
81
  assert row.pokemon.name == ["bulbasaur"]
@@ -80,10 +86,10 @@ def test_hf_array():
80
86
  new_features = ds.features.copy()
81
87
  new_features["arr"] = Array2D(shape=(2, 2), dtype="int32")
82
88
  ds = ds.cast(new_features)
83
- schema = get_output_schema(ds.features)
89
+ schema, norm_names = get_output_schema(ds.features)
84
90
  assert schema["arr"] == list[list[int]]
85
91
 
86
- gen = HFGenerator(ds, dict_to_data_model("", schema))
92
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
87
93
  gen.setup()
88
94
  row = next(iter(gen.process()))
89
95
  assert row.arr == [[0, 1], [2, 3]]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes