datachain 0.26.1__tar.gz → 0.26.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (404)
  1. {datachain-0.26.1 → datachain-0.26.2}/PKG-INFO +2 -2
  2. {datachain-0.26.1 → datachain-0.26.2}/pyproject.toml +1 -1
  3. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/arrow.py +1 -1
  4. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/data_model.py +11 -1
  5. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/hf.py +4 -2
  6. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/hf.py +31 -10
  7. {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/PKG-INFO +2 -2
  8. {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/requires.txt +1 -1
  9. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_hf.py +6 -4
  10. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_hf.py +23 -17
  11. {datachain-0.26.1 → datachain-0.26.2}/.cruft.json +0 -0
  12. {datachain-0.26.1 → datachain-0.26.2}/.gitattributes +0 -0
  13. {datachain-0.26.1 → datachain-0.26.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  14. {datachain-0.26.1 → datachain-0.26.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  15. {datachain-0.26.1 → datachain-0.26.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  16. {datachain-0.26.1 → datachain-0.26.2}/.github/codecov.yaml +0 -0
  17. {datachain-0.26.1 → datachain-0.26.2}/.github/dependabot.yml +0 -0
  18. {datachain-0.26.1 → datachain-0.26.2}/.github/workflows/benchmarks.yml +0 -0
  19. {datachain-0.26.1 → datachain-0.26.2}/.github/workflows/release.yml +0 -0
  20. {datachain-0.26.1 → datachain-0.26.2}/.github/workflows/tests-studio.yml +0 -0
  21. {datachain-0.26.1 → datachain-0.26.2}/.github/workflows/tests.yml +0 -0
  22. {datachain-0.26.1 → datachain-0.26.2}/.github/workflows/update-template.yaml +0 -0
  23. {datachain-0.26.1 → datachain-0.26.2}/.gitignore +0 -0
  24. {datachain-0.26.1 → datachain-0.26.2}/.pre-commit-config.yaml +0 -0
  25. {datachain-0.26.1 → datachain-0.26.2}/CODE_OF_CONDUCT.rst +0 -0
  26. {datachain-0.26.1 → datachain-0.26.2}/LICENSE +0 -0
  27. {datachain-0.26.1 → datachain-0.26.2}/README.rst +0 -0
  28. {datachain-0.26.1 → datachain-0.26.2}/docs/assets/captioned_cartoons.png +0 -0
  29. {datachain-0.26.1 → datachain-0.26.2}/docs/assets/datachain-white.svg +0 -0
  30. {datachain-0.26.1 → datachain-0.26.2}/docs/assets/datachain.svg +0 -0
  31. {datachain-0.26.1 → datachain-0.26.2}/docs/commands/auth/login.md +0 -0
  32. {datachain-0.26.1 → datachain-0.26.2}/docs/commands/auth/logout.md +0 -0
  33. {datachain-0.26.1 → datachain-0.26.2}/docs/commands/auth/team.md +0 -0
  34. {datachain-0.26.1 → datachain-0.26.2}/docs/commands/auth/token.md +0 -0
  35. {datachain-0.26.1 → datachain-0.26.2}/docs/commands/index.md +0 -0
  36. {datachain-0.26.1 → datachain-0.26.2}/docs/commands/job/cancel.md +0 -0
  37. {datachain-0.26.1 → datachain-0.26.2}/docs/commands/job/clusters.md +0 -0
  38. {datachain-0.26.1 → datachain-0.26.2}/docs/commands/job/logs.md +0 -0
  39. {datachain-0.26.1 → datachain-0.26.2}/docs/commands/job/ls.md +0 -0
  40. {datachain-0.26.1 → datachain-0.26.2}/docs/commands/job/run.md +0 -0
  41. {datachain-0.26.1 → datachain-0.26.2}/docs/contributing.md +0 -0
  42. {datachain-0.26.1 → datachain-0.26.2}/docs/css/github-permalink-style.css +0 -0
  43. {datachain-0.26.1 → datachain-0.26.2}/docs/examples.md +0 -0
  44. {datachain-0.26.1 → datachain-0.26.2}/docs/guide/db_migrations.md +0 -0
  45. {datachain-0.26.1 → datachain-0.26.2}/docs/guide/delta.md +0 -0
  46. {datachain-0.26.1 → datachain-0.26.2}/docs/guide/env.md +0 -0
  47. {datachain-0.26.1 → datachain-0.26.2}/docs/guide/index.md +0 -0
  48. {datachain-0.26.1 → datachain-0.26.2}/docs/guide/namespaces.md +0 -0
  49. {datachain-0.26.1 → datachain-0.26.2}/docs/guide/processing.md +0 -0
  50. {datachain-0.26.1 → datachain-0.26.2}/docs/guide/remotes.md +0 -0
  51. {datachain-0.26.1 → datachain-0.26.2}/docs/guide/retry.md +0 -0
  52. {datachain-0.26.1 → datachain-0.26.2}/docs/index.md +0 -0
  53. {datachain-0.26.1 → datachain-0.26.2}/docs/overrides/main.html +0 -0
  54. {datachain-0.26.1 → datachain-0.26.2}/docs/quick-start.md +0 -0
  55. {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/arrowrow.md +0 -0
  56. {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/bbox.md +0 -0
  57. {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/file.md +0 -0
  58. {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/imagefile.md +0 -0
  59. {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/index.md +0 -0
  60. {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/pose.md +0 -0
  61. {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/segment.md +0 -0
  62. {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/tarvfile.md +0 -0
  63. {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/textfile.md +0 -0
  64. {datachain-0.26.1 → datachain-0.26.2}/docs/references/data-types/videofile.md +0 -0
  65. {datachain-0.26.1 → datachain-0.26.2}/docs/references/datachain.md +0 -0
  66. {datachain-0.26.1 → datachain-0.26.2}/docs/references/func.md +0 -0
  67. {datachain-0.26.1 → datachain-0.26.2}/docs/references/index.md +0 -0
  68. {datachain-0.26.1 → datachain-0.26.2}/docs/references/toolkit.md +0 -0
  69. {datachain-0.26.1 → datachain-0.26.2}/docs/references/torch.md +0 -0
  70. {datachain-0.26.1 → datachain-0.26.2}/docs/references/udf.md +0 -0
  71. {datachain-0.26.1 → datachain-0.26.2}/docs/tutorials.md +0 -0
  72. {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  73. {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  74. {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/openimage-detect.py +0 -0
  75. {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/ultralytics-bbox.py +0 -0
  76. {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/ultralytics-pose.py +0 -0
  77. {datachain-0.26.1 → datachain-0.26.2}/examples/computer_vision/ultralytics-segment.py +0 -0
  78. {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/common_sql_functions.py +0 -0
  79. {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/json-csv-reader.py +0 -0
  80. {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/torch-loader.py +0 -0
  81. {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/udfs/parallel.py +0 -0
  82. {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/udfs/simple.py +0 -0
  83. {datachain-0.26.1 → datachain-0.26.2}/examples/get_started/udfs/stateful.py +0 -0
  84. {datachain-0.26.1 → datachain-0.26.2}/examples/incremental_processing/delta.py +0 -0
  85. {datachain-0.26.1 → datachain-0.26.2}/examples/incremental_processing/retry.py +0 -0
  86. {datachain-0.26.1 → datachain-0.26.2}/examples/incremental_processing/utils.py +0 -0
  87. {datachain-0.26.1 → datachain-0.26.2}/examples/llm_and_nlp/claude-query.py +0 -0
  88. {datachain-0.26.1 → datachain-0.26.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  89. {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/audio-to-text.py +0 -0
  90. {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/clip_inference.py +0 -0
  91. {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/hf_pipeline.py +0 -0
  92. {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
  93. {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/wds.py +0 -0
  94. {datachain-0.26.1 → datachain-0.26.2}/examples/multimodal/wds_filtered.py +0 -0
  95. {datachain-0.26.1 → datachain-0.26.2}/mkdocs.yml +0 -0
  96. {datachain-0.26.1 → datachain-0.26.2}/noxfile.py +0 -0
  97. {datachain-0.26.1 → datachain-0.26.2}/setup.cfg +0 -0
  98. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/__init__.py +0 -0
  99. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/__main__.py +0 -0
  100. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/asyn.py +0 -0
  101. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cache.py +0 -0
  102. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/catalog/__init__.py +0 -0
  103. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/catalog/catalog.py +0 -0
  104. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/catalog/datasource.py +0 -0
  105. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/catalog/loader.py +0 -0
  106. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/__init__.py +0 -0
  107. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/__init__.py +0 -0
  108. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/datasets.py +0 -0
  109. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/du.py +0 -0
  110. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/index.py +0 -0
  111. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/ls.py +0 -0
  112. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/misc.py +0 -0
  113. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/query.py +0 -0
  114. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/commands/show.py +0 -0
  115. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/parser/__init__.py +0 -0
  116. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/parser/job.py +0 -0
  117. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/parser/studio.py +0 -0
  118. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/parser/utils.py +0 -0
  119. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/cli/utils.py +0 -0
  120. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/__init__.py +0 -0
  121. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/azure.py +0 -0
  122. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/fileslice.py +0 -0
  123. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/fsspec.py +0 -0
  124. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/gcs.py +0 -0
  125. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/hf.py +0 -0
  126. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/local.py +0 -0
  127. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/client/s3.py +0 -0
  128. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/config.py +0 -0
  129. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/__init__.py +0 -0
  130. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/db_engine.py +0 -0
  131. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/job.py +0 -0
  132. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/metastore.py +0 -0
  133. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/schema.py +0 -0
  134. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/serializer.py +0 -0
  135. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/sqlite.py +0 -0
  136. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/data_storage/warehouse.py +0 -0
  137. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/dataset.py +0 -0
  138. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/delta.py +0 -0
  139. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/diff/__init__.py +0 -0
  140. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/error.py +0 -0
  141. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/fs/__init__.py +0 -0
  142. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/fs/reference.py +0 -0
  143. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/fs/utils.py +0 -0
  144. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/__init__.py +0 -0
  145. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/aggregate.py +0 -0
  146. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/array.py +0 -0
  147. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/base.py +0 -0
  148. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/conditional.py +0 -0
  149. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/func.py +0 -0
  150. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/numeric.py +0 -0
  151. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/path.py +0 -0
  152. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/random.py +0 -0
  153. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/string.py +0 -0
  154. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/func/window.py +0 -0
  155. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/job.py +0 -0
  156. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/__init__.py +0 -0
  157. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/audio.py +0 -0
  158. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/clip.py +0 -0
  159. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/__init__.py +0 -0
  160. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/flatten.py +0 -0
  161. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
  162. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
  163. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/unflatten.py +0 -0
  164. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  165. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dataset_info.py +0 -0
  166. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/__init__.py +0 -0
  167. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/csv.py +0 -0
  168. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/database.py +0 -0
  169. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/datachain.py +0 -0
  170. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/datasets.py +0 -0
  171. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/json.py +0 -0
  172. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/listings.py +0 -0
  173. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/pandas.py +0 -0
  174. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/parquet.py +0 -0
  175. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/records.py +0 -0
  176. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/storage.py +0 -0
  177. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/utils.py +0 -0
  178. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/dc/values.py +0 -0
  179. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/file.py +0 -0
  180. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/image.py +0 -0
  181. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/listing.py +0 -0
  182. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/listing_info.py +0 -0
  183. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/meta_formats.py +0 -0
  184. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/model_store.py +0 -0
  185. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/namespaces.py +0 -0
  186. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/projects.py +0 -0
  187. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/pytorch.py +0 -0
  188. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/settings.py +0 -0
  189. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/signal_schema.py +0 -0
  190. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/tar.py +0 -0
  191. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/text.py +0 -0
  192. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/udf.py +0 -0
  193. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/udf_signature.py +0 -0
  194. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/utils.py +0 -0
  195. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/video.py +0 -0
  196. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/webdataset.py +0 -0
  197. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/lib/webdataset_laion.py +0 -0
  198. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/listing.py +0 -0
  199. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/__init__.py +0 -0
  200. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/bbox.py +0 -0
  201. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/pose.py +0 -0
  202. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/segment.py +0 -0
  203. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/ultralytics/__init__.py +0 -0
  204. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/ultralytics/bbox.py +0 -0
  205. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/ultralytics/pose.py +0 -0
  206. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/ultralytics/segment.py +0 -0
  207. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/model/utils.py +0 -0
  208. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/namespace.py +0 -0
  209. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/node.py +0 -0
  210. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/nodes_fetcher.py +0 -0
  211. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/nodes_thread_pool.py +0 -0
  212. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/progress.py +0 -0
  213. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/project.py +0 -0
  214. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/py.typed +0 -0
  215. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/__init__.py +0 -0
  216. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/batch.py +0 -0
  217. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/dataset.py +0 -0
  218. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/dispatch.py +0 -0
  219. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/metrics.py +0 -0
  220. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/params.py +0 -0
  221. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/queue.py +0 -0
  222. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/schema.py +0 -0
  223. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/session.py +0 -0
  224. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/udf.py +0 -0
  225. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/query/utils.py +0 -0
  226. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/remote/__init__.py +0 -0
  227. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/remote/studio.py +0 -0
  228. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/script_meta.py +0 -0
  229. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/semver.py +0 -0
  230. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/__init__.py +0 -0
  231. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/default/__init__.py +0 -0
  232. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/default/base.py +0 -0
  233. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/__init__.py +0 -0
  234. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/aggregate.py +0 -0
  235. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/array.py +0 -0
  236. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/conditional.py +0 -0
  237. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/numeric.py +0 -0
  238. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/path.py +0 -0
  239. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/random.py +0 -0
  240. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/functions/string.py +0 -0
  241. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/selectable.py +0 -0
  242. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/sqlite/__init__.py +0 -0
  243. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/sqlite/base.py +0 -0
  244. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/sqlite/types.py +0 -0
  245. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/sqlite/vector.py +0 -0
  246. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/types.py +0 -0
  247. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/sql/utils.py +0 -0
  248. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/studio.py +0 -0
  249. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/telemetry.py +0 -0
  250. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/toolkit/__init__.py +0 -0
  251. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/toolkit/split.py +0 -0
  252. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/torch/__init__.py +0 -0
  253. {datachain-0.26.1 → datachain-0.26.2}/src/datachain/utils.py +0 -0
  254. {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/SOURCES.txt +0 -0
  255. {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/dependency_links.txt +0 -0
  256. {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/entry_points.txt +0 -0
  257. {datachain-0.26.1 → datachain-0.26.2}/src/datachain.egg-info/top_level.txt +0 -0
  258. {datachain-0.26.1 → datachain-0.26.2}/tests/__init__.py +0 -0
  259. {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/__init__.py +0 -0
  260. {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/conftest.py +0 -0
  261. {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  262. {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/datasets/.dvc/config +0 -0
  263. {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/datasets/.gitignore +0 -0
  264. {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  265. {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/test_datachain.py +0 -0
  266. {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/test_ls.py +0 -0
  267. {datachain-0.26.1 → datachain-0.26.2}/tests/benchmarks/test_version.py +0 -0
  268. {datachain-0.26.1 → datachain-0.26.2}/tests/conftest.py +0 -0
  269. {datachain-0.26.1 → datachain-0.26.2}/tests/data.py +0 -0
  270. {datachain-0.26.1 → datachain-0.26.2}/tests/examples/__init__.py +0 -0
  271. {datachain-0.26.1 → datachain-0.26.2}/tests/examples/test_examples.py +0 -0
  272. {datachain-0.26.1 → datachain-0.26.2}/tests/examples/test_wds_e2e.py +0 -0
  273. {datachain-0.26.1 → datachain-0.26.2}/tests/examples/wds_data.py +0 -0
  274. {datachain-0.26.1 → datachain-0.26.2}/tests/func/__init__.py +0 -0
  275. {datachain-0.26.1 → datachain-0.26.2}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  276. {datachain-0.26.1 → datachain-0.26.2}/tests/func/data/lena.jpg +0 -0
  277. {datachain-0.26.1 → datachain-0.26.2}/tests/func/fake-service-account-credentials.json +0 -0
  278. {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/__init__.py +0 -0
  279. {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_aggregate.py +0 -0
  280. {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_array.py +0 -0
  281. {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_conditional.py +0 -0
  282. {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_numeric.py +0 -0
  283. {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_path.py +0 -0
  284. {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_random.py +0 -0
  285. {datachain-0.26.1 → datachain-0.26.2}/tests/func/functions/test_string.py +0 -0
  286. {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/__init__.py +0 -0
  287. {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/data/running-mask0.png +0 -0
  288. {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/data/running-mask1.png +0 -0
  289. {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/data/running.jpg +0 -0
  290. {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/data/ships.jpg +0 -0
  291. {datachain-0.26.1 → datachain-0.26.2}/tests/func/model/test_yolo.py +0 -0
  292. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_audio.py +0 -0
  293. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_batching.py +0 -0
  294. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_catalog.py +0 -0
  295. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_client.py +0 -0
  296. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_cloud_transfer.py +0 -0
  297. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_data_storage.py +0 -0
  298. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_datachain.py +0 -0
  299. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_datachain_merge.py +0 -0
  300. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_dataset_query.py +0 -0
  301. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_datasets.py +0 -0
  302. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_delta.py +0 -0
  303. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_feature_pickling.py +0 -0
  304. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_file.py +0 -0
  305. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_hidden_field.py +0 -0
  306. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_image.py +0 -0
  307. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_listing.py +0 -0
  308. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_ls.py +0 -0
  309. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_meta_formats.py +0 -0
  310. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_metastore.py +0 -0
  311. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_metrics.py +0 -0
  312. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_pull.py +0 -0
  313. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_pytorch.py +0 -0
  314. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_query.py +0 -0
  315. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_read_database.py +0 -0
  316. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_read_dataset_remote.py +0 -0
  317. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  318. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_retry.py +0 -0
  319. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_session.py +0 -0
  320. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_toolkit.py +0 -0
  321. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_video.py +0 -0
  322. {datachain-0.26.1 → datachain-0.26.2}/tests/func/test_warehouse.py +0 -0
  323. {datachain-0.26.1 → datachain-0.26.2}/tests/scripts/feature_class.py +0 -0
  324. {datachain-0.26.1 → datachain-0.26.2}/tests/scripts/feature_class_exception.py +0 -0
  325. {datachain-0.26.1 → datachain-0.26.2}/tests/scripts/feature_class_parallel.py +0 -0
  326. {datachain-0.26.1 → datachain-0.26.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  327. {datachain-0.26.1 → datachain-0.26.2}/tests/scripts/name_len_slow.py +0 -0
  328. {datachain-0.26.1 → datachain-0.26.2}/tests/test_atomicity.py +0 -0
  329. {datachain-0.26.1 → datachain-0.26.2}/tests/test_cli_e2e.py +0 -0
  330. {datachain-0.26.1 → datachain-0.26.2}/tests/test_cli_studio.py +0 -0
  331. {datachain-0.26.1 → datachain-0.26.2}/tests/test_import_time.py +0 -0
  332. {datachain-0.26.1 → datachain-0.26.2}/tests/test_query_e2e.py +0 -0
  333. {datachain-0.26.1 → datachain-0.26.2}/tests/test_telemetry.py +0 -0
  334. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/__init__.py +0 -0
  335. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/__init__.py +0 -0
  336. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/conftest.py +0 -0
  337. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_arrow.py +0 -0
  338. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_audio.py +0 -0
  339. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_clip.py +0 -0
  340. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_datachain.py +0 -0
  341. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  342. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_datachain_merge.py +0 -0
  343. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_diff.py +0 -0
  344. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_feature.py +0 -0
  345. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_feature_utils.py +0 -0
  346. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_file.py +0 -0
  347. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_image.py +0 -0
  348. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_listing_info.py +0 -0
  349. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_namespace.py +0 -0
  350. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_partition_by.py +0 -0
  351. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_project.py +0 -0
  352. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_python_to_sql.py +0 -0
  353. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_schema.py +0 -0
  354. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_signal_schema.py +0 -0
  355. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_sql_to_python.py +0 -0
  356. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_text.py +0 -0
  357. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_udf.py +0 -0
  358. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_udf_signature.py +0 -0
  359. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_utils.py +0 -0
  360. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/lib/test_webdataset.py +0 -0
  361. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/model/__init__.py +0 -0
  362. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/model/test_bbox.py +0 -0
  363. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/model/test_pose.py +0 -0
  364. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/model/test_segment.py +0 -0
  365. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/model/test_utils.py +0 -0
  366. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/__init__.py +0 -0
  367. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/sqlite/__init__.py +0 -0
  368. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/sqlite/test_types.py +0 -0
  369. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
  370. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_array.py +0 -0
  371. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_conditional.py +0 -0
  372. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_path.py +0 -0
  373. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_random.py +0 -0
  374. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_selectable.py +0 -0
  375. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/sql/test_string.py +0 -0
  376. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_asyn.py +0 -0
  377. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_cache.py +0 -0
  378. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_catalog.py +0 -0
  379. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_catalog_loader.py +0 -0
  380. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_cli_parsing.py +0 -0
  381. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_client.py +0 -0
  382. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_client_gcs.py +0 -0
  383. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_client_s3.py +0 -0
  384. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_config.py +0 -0
  385. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_data_storage.py +0 -0
  386. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_database_engine.py +0 -0
  387. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_dataset.py +0 -0
  388. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_dispatch.py +0 -0
  389. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_fileslice.py +0 -0
  390. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_func.py +0 -0
  391. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_listing.py +0 -0
  392. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_metastore.py +0 -0
  393. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_module_exports.py +0 -0
  394. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_pytorch.py +0 -0
  395. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_query.py +0 -0
  396. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_query_metrics.py +0 -0
  397. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_query_params.py +0 -0
  398. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_script_meta.py +0 -0
  399. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_semver.py +0 -0
  400. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_serializer.py +0 -0
  401. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_session.py +0 -0
  402. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_utils.py +0 -0
  403. {datachain-0.26.1 → datachain-0.26.2}/tests/unit/test_warehouse.py +0 -0
  404. {datachain-0.26.1 → datachain-0.26.2}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.26.1
3
+ Version: 0.26.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -98,7 +98,7 @@ Requires-Dist: scipy; extra == "tests"
98
98
  Requires-Dist: ultralytics; extra == "tests"
99
99
  Provides-Extra: dev
100
100
  Requires-Dist: datachain[docs,tests]; extra == "dev"
101
- Requires-Dist: mypy==1.16.1; extra == "dev"
101
+ Requires-Dist: mypy==1.17.0; extra == "dev"
102
102
  Requires-Dist: types-python-dateutil; extra == "dev"
103
103
  Requires-Dist: types-pytz; extra == "dev"
104
104
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -114,7 +114,7 @@ tests = [
114
114
  ]
115
115
  dev = [
116
116
  "datachain[docs,tests]",
117
- "mypy==1.16.1",
117
+ "mypy==1.17.0",
118
118
  "types-python-dateutil",
119
119
  "types-pytz",
120
120
  "types-PyYAML",
@@ -262,7 +262,7 @@ def _get_hf_schema(
262
262
  from datachain.lib.hf import get_output_schema, schema_from_arrow
263
263
 
264
264
  features = schema_from_arrow(schema)
265
- return features, get_output_schema(features)
265
+ return features, get_output_schema(features)[0]
266
266
  return None
267
267
 
268
268
 
@@ -3,6 +3,7 @@ from datetime import datetime
3
3
  from typing import ClassVar, Optional, Union, get_args, get_origin
4
4
 
5
5
  from pydantic import AliasChoices, BaseModel, Field, create_model
6
+ from pydantic.fields import FieldInfo
6
7
 
7
8
  from datachain.lib.model_store import ModelStore
8
9
  from datachain.lib.utils import normalize_col_names
@@ -89,7 +90,16 @@ def dict_to_data_model(
89
90
  }
90
91
 
91
92
  class _DataModelStrict(BaseModel, extra="forbid"):
92
- pass
93
+ @classmethod
94
+ def _model_fields_by_aliases(cls) -> dict[str, tuple[str, FieldInfo]]:
95
+ """Returns a map of aliases to original field names and info."""
96
+ field_info = {}
97
+ for _name, field in cls.model_fields.items():
98
+ assert isinstance(field.validation_alias, AliasChoices)
99
+ # Add mapping for all aliases (both normalized and original names)
100
+ for alias in field.validation_alias.choices:
101
+ field_info[str(alias)] = (_name, field)
102
+ return field_info
93
103
 
94
104
  return create_model(
95
105
  name,
@@ -32,6 +32,7 @@ def read_hf(
32
32
  Parameters:
33
33
  dataset : Path or name of the dataset to read from Hugging Face Hub,
34
34
  or an instance of `datasets.Dataset`-like object.
35
+ args : Additional positional arguments to pass to datasets.load_dataset.
35
36
  session : Session to use for the chain.
36
37
  settings : Settings to use for the chain.
37
38
  column : Generated object column name.
@@ -64,8 +65,9 @@ def read_hf(
64
65
 
65
66
  model_name = model_name or column or ""
66
67
  hf_features = next(iter(ds_dict.values())).features
67
- output = output | get_output_schema(hf_features)
68
- model = dict_to_data_model(model_name, output)
68
+ hf_output, normalized_names = get_output_schema(hf_features, list(output.keys()))
69
+ output = output | hf_output
70
+ model = dict_to_data_model(model_name, output, list(normalized_names.values()))
69
71
  if column:
70
72
  output = {column: model}
71
73
 
@@ -26,7 +26,7 @@ except ImportError as exc:
26
26
  ) from exc
27
27
 
28
28
  from io import BytesIO
29
- from typing import TYPE_CHECKING, Any, Union
29
+ from typing import TYPE_CHECKING, Any, Optional, Union
30
30
 
31
31
  import PIL
32
32
  from tqdm.auto import tqdm
@@ -34,6 +34,7 @@ from tqdm.auto import tqdm
34
34
  from datachain.lib.arrow import arrow_type_mapper
35
35
  from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
36
36
  from datachain.lib.udf import Generator
37
+ from datachain.lib.utils import normalize_col_names
37
38
 
38
39
  if TYPE_CHECKING:
39
40
  import pyarrow as pa
@@ -94,14 +95,18 @@ class HFGenerator(Generator):
94
95
  ds = self.ds_dict[split]
95
96
  if split:
96
97
  desc += f" split '{split}'"
98
+ model_fields = self.output_schema._model_fields_by_aliases() # type: ignore[attr-defined]
97
99
  with tqdm(desc=desc, unit=" rows", leave=False) as pbar:
98
100
  for row in ds:
99
101
  output_dict = {}
100
102
  if split and "split" in self.output_schema.model_fields:
101
103
  output_dict["split"] = split
102
104
  for name, feat in ds.features.items():
103
- anno = self.output_schema.model_fields[name].annotation
104
- output_dict[name] = convert_feature(row[name], feat, anno)
105
+ normalized_name, info = model_fields[name]
106
+ anno = info.annotation
107
+ output_dict[normalized_name] = convert_feature(
108
+ row[name], feat, anno
109
+ )
105
110
  yield self.output_schema(**output_dict)
106
111
  pbar.update(1)
107
112
 
@@ -122,10 +127,12 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
122
127
  return HFClassLabel(string=feat.names[val], integer=val)
123
128
  if isinstance(feat, dict):
124
129
  sdict = {}
130
+ model_fields = anno._model_fields_by_aliases() # type: ignore[attr-defined]
125
131
  for sname in val:
126
132
  sfeat = feat[sname]
127
- sanno = anno.model_fields[sname].annotation
128
- sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
133
+ norm_name, info = model_fields[sname]
134
+ sanno = info.annotation
135
+ sdict[norm_name] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
129
136
  return anno(**sdict)
130
137
  if isinstance(feat, Image):
131
138
  if isinstance(val, dict):
@@ -135,12 +142,26 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
135
142
  return HFAudio(array=val["array"], sampling_rate=val["sampling_rate"])
136
143
 
137
144
 
138
- def get_output_schema(features: Features) -> dict[str, DataType]:
139
- """Generate UDF output schema from huggingface datasets features."""
145
+ def get_output_schema(
146
+ features: Features, existing_column_names: Optional[list[str]] = None
147
+ ) -> tuple[dict[str, DataType], dict[str, str]]:
148
+ """
149
+ Generate UDF output schema from Hugging Face datasets features. It normalizes the
150
+ column names and returns a mapping of normalized names to original names along with
151
+ the data types. `existing_column_names` is the list of column names that already
152
+ exist in the dataset (to avoid name collisions due to normalization).
153
+ """
154
+ existing_column_names = existing_column_names or []
140
155
  fields_dict = {}
141
- for name, val in features.items():
142
- fields_dict[name] = _feature_to_chain_type(name, val)
143
- return fields_dict
156
+ normalized_names = normalize_col_names(
157
+ existing_column_names + list(features.keys())
158
+ )
159
+ # List of tuple(str, str) for HF dataset feature names, (normalized, original)
160
+ new_feature_names = list(normalized_names.items())[len(existing_column_names) :]
161
+ for idx, feat in enumerate(features.items()):
162
+ name, val = feat
163
+ fields_dict[new_feature_names[idx][0]] = _feature_to_chain_type(name, val)
164
+ return fields_dict, normalized_names
144
165
 
145
166
 
146
167
  def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.26.1
3
+ Version: 0.26.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -98,7 +98,7 @@ Requires-Dist: scipy; extra == "tests"
98
98
  Requires-Dist: ultralytics; extra == "tests"
99
99
  Provides-Extra: dev
100
100
  Requires-Dist: datachain[docs,tests]; extra == "dev"
101
- Requires-Dist: mypy==1.16.1; extra == "dev"
101
+ Requires-Dist: mypy==1.17.0; extra == "dev"
102
102
  Requires-Dist: types-python-dateutil; extra == "dev"
103
103
  Requires-Dist: types-pytz; extra == "dev"
104
104
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -41,7 +41,7 @@ soundfile
41
41
 
42
42
  [dev]
43
43
  datachain[docs,tests]
44
- mypy==1.16.1
44
+ mypy==1.17.0
45
45
  types-python-dateutil
46
46
  types-pytz
47
47
  types-PyYAML
@@ -34,10 +34,11 @@ def test_hf_image(tmp_path):
34
34
  img.save(train_dir / "img1.png")
35
35
 
36
36
  ds = load_dataset("imagefolder", data_dir=tmp_path)
37
- schema = {"split": str} | get_output_schema(ds["train"].features)
37
+ hf_schema, norm_names = get_output_schema(ds["train"].features, ["split"])
38
+ schema = {"split": str} | hf_schema
38
39
  assert schema["image"] is HFImage
39
40
 
40
- gen = HFGenerator(ds, dict_to_data_model("", schema))
41
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
41
42
  gen.setup()
42
43
  row = next(iter(gen.process("train")))
43
44
  assert row.image.img == image_to_bytes(img)
@@ -56,9 +57,10 @@ def test_hf_audio(tmp_path):
56
57
  write(train_dir / "example.wav", samplerate, data.astype(np.int16))
57
58
 
58
59
  ds = load_dataset("audiofolder", data_dir=tmp_path)
59
- schema = {"split": str} | get_output_schema(ds["train"].features)
60
+ hf_schema, norm_names = get_output_schema(ds["train"].features, ["split"])
61
+ schema = {"split": str} | hf_schema
60
62
 
61
- gen = HFGenerator(ds, dict_to_data_model("", schema))
63
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
62
64
  gen.setup()
63
65
  row = next(iter(gen.process("train")))
64
66
  assert np.allclose(row.audio.array, data / amplitude, atol=1e-4)
@@ -11,37 +11,41 @@ from datachain.lib.hf import (
11
11
 
12
12
  def test_hf():
13
13
  ds = Dataset.from_dict({"pokemon": ["bulbasaur", "squirtle"]})
14
- schema = get_output_schema(ds.features)
14
+ schema, norm_names = get_output_schema(ds.features)
15
15
  assert schema["pokemon"] is str
16
16
 
17
- gen = HFGenerator(ds, dict_to_data_model("", schema))
17
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
18
18
  gen.setup()
19
19
  row = next(iter(gen.process()))
20
20
  assert row.pokemon == "bulbasaur"
21
21
 
22
22
 
23
23
  def test_hf_split():
24
- ds_train = Dataset.from_dict({"pokemon": ["bulbasaur", "squirtle"]})
25
- ds_test = Dataset.from_dict({"pokemon": ["charizard", "pikachu"]})
24
+ # Space in the column name should be normalized
25
+ ds_train = Dataset.from_dict({"pok emon": ["bulbasaur", "squirtle"]})
26
+ ds_test = Dataset.from_dict({"pok emon": ["charizard", "pikachu"]})
26
27
  ds_dict = DatasetDict({"train": ds_train, "test": ds_test})
27
28
  ds_dict = stream_splits(ds_dict)
28
- schema = {"split": str} | get_output_schema(ds_dict["train"].features)
29
+ hf_schema, norm_names = get_output_schema(ds_dict["train"].features, ["split"])
30
+ schema = {"split": str} | hf_schema
29
31
 
30
- gen = HFGenerator(ds_dict, dict_to_data_model("", schema))
32
+ gen = HFGenerator(
33
+ ds_dict, dict_to_data_model("", schema, list(norm_names.values()))
34
+ )
31
35
  gen.setup()
32
36
  row = next(iter(gen.process("train")))
33
37
 
34
38
  assert row.split == "train"
35
- assert row.pokemon == "bulbasaur"
39
+ assert row.pok_emon == "bulbasaur"
36
40
 
37
41
 
38
42
  def test_hf_class_label():
39
43
  ds = Dataset.from_dict({"pokemon": ["bulbasaur", "squirtle"]})
40
44
  ds = ds.class_encode_column("pokemon")
41
- schema = get_output_schema(ds.features)
45
+ schema, norm_names = get_output_schema(ds.features)
42
46
  assert schema["pokemon"] is HFClassLabel
43
47
 
44
- gen = HFGenerator(ds, dict_to_data_model("", schema))
48
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
45
49
  gen.setup()
46
50
  row = next(iter(gen.process()))
47
51
  assert row.pokemon.string == "bulbasaur"
@@ -50,26 +54,28 @@ def test_hf_class_label():
50
54
 
51
55
  def test_hf_sequence_list():
52
56
  ds = Dataset.from_dict({"seq": [[0, 1], [2, 3]]})
53
- schema = get_output_schema(ds.features)
57
+ schema, norm_names = get_output_schema(ds.features)
54
58
  assert schema["seq"] == list[int]
55
59
 
56
- gen = HFGenerator(ds, dict_to_data_model("", schema))
60
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
57
61
  gen.setup()
58
62
  row = next(iter(gen.process()))
59
63
  assert row.seq == [0, 1]
60
64
 
61
65
 
62
66
  def test_hf_sequence_dict():
67
+ # ? in the column name should be normalized
68
+ # Check if even nested names are not normalized we handle it correctly
63
69
  ds = Dataset.from_dict(
64
- {"pokemon": [{"name": ["bulbasaur"]}, {"name": ["squirtle"]}]}
70
+ {"pokemon": [{"name?": ["bulbasaur"]}, {"name?": ["squirtle"]}]}
65
71
  )
66
72
  new_features = ds.features.copy()
67
- new_features["pokemon"] = Sequence(feature={"name": Value(dtype="string")})
73
+ new_features["pokemon"] = Sequence(feature={"name?": Value(dtype="string")})
68
74
  ds = ds.cast(new_features)
69
- schema = get_output_schema(ds.features)
75
+ schema, norm_names = get_output_schema(ds.features)
70
76
  assert schema["pokemon"].model_fields["name"].annotation == list[str]
71
77
 
72
- gen = HFGenerator(ds, dict_to_data_model("", schema))
78
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
73
79
  gen.setup()
74
80
  row = next(iter(gen.process()))
75
81
  assert row.pokemon.name == ["bulbasaur"]
@@ -80,10 +86,10 @@ def test_hf_array():
80
86
  new_features = ds.features.copy()
81
87
  new_features["arr"] = Array2D(shape=(2, 2), dtype="int32")
82
88
  ds = ds.cast(new_features)
83
- schema = get_output_schema(ds.features)
89
+ schema, norm_names = get_output_schema(ds.features)
84
90
  assert schema["arr"] == list[list[int]]
85
91
 
86
- gen = HFGenerator(ds, dict_to_data_model("", schema))
92
+ gen = HFGenerator(ds, dict_to_data_model("", schema, list(norm_names.values())))
87
93
  gen.setup()
88
94
  row = next(iter(gen.process()))
89
95
  assert row.arr == [[0, 1], [2, 3]]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes