datachain 0.37.7__tar.gz → 0.37.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (493)
  1. {datachain-0.37.7 → datachain-0.37.9}/PKG-INFO +1 -1
  2. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/data_storage/warehouse.py +31 -5
  3. datachain-0.37.9/src/datachain/lib/convert/values_to_tuples.py +210 -0
  4. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/data_model.py +3 -0
  5. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/datachain.py +19 -3
  6. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/signal_schema.py +72 -6
  7. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/query/dataset.py +22 -5
  8. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/toolkit/split.py +30 -8
  9. {datachain-0.37.7 → datachain-0.37.9}/src/datachain.egg-info/PKG-INFO +1 -1
  10. {datachain-0.37.7 → datachain-0.37.9}/src/datachain.egg-info/SOURCES.txt +1 -0
  11. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_data_storage.py +45 -10
  12. datachain-0.37.9/tests/func/test_retrieval.py +470 -0
  13. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_toolkit.py +34 -4
  14. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_datachain.py +29 -68
  15. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_feature_utils.py +66 -0
  16. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_signal_schema.py +107 -5
  17. datachain-0.37.7/src/datachain/lib/convert/values_to_tuples.py +0 -114
  18. {datachain-0.37.7 → datachain-0.37.9}/.cruft.json +0 -0
  19. {datachain-0.37.7 → datachain-0.37.9}/.gitattributes +0 -0
  20. {datachain-0.37.7 → datachain-0.37.9}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  21. {datachain-0.37.7 → datachain-0.37.9}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  22. {datachain-0.37.7 → datachain-0.37.9}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  23. {datachain-0.37.7 → datachain-0.37.9}/.github/codecov.yaml +0 -0
  24. {datachain-0.37.7 → datachain-0.37.9}/.github/dependabot.yml +0 -0
  25. {datachain-0.37.7 → datachain-0.37.9}/.github/workflows/benchmarks.yml +0 -0
  26. {datachain-0.37.7 → datachain-0.37.9}/.github/workflows/release.yml +0 -0
  27. {datachain-0.37.7 → datachain-0.37.9}/.github/workflows/tests-studio.yml +0 -0
  28. {datachain-0.37.7 → datachain-0.37.9}/.github/workflows/tests.yml +0 -0
  29. {datachain-0.37.7 → datachain-0.37.9}/.github/workflows/update-template.yaml +0 -0
  30. {datachain-0.37.7 → datachain-0.37.9}/.gitignore +0 -0
  31. {datachain-0.37.7 → datachain-0.37.9}/.pre-commit-config.yaml +0 -0
  32. {datachain-0.37.7 → datachain-0.37.9}/CODE_OF_CONDUCT.rst +0 -0
  33. {datachain-0.37.7 → datachain-0.37.9}/LICENSE +0 -0
  34. {datachain-0.37.7 → datachain-0.37.9}/README.rst +0 -0
  35. {datachain-0.37.7 → datachain-0.37.9}/docs/api_hooks.py +0 -0
  36. {datachain-0.37.7 → datachain-0.37.9}/docs/assets/captioned_cartoons.png +0 -0
  37. {datachain-0.37.7 → datachain-0.37.9}/docs/assets/datachain-white.svg +0 -0
  38. {datachain-0.37.7 → datachain-0.37.9}/docs/assets/datachain.svg +0 -0
  39. {datachain-0.37.7 → datachain-0.37.9}/docs/assets/webhook_dialog.png +0 -0
  40. {datachain-0.37.7 → datachain-0.37.9}/docs/assets/webhook_list.png +0 -0
  41. {datachain-0.37.7 → datachain-0.37.9}/docs/commands/auth/login.md +0 -0
  42. {datachain-0.37.7 → datachain-0.37.9}/docs/commands/auth/logout.md +0 -0
  43. {datachain-0.37.7 → datachain-0.37.9}/docs/commands/auth/team.md +0 -0
  44. {datachain-0.37.7 → datachain-0.37.9}/docs/commands/auth/token.md +0 -0
  45. {datachain-0.37.7 → datachain-0.37.9}/docs/commands/index.md +0 -0
  46. {datachain-0.37.7 → datachain-0.37.9}/docs/commands/job/cancel.md +0 -0
  47. {datachain-0.37.7 → datachain-0.37.9}/docs/commands/job/clusters.md +0 -0
  48. {datachain-0.37.7 → datachain-0.37.9}/docs/commands/job/logs.md +0 -0
  49. {datachain-0.37.7 → datachain-0.37.9}/docs/commands/job/ls.md +0 -0
  50. {datachain-0.37.7 → datachain-0.37.9}/docs/commands/job/run.md +0 -0
  51. {datachain-0.37.7 → datachain-0.37.9}/docs/contributing.md +0 -0
  52. {datachain-0.37.7 → datachain-0.37.9}/docs/css/github-permalink-style.css +0 -0
  53. {datachain-0.37.7 → datachain-0.37.9}/docs/examples.md +0 -0
  54. {datachain-0.37.7 → datachain-0.37.9}/docs/guide/checkpoints.md +0 -0
  55. {datachain-0.37.7 → datachain-0.37.9}/docs/guide/db_migrations.md +0 -0
  56. {datachain-0.37.7 → datachain-0.37.9}/docs/guide/delta.md +0 -0
  57. {datachain-0.37.7 → datachain-0.37.9}/docs/guide/env.md +0 -0
  58. {datachain-0.37.7 → datachain-0.37.9}/docs/guide/index.md +0 -0
  59. {datachain-0.37.7 → datachain-0.37.9}/docs/guide/namespaces.md +0 -0
  60. {datachain-0.37.7 → datachain-0.37.9}/docs/guide/processing.md +0 -0
  61. {datachain-0.37.7 → datachain-0.37.9}/docs/guide/remotes.md +0 -0
  62. {datachain-0.37.7 → datachain-0.37.9}/docs/guide/retry.md +0 -0
  63. {datachain-0.37.7 → datachain-0.37.9}/docs/index.md +0 -0
  64. {datachain-0.37.7 → datachain-0.37.9}/docs/overrides/main.html +0 -0
  65. {datachain-0.37.7 → datachain-0.37.9}/docs/quick-start.md +0 -0
  66. {datachain-0.37.7 → datachain-0.37.9}/docs/references/data-types/arrowrow.md +0 -0
  67. {datachain-0.37.7 → datachain-0.37.9}/docs/references/data-types/bbox.md +0 -0
  68. {datachain-0.37.7 → datachain-0.37.9}/docs/references/data-types/file.md +0 -0
  69. {datachain-0.37.7 → datachain-0.37.9}/docs/references/data-types/imagefile.md +0 -0
  70. {datachain-0.37.7 → datachain-0.37.9}/docs/references/data-types/index.md +0 -0
  71. {datachain-0.37.7 → datachain-0.37.9}/docs/references/data-types/pose.md +0 -0
  72. {datachain-0.37.7 → datachain-0.37.9}/docs/references/data-types/segment.md +0 -0
  73. {datachain-0.37.7 → datachain-0.37.9}/docs/references/data-types/tarvfile.md +0 -0
  74. {datachain-0.37.7 → datachain-0.37.9}/docs/references/data-types/textfile.md +0 -0
  75. {datachain-0.37.7 → datachain-0.37.9}/docs/references/data-types/videofile.md +0 -0
  76. {datachain-0.37.7 → datachain-0.37.9}/docs/references/datachain.md +0 -0
  77. {datachain-0.37.7 → datachain-0.37.9}/docs/references/func.md +0 -0
  78. {datachain-0.37.7 → datachain-0.37.9}/docs/references/functions/aggregate.md +0 -0
  79. {datachain-0.37.7 → datachain-0.37.9}/docs/references/functions/array.md +0 -0
  80. {datachain-0.37.7 → datachain-0.37.9}/docs/references/functions/conditional.md +0 -0
  81. {datachain-0.37.7 → datachain-0.37.9}/docs/references/functions/numeric.md +0 -0
  82. {datachain-0.37.7 → datachain-0.37.9}/docs/references/functions/path.md +0 -0
  83. {datachain-0.37.7 → datachain-0.37.9}/docs/references/functions/random.md +0 -0
  84. {datachain-0.37.7 → datachain-0.37.9}/docs/references/functions/string.md +0 -0
  85. {datachain-0.37.7 → datachain-0.37.9}/docs/references/functions/window.md +0 -0
  86. {datachain-0.37.7 → datachain-0.37.9}/docs/references/index.md +0 -0
  87. {datachain-0.37.7 → datachain-0.37.9}/docs/references/toolkit.md +0 -0
  88. {datachain-0.37.7 → datachain-0.37.9}/docs/references/torch.md +0 -0
  89. {datachain-0.37.7 → datachain-0.37.9}/docs/references/udf.md +0 -0
  90. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/api/.gitkeep +0 -0
  91. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/index.md +0 -0
  92. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/configuration/ca-certificates.md +0 -0
  93. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/configuration/git-forges/bitbucket.md +0 -0
  94. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/configuration/git-forges/github.md +0 -0
  95. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/configuration/git-forges/gitlab.md +0 -0
  96. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/configuration/git-forges/index.md +0 -0
  97. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/configuration/index.md +0 -0
  98. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/configuration/ssl-tls.md +0 -0
  99. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/index.md +0 -0
  100. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/installation/aws-ami.md +0 -0
  101. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/installation/index.md +0 -0
  102. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/installation/k8s-helm.md +0 -0
  103. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/troubleshooting/502-errors.md +0 -0
  104. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/troubleshooting/index.md +0 -0
  105. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/troubleshooting/support-bundle.md +0 -0
  106. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/upgrading/airgap-procedure.md +0 -0
  107. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/upgrading/index.md +0 -0
  108. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/self-hosting/upgrading/regular-procedure.md +0 -0
  109. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/account-management.md +0 -0
  110. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/authentication/openid-connect.md +0 -0
  111. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/authentication/single-sign-on.md +0 -0
  112. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/experiments/configure-a-project.md +0 -0
  113. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/experiments/create-a-project.md +0 -0
  114. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/experiments/explore-ml-experiments.md +0 -0
  115. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/experiments/index.md +0 -0
  116. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/experiments/live-metrics-and-plots.md +0 -0
  117. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/experiments/run-experiments.md +0 -0
  118. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/experiments/share-a-project.md +0 -0
  119. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/experiments/visualize-and-compare.md +0 -0
  120. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/git-connections/custom-gitlab-server.md +0 -0
  121. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/git-connections/github-app.md +0 -0
  122. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/git-connections/index.md +0 -0
  123. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/index.md +0 -0
  124. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/jobs/create-and-run.md +0 -0
  125. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/jobs/index.md +0 -0
  126. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/jobs/monitor-jobs.md +0 -0
  127. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/model-registry/add-a-model.md +0 -0
  128. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/model-registry/assign-stage.md +0 -0
  129. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/model-registry/register-version.md +0 -0
  130. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/model-registry/remove-a-model-or-its-details.md +0 -0
  131. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/model-registry/use-models.md +0 -0
  132. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/model-registry/view-and-compare-models.md +0 -0
  133. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/team-collaboration.md +0 -0
  134. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/user-guide/troubleshooting.md +0 -0
  135. {datachain-0.37.7 → datachain-0.37.9}/docs/studio/webhooks.md +0 -0
  136. {datachain-0.37.7 → datachain-0.37.9}/docs/templates/main.dot +0 -0
  137. {datachain-0.37.7 → datachain-0.37.9}/docs/templates/operation.dot +0 -0
  138. {datachain-0.37.7 → datachain-0.37.9}/docs/templates/responses.def +0 -0
  139. {datachain-0.37.7 → datachain-0.37.9}/docs/tutorials.md +0 -0
  140. {datachain-0.37.7 → datachain-0.37.9}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  141. {datachain-0.37.7 → datachain-0.37.9}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  142. {datachain-0.37.7 → datachain-0.37.9}/examples/computer_vision/openimage-detect.py +0 -0
  143. {datachain-0.37.7 → datachain-0.37.9}/examples/computer_vision/ultralytics-bbox.py +0 -0
  144. {datachain-0.37.7 → datachain-0.37.9}/examples/computer_vision/ultralytics-pose.py +0 -0
  145. {datachain-0.37.7 → datachain-0.37.9}/examples/computer_vision/ultralytics-segment.py +0 -0
  146. {datachain-0.37.7 → datachain-0.37.9}/examples/get_started/common_sql_functions.py +0 -0
  147. {datachain-0.37.7 → datachain-0.37.9}/examples/get_started/json-csv-reader.py +0 -0
  148. {datachain-0.37.7 → datachain-0.37.9}/examples/get_started/nested_datamodel.py +0 -0
  149. {datachain-0.37.7 → datachain-0.37.9}/examples/get_started/torch-loader.py +0 -0
  150. {datachain-0.37.7 → datachain-0.37.9}/examples/get_started/udfs/parallel.py +0 -0
  151. {datachain-0.37.7 → datachain-0.37.9}/examples/get_started/udfs/simple.py +0 -0
  152. {datachain-0.37.7 → datachain-0.37.9}/examples/get_started/udfs/stateful.py +0 -0
  153. {datachain-0.37.7 → datachain-0.37.9}/examples/incremental_processing/delta.py +0 -0
  154. {datachain-0.37.7 → datachain-0.37.9}/examples/incremental_processing/retry.py +0 -0
  155. {datachain-0.37.7 → datachain-0.37.9}/examples/incremental_processing/utils.py +0 -0
  156. {datachain-0.37.7 → datachain-0.37.9}/examples/llm_and_nlp/claude-query.py +0 -0
  157. {datachain-0.37.7 → datachain-0.37.9}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  158. {datachain-0.37.7 → datachain-0.37.9}/examples/multimodal/audio-to-text.py +0 -0
  159. {datachain-0.37.7 → datachain-0.37.9}/examples/multimodal/clip_inference.py +0 -0
  160. {datachain-0.37.7 → datachain-0.37.9}/examples/multimodal/hf_pipeline.py +0 -0
  161. {datachain-0.37.7 → datachain-0.37.9}/examples/multimodal/openai_image_desc_lib.py +0 -0
  162. {datachain-0.37.7 → datachain-0.37.9}/examples/multimodal/wds.py +0 -0
  163. {datachain-0.37.7 → datachain-0.37.9}/examples/multimodal/wds_filtered.py +0 -0
  164. {datachain-0.37.7 → datachain-0.37.9}/mkdocs.yml +0 -0
  165. {datachain-0.37.7 → datachain-0.37.9}/noxfile.py +0 -0
  166. {datachain-0.37.7 → datachain-0.37.9}/pyproject.toml +0 -0
  167. {datachain-0.37.7 → datachain-0.37.9}/setup.cfg +0 -0
  168. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/__init__.py +0 -0
  169. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/__main__.py +0 -0
  170. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/asyn.py +0 -0
  171. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cache.py +0 -0
  172. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/catalog/__init__.py +0 -0
  173. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/catalog/catalog.py +0 -0
  174. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/catalog/datasource.py +0 -0
  175. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/catalog/dependency.py +0 -0
  176. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/catalog/loader.py +0 -0
  177. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/checkpoint.py +0 -0
  178. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/__init__.py +0 -0
  179. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/commands/__init__.py +0 -0
  180. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/commands/datasets.py +0 -0
  181. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/commands/du.py +0 -0
  182. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/commands/index.py +0 -0
  183. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/commands/ls.py +0 -0
  184. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/commands/misc.py +0 -0
  185. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/commands/query.py +0 -0
  186. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/commands/show.py +0 -0
  187. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/parser/__init__.py +0 -0
  188. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/parser/job.py +0 -0
  189. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/parser/studio.py +0 -0
  190. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/parser/utils.py +0 -0
  191. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/cli/utils.py +0 -0
  192. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/client/__init__.py +0 -0
  193. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/client/azure.py +0 -0
  194. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/client/fileslice.py +0 -0
  195. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/client/fsspec.py +0 -0
  196. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/client/gcs.py +0 -0
  197. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/client/hf.py +0 -0
  198. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/client/http.py +0 -0
  199. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/client/local.py +0 -0
  200. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/client/s3.py +0 -0
  201. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/config.py +0 -0
  202. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/data_storage/__init__.py +0 -0
  203. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/data_storage/db_engine.py +0 -0
  204. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/data_storage/job.py +0 -0
  205. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/data_storage/metastore.py +0 -0
  206. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/data_storage/schema.py +0 -0
  207. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/data_storage/serializer.py +0 -0
  208. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/data_storage/sqlite.py +0 -0
  209. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/dataset.py +0 -0
  210. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/delta.py +0 -0
  211. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/diff/__init__.py +0 -0
  212. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/error.py +0 -0
  213. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/fs/__init__.py +0 -0
  214. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/fs/reference.py +0 -0
  215. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/fs/utils.py +0 -0
  216. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/func/__init__.py +0 -0
  217. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/func/aggregate.py +0 -0
  218. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/func/array.py +0 -0
  219. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/func/base.py +0 -0
  220. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/func/conditional.py +0 -0
  221. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/func/func.py +0 -0
  222. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/func/numeric.py +0 -0
  223. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/func/path.py +0 -0
  224. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/func/random.py +0 -0
  225. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/func/string.py +0 -0
  226. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/func/window.py +0 -0
  227. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/hash_utils.py +0 -0
  228. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/job.py +0 -0
  229. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/__init__.py +0 -0
  230. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/arrow.py +0 -0
  231. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/audio.py +0 -0
  232. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/clip.py +0 -0
  233. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/convert/__init__.py +0 -0
  234. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/convert/flatten.py +0 -0
  235. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/convert/python_to_sql.py +0 -0
  236. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/convert/sql_to_python.py +0 -0
  237. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/convert/unflatten.py +0 -0
  238. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dataset_info.py +0 -0
  239. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/__init__.py +0 -0
  240. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/csv.py +0 -0
  241. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/database.py +0 -0
  242. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/datasets.py +0 -0
  243. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/hf.py +0 -0
  244. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/json.py +0 -0
  245. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/listings.py +0 -0
  246. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/pandas.py +0 -0
  247. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/parquet.py +0 -0
  248. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/records.py +0 -0
  249. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/storage.py +0 -0
  250. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/storage_pattern.py +0 -0
  251. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/utils.py +0 -0
  252. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/dc/values.py +0 -0
  253. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/file.py +0 -0
  254. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/hf.py +0 -0
  255. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/image.py +0 -0
  256. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/listing.py +0 -0
  257. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/listing_info.py +0 -0
  258. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/meta_formats.py +0 -0
  259. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/model_store.py +0 -0
  260. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/namespaces.py +0 -0
  261. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/projects.py +0 -0
  262. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/pytorch.py +0 -0
  263. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/settings.py +0 -0
  264. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/tar.py +0 -0
  265. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/text.py +0 -0
  266. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/udf.py +0 -0
  267. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/udf_signature.py +0 -0
  268. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/utils.py +0 -0
  269. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/video.py +0 -0
  270. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/webdataset.py +0 -0
  271. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/lib/webdataset_laion.py +0 -0
  272. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/listing.py +0 -0
  273. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/model/__init__.py +0 -0
  274. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/model/bbox.py +0 -0
  275. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/model/pose.py +0 -0
  276. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/model/segment.py +0 -0
  277. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/model/ultralytics/__init__.py +0 -0
  278. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/model/ultralytics/bbox.py +0 -0
  279. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/model/ultralytics/pose.py +0 -0
  280. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/model/ultralytics/segment.py +0 -0
  281. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/model/utils.py +0 -0
  282. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/namespace.py +0 -0
  283. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/node.py +0 -0
  284. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/nodes_fetcher.py +0 -0
  285. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/nodes_thread_pool.py +0 -0
  286. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/plugins.py +0 -0
  287. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/progress.py +0 -0
  288. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/project.py +0 -0
  289. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/py.typed +0 -0
  290. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/query/__init__.py +0 -0
  291. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/query/batch.py +0 -0
  292. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/query/dispatch.py +0 -0
  293. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/query/metrics.py +0 -0
  294. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/query/params.py +0 -0
  295. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/query/queue.py +0 -0
  296. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/query/schema.py +0 -0
  297. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/query/session.py +0 -0
  298. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/query/udf.py +0 -0
  299. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/remote/__init__.py +0 -0
  300. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/remote/studio.py +0 -0
  301. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/script_meta.py +0 -0
  302. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/semver.py +0 -0
  303. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/__init__.py +0 -0
  304. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/default/__init__.py +0 -0
  305. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/default/base.py +0 -0
  306. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/functions/__init__.py +0 -0
  307. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/functions/aggregate.py +0 -0
  308. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/functions/array.py +0 -0
  309. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/functions/conditional.py +0 -0
  310. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/functions/numeric.py +0 -0
  311. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/functions/path.py +0 -0
  312. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/functions/random.py +0 -0
  313. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/functions/string.py +0 -0
  314. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/postgresql_dialect.py +0 -0
  315. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/postgresql_types.py +0 -0
  316. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/selectable.py +0 -0
  317. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/sqlite/__init__.py +0 -0
  318. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/sqlite/base.py +0 -0
  319. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/sqlite/types.py +0 -0
  320. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/sqlite/vector.py +0 -0
  321. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/types.py +0 -0
  322. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/sql/utils.py +0 -0
  323. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/studio.py +0 -0
  324. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/telemetry.py +0 -0
  325. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/toolkit/__init__.py +0 -0
  326. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/torch/__init__.py +0 -0
  327. {datachain-0.37.7 → datachain-0.37.9}/src/datachain/utils.py +0 -0
  328. {datachain-0.37.7 → datachain-0.37.9}/src/datachain.egg-info/dependency_links.txt +0 -0
  329. {datachain-0.37.7 → datachain-0.37.9}/src/datachain.egg-info/entry_points.txt +0 -0
  330. {datachain-0.37.7 → datachain-0.37.9}/src/datachain.egg-info/requires.txt +0 -0
  331. {datachain-0.37.7 → datachain-0.37.9}/src/datachain.egg-info/top_level.txt +0 -0
  332. {datachain-0.37.7 → datachain-0.37.9}/tests/__init__.py +0 -0
  333. {datachain-0.37.7 → datachain-0.37.9}/tests/benchmarks/__init__.py +0 -0
  334. {datachain-0.37.7 → datachain-0.37.9}/tests/benchmarks/conftest.py +0 -0
  335. {datachain-0.37.7 → datachain-0.37.9}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  336. {datachain-0.37.7 → datachain-0.37.9}/tests/benchmarks/datasets/.dvc/config +0 -0
  337. {datachain-0.37.7 → datachain-0.37.9}/tests/benchmarks/datasets/.gitignore +0 -0
  338. {datachain-0.37.7 → datachain-0.37.9}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  339. {datachain-0.37.7 → datachain-0.37.9}/tests/benchmarks/test_datachain.py +0 -0
  340. {datachain-0.37.7 → datachain-0.37.9}/tests/benchmarks/test_ls.py +0 -0
  341. {datachain-0.37.7 → datachain-0.37.9}/tests/benchmarks/test_version.py +0 -0
  342. {datachain-0.37.7 → datachain-0.37.9}/tests/conftest.py +0 -0
  343. {datachain-0.37.7 → datachain-0.37.9}/tests/data.py +0 -0
  344. {datachain-0.37.7 → datachain-0.37.9}/tests/examples/__init__.py +0 -0
  345. {datachain-0.37.7 → datachain-0.37.9}/tests/examples/test_examples.py +0 -0
  346. {datachain-0.37.7 → datachain-0.37.9}/tests/examples/test_wds_e2e.py +0 -0
  347. {datachain-0.37.7 → datachain-0.37.9}/tests/examples/wds_data.py +0 -0
  348. {datachain-0.37.7 → datachain-0.37.9}/tests/func/__init__.py +0 -0
  349. {datachain-0.37.7 → datachain-0.37.9}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  350. {datachain-0.37.7 → datachain-0.37.9}/tests/func/data/lena.jpg +0 -0
  351. {datachain-0.37.7 → datachain-0.37.9}/tests/func/fake-service-account-credentials.json +0 -0
  352. {datachain-0.37.7 → datachain-0.37.9}/tests/func/functions/__init__.py +0 -0
  353. {datachain-0.37.7 → datachain-0.37.9}/tests/func/functions/test_aggregate.py +0 -0
  354. {datachain-0.37.7 → datachain-0.37.9}/tests/func/functions/test_array.py +0 -0
  355. {datachain-0.37.7 → datachain-0.37.9}/tests/func/functions/test_conditional.py +0 -0
  356. {datachain-0.37.7 → datachain-0.37.9}/tests/func/functions/test_numeric.py +0 -0
  357. {datachain-0.37.7 → datachain-0.37.9}/tests/func/functions/test_path.py +0 -0
  358. {datachain-0.37.7 → datachain-0.37.9}/tests/func/functions/test_random.py +0 -0
  359. {datachain-0.37.7 → datachain-0.37.9}/tests/func/functions/test_string.py +0 -0
  360. {datachain-0.37.7 → datachain-0.37.9}/tests/func/model/__init__.py +0 -0
  361. {datachain-0.37.7 → datachain-0.37.9}/tests/func/model/data/running-mask0.png +0 -0
  362. {datachain-0.37.7 → datachain-0.37.9}/tests/func/model/data/running-mask1.png +0 -0
  363. {datachain-0.37.7 → datachain-0.37.9}/tests/func/model/data/running.jpg +0 -0
  364. {datachain-0.37.7 → datachain-0.37.9}/tests/func/model/data/ships.jpg +0 -0
  365. {datachain-0.37.7 → datachain-0.37.9}/tests/func/model/test_yolo.py +0 -0
  366. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_audio.py +0 -0
  367. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_catalog.py +0 -0
  368. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_checkpoints.py +0 -0
  369. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_client.py +0 -0
  370. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_cloud_transfer.py +0 -0
  371. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_datachain.py +0 -0
  372. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_datachain_merge.py +0 -0
  373. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_dataset_query.py +0 -0
  374. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_datasets.py +0 -0
  375. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_delta.py +0 -0
  376. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_feature_pickling.py +0 -0
  377. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_file.py +0 -0
  378. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_hf.py +0 -0
  379. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_hidden_field.py +0 -0
  380. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_image.py +0 -0
  381. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_listing.py +0 -0
  382. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_ls.py +0 -0
  383. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_meta_formats.py +0 -0
  384. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_metastore.py +0 -0
  385. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_metrics.py +0 -0
  386. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_mutate.py +0 -0
  387. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_pull.py +0 -0
  388. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_pytorch.py +0 -0
  389. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_query.py +0 -0
  390. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_read_database.py +0 -0
  391. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_read_dataset_remote.py +0 -0
  392. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  393. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_retry.py +0 -0
  394. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_session.py +0 -0
  395. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_storage_pattern.py +0 -0
  396. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_studio_datetime_parsing.py +0 -0
  397. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_temp_table_tracking.py +0 -0
  398. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_to_database.py +0 -0
  399. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_udf.py +0 -0
  400. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_union.py +0 -0
  401. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_video.py +0 -0
  402. {datachain-0.37.7 → datachain-0.37.9}/tests/func/test_warehouse.py +0 -0
  403. {datachain-0.37.7 → datachain-0.37.9}/tests/scripts/feature_class.py +0 -0
  404. {datachain-0.37.7 → datachain-0.37.9}/tests/scripts/feature_class_exception.py +0 -0
  405. {datachain-0.37.7 → datachain-0.37.9}/tests/scripts/feature_class_parallel.py +0 -0
  406. {datachain-0.37.7 → datachain-0.37.9}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  407. {datachain-0.37.7 → datachain-0.37.9}/tests/scripts/name_len_slow.py +0 -0
  408. {datachain-0.37.7 → datachain-0.37.9}/tests/test_atomicity.py +0 -0
  409. {datachain-0.37.7 → datachain-0.37.9}/tests/test_cli_e2e.py +0 -0
  410. {datachain-0.37.7 → datachain-0.37.9}/tests/test_cli_studio.py +0 -0
  411. {datachain-0.37.7 → datachain-0.37.9}/tests/test_import_time.py +0 -0
  412. {datachain-0.37.7 → datachain-0.37.9}/tests/test_job_management_e2e.py +0 -0
  413. {datachain-0.37.7 → datachain-0.37.9}/tests/test_query_e2e.py +0 -0
  414. {datachain-0.37.7 → datachain-0.37.9}/tests/test_telemetry.py +0 -0
  415. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/__init__.py +0 -0
  416. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/__init__.py +0 -0
  417. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/conftest.py +0 -0
  418. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_arrow.py +0 -0
  419. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_audio.py +0 -0
  420. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_checkpoints.py +0 -0
  421. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_clip.py +0 -0
  422. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  423. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_datachain_merge.py +0 -0
  424. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_diff.py +0 -0
  425. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_feature.py +0 -0
  426. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_file.py +0 -0
  427. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_hf.py +0 -0
  428. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_image.py +0 -0
  429. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_listing_info.py +0 -0
  430. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_namespace.py +0 -0
  431. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_partition_by.py +0 -0
  432. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_project.py +0 -0
  433. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_python_to_sql.py +0 -0
  434. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_schema.py +0 -0
  435. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_settings.py +0 -0
  436. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_sql_to_python.py +0 -0
  437. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_storage_pattern.py +0 -0
  438. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_text.py +0 -0
  439. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_udf.py +0 -0
  440. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_udf_signature.py +0 -0
  441. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_utils.py +0 -0
  442. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/lib/test_webdataset.py +0 -0
  443. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/model/__init__.py +0 -0
  444. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/model/test_bbox.py +0 -0
  445. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/model/test_pose.py +0 -0
  446. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/model/test_segment.py +0 -0
  447. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/model/test_utils.py +0 -0
  448. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/sql/__init__.py +0 -0
  449. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/sql/sqlite/__init__.py +0 -0
  450. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/sql/sqlite/test_types.py +0 -0
  451. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/sql/sqlite/test_utils.py +0 -0
  452. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/sql/test_array.py +0 -0
  453. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/sql/test_conditional.py +0 -0
  454. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/sql/test_path.py +0 -0
  455. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/sql/test_random.py +0 -0
  456. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/sql/test_selectable.py +0 -0
  457. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/sql/test_string.py +0 -0
  458. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_asyn.py +0 -0
  459. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_batching.py +0 -0
  460. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_cache.py +0 -0
  461. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_catalog.py +0 -0
  462. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_catalog_loader.py +0 -0
  463. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_cli_datasets.py +0 -0
  464. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_cli_parsing.py +0 -0
  465. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_client.py +0 -0
  466. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_client_gcs.py +0 -0
  467. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_client_http.py +0 -0
  468. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_client_s3.py +0 -0
  469. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_config.py +0 -0
  470. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_data_storage.py +0 -0
  471. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_database_engine.py +0 -0
  472. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_datachain_hash.py +0 -0
  473. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_dataset.py +0 -0
  474. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_dispatch.py +0 -0
  475. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_fileslice.py +0 -0
  476. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_func.py +0 -0
  477. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_hash_utils.py +0 -0
  478. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_job_management.py +0 -0
  479. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_listing.py +0 -0
  480. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_metastore.py +0 -0
  481. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_module_exports.py +0 -0
  482. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_pytorch.py +0 -0
  483. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_query.py +0 -0
  484. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_query_metrics.py +0 -0
  485. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_query_params.py +0 -0
  486. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_query_steps_hash.py +0 -0
  487. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_script_meta.py +0 -0
  488. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_semver.py +0 -0
  489. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_serializer.py +0 -0
  490. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_session.py +0 -0
  491. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_utils.py +0 -0
  492. {datachain-0.37.7 → datachain-0.37.9}/tests/unit/test_warehouse.py +0 -0
  493. {datachain-0.37.7 → datachain-0.37.9}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.37.7
3
+ Version: 0.37.9
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -18,6 +18,7 @@ from datachain.data_storage.schema import convert_rows_custom_column_types
18
18
  from datachain.data_storage.serializer import Serializable
19
19
  from datachain.dataset import DatasetRecord, StorageURI
20
20
  from datachain.lib.file import File
21
+ from datachain.lib.model_store import ModelStore
21
22
  from datachain.lib.signal_schema import SignalSchema
22
23
  from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
23
24
  from datachain.query.batch import RowsOutput
@@ -76,6 +77,29 @@ class AbstractWarehouse(ABC, Serializable):
76
77
  def cleanup_for_tests(self):
77
78
  """Cleanup for tests."""
78
79
 
80
def _to_jsonable(self, obj: Any) -> Any:
    """Recursively convert Python/Pydantic structures into JSON-serializable
    objects.

    Conversion rules:
      * Pydantic model instances are dumped with ``model_dump()`` and the
        dumped structure is then normalized with the same rules.
      * ``dict``: values are converted recursively; a key that is not a
        ``str`` is replaced by the JSON encoding of its converted form.
      * ``list`` / ``tuple`` / ``set``: converted to a ``list`` of converted
        items.
      * Anything else is returned unchanged.
    """
    if ModelStore.is_pydantic(type(obj)):
        # The dump can itself hold tuples, sets or non-string dict keys, so
        # it must go through the same normalization instead of being
        # returned as-is.
        return self._to_jsonable(obj.model_dump())

    if isinstance(obj, dict):
        out: dict[str, Any] = {}
        for k, v in obj.items():
            if isinstance(k, str):
                key_str = k
            else:
                # JSON object keys must be strings: encode complex keys as
                # the JSON text of their converted form.
                key_str = json.dumps(self._to_jsonable(k), ensure_ascii=False)
            out[key_str] = self._to_jsonable(v)
        return out

    if isinstance(obj, (list, tuple, set)):
        return [self._to_jsonable(i) for i in obj]

    return obj
102
+
79
103
  def convert_type( # noqa: PLR0911
80
104
  self,
81
105
  val: Any,
@@ -122,11 +146,13 @@ class AbstractWarehouse(ABC, Serializable):
122
146
  if col_python_type is dict or col_type_name == "JSON":
123
147
  if value_type is str:
124
148
  return val
125
- if value_type in (dict, list):
126
- return json.dumps(val, ensure_ascii=False)
127
- raise ValueError(
128
- f"Cannot convert value {val!r} with type {value_type} to JSON"
129
- )
149
+ try:
150
+ json_ready = self._to_jsonable(val)
151
+ return json.dumps(json_ready, ensure_ascii=False)
152
+ except Exception as e:
153
+ raise ValueError(
154
+ f"Cannot convert value {val!r} with type {value_type} to JSON"
155
+ ) from e
130
156
 
131
157
  if isinstance(val, col_python_type):
132
158
  return val
@@ -0,0 +1,210 @@
1
+ import itertools
2
+ from collections.abc import Sequence
3
+ from typing import Any
4
+
5
+ from datachain.lib.data_model import DataType, DataTypeNames, DataValue, is_chain_type
6
+ from datachain.lib.utils import DataChainParamsError
7
+
8
+
9
class ValuesToTupleError(DataChainParamsError):
    """Error raised when signal values cannot be converted to tuples."""

    def __init__(self, ds_name: str, msg: str):
        """Build the error message.

        Args:
            ds_name: Dataset name; when empty, no name is put in the message.
            msg: Description of what went wrong.
        """
        if ds_name:
            # Leading space goes *outside* the quotes so the message reads
            # "... for dataset 'name': ..." rather than "dataset' name'".
            ds_name = f" '{ds_name}'"
        super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}")
14
+
15
+
16
+ def _find_first_non_none(sequence: Sequence[Any]) -> Any | None:
17
+ """Find the first non-None element in a sequence."""
18
+ try:
19
+ return next(itertools.dropwhile(lambda i: i is None, sequence))
20
+ except StopIteration:
21
+ return None
22
+
23
+
24
def _infer_list_item_type(lst: list) -> type:
    """Infer the item type of ``lst`` from its first non-None element.

    Returns ``str`` when the list is empty or holds only ``None`` values.
    When the sampled element is itself a non-empty list, the result is
    ``list[T]`` with ``T`` taken from that nested list's first non-None
    element (``str`` if there is none). Only one level of nesting is
    inspected.
    """
    sample = _find_first_non_none(lst)
    if sample is None:
        # Covers both the empty list and the all-None list: default to str
        # rather than producing a generic list.
        return str

    if not isinstance(sample, list) or len(sample) == 0:
        return type(sample)

    # Nested list: look one level down.
    inner = _find_first_non_none(sample)
    if inner is None:
        return list[str]  # type: ignore[return-value]
    return list[type(inner)]  # type: ignore[misc, return-value]
46
+
47
+
48
def _infer_dict_value_type(dct: dict) -> type:
    """Infer the value type of ``dct`` from its first non-None value.

    Returns ``str`` when the dict is empty or every value is ``None``.
    When the sampled value is a non-empty list, the result is ``list[T]``
    with ``T`` taken from that list's first non-None element (``str`` if
    there is none).
    """
    sample = next((val for val in dct.values() if val is not None), None)
    if sample is None:
        # Empty dict or all-None values: default to str.
        return str

    if not isinstance(sample, list) or len(sample) == 0:
        return type(sample)

    # List value: derive the element type from its first non-None item.
    inner = _find_first_non_none(sample)
    if inner is None:
        return list[str]  # type: ignore[return-value]
    return list[type(inner)]  # type: ignore[misc, return-value]
74
+
75
+
76
def _infer_type_from_sequence(
    sequence: Sequence[DataValue], signal_name: str, ds_name: str
) -> type:
    """Infer a signal type from the first non-None value of ``sequence``.

    Args:
        sequence: Values of one signal.
        signal_name: Signal name, used in the error message.
        ds_name: Dataset name, used in the error message.

    Returns:
        ``str`` when there is no non-None value; ``list[...]`` or
        ``dict[..., ...]`` with inferred parameters for list/dict samples;
        otherwise the sample's own type.

    Raises:
        ValuesToTupleError: If the sample's type is rejected by
            ``is_chain_type``.
    """
    sample = _find_first_non_none(sequence)
    if sample is None:
        # Empty column or all-None column.
        return str

    sample_type = type(sample)
    if not is_chain_type(sample_type):
        raise ValuesToTupleError(
            ds_name,
            f"signal '{signal_name}' has unsupported type '{sample_type.__name__}'."
            f" Please use DataModel types: {DataTypeNames}",
        )

    if isinstance(sample, list):
        return list[_infer_list_item_type(sample)]  # type: ignore[misc, return-value]

    if isinstance(sample, dict):
        if len(sample) == 0:
            # Nothing to sample from: fall back to str keys and values.
            return dict[str, str]  # type: ignore[return-value]
        key_type = type(next(iter(sample.keys())))
        value_type = _infer_dict_value_type(sample)
        return dict[key_type, value_type]  # type: ignore[valid-type, return-value]

    return sample_type
113
+
114
+
115
def _validate_and_normalize_output(
    output: DataType | Sequence[str] | dict[str, DataType] | None,
    fr_map: dict[str, Sequence[DataValue]],
    ds_name: str,
) -> dict[str, DataType] | None:
    """Check ``output`` against ``fr_map`` and return it as a dict.

    Returns:
        ``None`` when ``output`` is falsy; the same dict when ``output`` is a
        dict with as many entries as ``fr_map``; ``{signal: output}`` when
        ``output`` is a single type and ``fr_map`` has exactly one signal.

    Raises:
        ValuesToTupleError: For a sequence/str ``output``, a dict whose size
            differs from ``fr_map``, a single output with a signal count
            other than one, or a single output that is not a type.
    """
    if not output:
        return None

    if isinstance(output, dict):
        n_outputs, n_signals = len(output), len(fr_map)
        if n_outputs != n_signals:
            raise ValuesToTupleError(
                ds_name,
                f"number of outputs '{n_outputs}' should match"
                f" number of signals '{n_signals}'",
            )
        return output  # type: ignore[return-value]

    if isinstance(output, (Sequence, str)):
        # Sequences (including plain strings) are not accepted as output.
        raise ValuesToTupleError(
            ds_name,
            "output type must be dict[str, DataType] while "
            f"'{type(output).__name__}' is given",
        )

    # From here on ``output`` is a single, non-container value.
    if len(fr_map) != 1:
        raise ValuesToTupleError(
            ds_name,
            f"only one output type was specified, {len(fr_map)} expected",
        )
    if not isinstance(output, type):
        raise ValuesToTupleError(
            ds_name,
            f"output must specify a type while '{output}' was given",
        )

    only_signal: str = next(iter(fr_map.keys()))
    return {only_signal: output}  # type: ignore[dict-item]
154
+
155
+
156
def values_to_tuples(
    ds_name: str = "",
    output: DataType | Sequence[str] | dict[str, DataType] | None = None,
    **fr_map: Sequence[DataValue],
) -> tuple[Any, Any, Any]:
    """Validate per-signal value sequences and combine them.

    Args:
        ds_name: Dataset name, used only in error messages.
        output: Optional output specification; normalized by
            ``_validate_and_normalize_output``. When absent, types are
            inferred from the values.
        **fr_map: One sequence of values per signal name.

    Returns:
        ``(res_type, output, res_values)``: for several signals ``res_type``
        is ``tuple[...]`` of the signal types and ``res_values`` a list of
        row tuples; for a single signal they are that signal's type and its
        original sequence. ``output`` is the signal-name-to-type dict.

    Raises:
        ValuesToTupleError: If a value is not a (non-string) sequence, a
            signal is missing from ``output``, lengths differ, or there is
            neither an output nor any signal.
    """
    output = _validate_and_normalize_output(output, fr_map, ds_name)

    inferred_types: dict[str, type] = {}
    expected_len: int | None = None
    for name, values in fr_map.items():
        if isinstance(values, str) or not isinstance(values, Sequence):  # type: ignore[unreachable]
            raise ValuesToTupleError(ds_name, f"signals '{name}' is not a sequence")
        current_len = len(values)

        if not output:
            # FIXME: inference stops at the first non-None value, so later
            # `None` values in the sequence are not taken into account.
            inferred_types[name] = _infer_type_from_sequence(values, name, ds_name)
        elif name not in output:  # type: ignore[operator]
            raise ValuesToTupleError(
                ds_name,
                f"signal '{name}' is not present in the output",
            )

        # The first signal fixes the length every other signal must match.
        if expected_len is None:
            expected_len = current_len
        elif expected_len != current_len:
            raise ValuesToTupleError(
                ds_name,
                f"signal '{name}' should have length {expected_len}"
                f" while {current_len} is given",
            )

    output = output or inferred_types  # type: ignore[assignment]
    if not output:
        raise ValuesToTupleError(
            ds_name,
            "output type must be dict[str, DataType] while empty is given"
            " and no signals are provided",
        )

    signal_types: list[type] = list(output.values())  # type: ignore[union-attr,call-arg,arg-type]
    if len(output) > 1:  # type: ignore[arg-type]
        res_type = tuple[tuple(signal_types)]  # type: ignore[misc]
        res_values: Sequence[Any] = list(zip(*fr_map.values(), strict=False))
    else:
        res_type = signal_types[0]  # type: ignore[misc]
        res_values = next(iter(fr_map.values()))

    return res_type, output, res_values
@@ -64,6 +64,9 @@ def is_chain_type(t: type) -> bool:
64
64
  if orig is list and len(args) == 1:
65
65
  return is_chain_type(get_args(t)[0])
66
66
 
67
+ if orig is dict and len(args) == 2:
68
+ return is_chain_type(args[0]) and is_chain_type(args[1])
69
+
67
70
  if orig in (Union, types.UnionType) and len(args) == 2 and (type(None) in args):
68
71
  return is_chain_type(args[0] if args[1] is type(None) else args[1])
69
72
 
@@ -52,7 +52,11 @@ from datachain.lib.udf_signature import UdfSignature
52
52
  from datachain.lib.utils import DataChainColumnError, DataChainParamsError
53
53
  from datachain.project import Project
54
54
  from datachain.query import Session
55
- from datachain.query.dataset import DatasetQuery, PartitionByType
55
+ from datachain.query.dataset import (
56
+ DatasetQuery,
57
+ PartitionByType,
58
+ RegenerateSystemColumns,
59
+ )
56
60
  from datachain.query.schema import DEFAULT_DELIMITER, Column
57
61
  from datachain.sql.functions import path as pathfunc
58
62
  from datachain.utils import batched_it, env2bool, inside_notebook, row_to_nested_dict
@@ -2740,8 +2744,20 @@ class DataChain:
2740
2744
  )
2741
2745
 
2742
2746
  def shuffle(self) -> "Self":
2743
- """Shuffle the rows of the chain deterministically."""
2744
- return self.order_by("sys.rand")
2747
+ """Shuffle rows with a best-effort deterministic ordering.
2748
+
2749
+ This produces repeatable shuffles. Merge and union operations can
2750
+ lead to non-deterministic results. Use order by or save a dataset
2751
+ afterward to guarantee the same result.
2752
+ """
2753
+ query = self._query.clone(new_table=False)
2754
+ query.steps.append(RegenerateSystemColumns(self._query.catalog))
2755
+
2756
+ chain = self._evolve(
2757
+ query=query,
2758
+ signal_schema=SignalSchema({"sys": Sys}) | self.signals_schema,
2759
+ )
2760
+ return chain.order_by("sys.rand")
2745
2761
 
2746
2762
  def sample(self, n: int) -> "Self":
2747
2763
  """Return a random sample from the chain.
@@ -1,6 +1,5 @@
1
1
  import copy
2
2
  import hashlib
3
- import json
4
3
  import logging
5
4
  import math
6
5
  import types
@@ -14,9 +13,7 @@ from typing import (
14
13
  TYPE_CHECKING,
15
14
  Annotated,
16
15
  Any,
17
- Dict, # type: ignore[UP035]
18
16
  Final,
19
- List, # type: ignore[UP035]
20
17
  Literal,
21
18
  Optional,
22
19
  Union,
@@ -24,6 +21,7 @@ from typing import (
24
21
  get_origin,
25
22
  )
26
23
 
24
+ import ujson as json
27
25
  from pydantic import BaseModel, Field, ValidationError, create_model
28
26
  from sqlalchemy import ColumnElement
29
27
  from typing_extensions import Literal as LiteralEx
@@ -569,8 +567,10 @@ class SignalSchema:
569
567
  pos = 0
570
568
  for fr_cls in self.values.values():
571
569
  if (fr := ModelStore.to_pydantic(fr_cls)) is None:
572
- res.append(row[pos])
570
+ value = row[pos]
573
571
  pos += 1
572
+ converted = self._convert_feature_value(fr_cls, value, catalog, cache)
573
+ res.append(converted)
574
574
  else:
575
575
  json, pos = unflatten_to_json_pos(fr, row, pos) # type: ignore[union-attr]
576
576
  try:
@@ -585,6 +585,72 @@ class SignalSchema:
585
585
  res.append(obj)
586
586
  return res
587
587
 
588
def _convert_feature_value(
    self,
    annotation: DataType,
    value: Any,
    catalog: "Catalog",
    cache: bool,
) -> Any:
    """Convert a raw DB value into the shape declared by ``annotation``.

    ``None`` stays ``None``. A union with exactly one non-None member is
    unwrapped to that member; any other union leaves the value untouched.
    Pydantic annotations build a model from a mapping (or reuse an existing
    instance) and pass it to ``_set_file_stream``. ``list[...]`` and
    ``dict[..., ...]`` annotations are converted element by element.
    Everything else is returned as received.
    """
    if value is None:
        return None

    origin = get_origin(annotation)
    if origin in (Union, types.UnionType):
        members = [arg for arg in get_args(annotation) if arg is not type(None)]
        if len(members) != 1:
            # Ambiguous union: no single target type to convert to.
            return value
        annotation = members[0]
        origin = get_origin(annotation)

    if ModelStore.is_pydantic(annotation):
        if isinstance(value, annotation):
            obj = value
        elif isinstance(value, Mapping):
            obj = annotation(**value)
        else:
            return value
        assert isinstance(obj, BaseModel)
        SignalSchema._set_file_stream(obj, catalog, cache)
        return obj

    if origin is list:
        type_args = get_args(annotation)
        if not (type_args and isinstance(value, (list, tuple))):
            return value
        item_type = type_args[0]
        items = []
        for item in value:
            if item is None:
                items.append(None)
            else:
                items.append(
                    self._convert_feature_value(item_type, item, catalog, cache)
                )
        return items

    if origin is dict:
        type_args = get_args(annotation)
        if not (len(type_args) == 2 and isinstance(value, dict)):
            return value
        key_type, val_type = type_args
        converted: dict[Any, Any] = {}
        for raw_key, raw_val in value.items():
            if key_type is str:
                new_key = raw_key
            else:
                # Non-string keys are expected to arrive as JSON text.
                new_key = self._convert_feature_value(
                    key_type, json.loads(raw_key), catalog, cache
                )
            if val_type is Any:
                new_val = raw_val
            else:
                new_val = self._convert_feature_value(
                    val_type, raw_val, catalog, cache
                )
            converted[new_key] = new_val
        return converted

    return value
653
+
588
654
  @staticmethod
589
655
  def _set_file_stream(
590
656
  obj: BaseModel, catalog: "Catalog", cache: bool = False
@@ -898,13 +964,13 @@ class SignalSchema:
898
964
  args = get_args(type_)
899
965
  type_str = SignalSchema._type_to_str(args[0], subtypes)
900
966
  return f"Optional[{type_str}]"
901
- if origin in (list, List): # noqa: UP006
967
+ if origin is list:
902
968
  args = get_args(type_)
903
969
  if len(args) == 0:
904
970
  return "list"
905
971
  type_str = SignalSchema._type_to_str(args[0], subtypes)
906
972
  return f"list[{type_str}]"
907
- if origin in (dict, Dict): # noqa: UP006
973
+ if origin is dict:
908
974
  args = get_args(type_)
909
975
  if len(args) == 0:
910
976
  return "dict"
@@ -786,10 +786,31 @@ class SQLClause(Step, ABC):
786
786
  return tuple(c.get_column() if isinstance(c, Function) else c for c in cols)
787
787
 
788
788
  @abstractmethod
789
- def apply_sql_clause(self, query):
789
+ def apply_sql_clause(self, query: Any) -> Any:
790
790
  pass
791
791
 
792
792
 
793
@frozen
class RegenerateSystemColumns(Step):
    """Query step that has the warehouse regenerate system columns.

    The current selection is handed to the warehouse's
    ``_regenerate_system_columns`` with ``keep_existing_columns=True``.
    """

    # Catalog whose warehouse performs the regeneration.
    catalog: "Catalog"

    def hash_inputs(self) -> str:
        """Return a constant hash: the step has no varying inputs."""
        digest = hashlib.sha256(b"regenerate_system_columns")
        return digest.hexdigest()

    def apply(
        self, query_generator: QueryGenerator, temp_tables: list[str]
    ) -> StepResult:
        """Build the step result on top of the regenerated query."""
        regenerated = self.catalog.warehouse._regenerate_system_columns(
            query_generator.select(), keep_existing_columns=True
        )

        def select_columns(*columns):
            # Narrow the regenerated query to the requested columns.
            return regenerated.with_only_columns(*columns)

        return step_result(select_columns, regenerated.selected_columns)
812
+
813
+
793
814
  @frozen
794
815
  class SQLSelect(SQLClause):
795
816
  args: tuple[Function | ColumnElement, ...]
@@ -1488,10 +1509,6 @@ class DatasetQuery:
1488
1509
  finally:
1489
1510
  self.cleanup()
1490
1511
 
1491
- def shuffle(self) -> "Self":
1492
- # ToDo: implement shaffle based on seed and/or generating random column
1493
- return self.order_by(C.sys__rand)
1494
-
1495
1512
  def sample(self, n) -> "Self":
1496
1513
  """
1497
1514
  Return a random sample from the dataset.
@@ -1,6 +1,7 @@
1
1
  import random
2
2
 
3
3
  from datachain import C, DataChain
4
+ from datachain.lib.signal_schema import SignalResolvingError
4
5
 
5
6
  RESOLUTION = 2**31 - 1 # Maximum positive value for a 32-bit signed integer.
6
7
 
@@ -59,7 +60,10 @@ def train_test_split(
59
60
  ```
60
61
 
61
62
  Note:
62
- The splits are random but deterministic, based on Dataset `sys__rand` field.
63
+ Splits reuse the same best-effort shuffle used by `DataChain.shuffle`. Results
64
+ are typically repeatable, but earlier operations such as `merge`, `union`, or
65
+ custom SQL that reshuffle rows can change the outcome between runs. Add order by
66
+ stable keys first when you need strict reproducibility.
63
67
  """
64
68
  if len(weights) < 2:
65
69
  raise ValueError("Weights should have at least two elements")
@@ -68,16 +72,34 @@ def train_test_split(
68
72
 
69
73
  weights_normalized = [weight / sum(weights) for weight in weights]
70
74
 
75
+ try:
76
+ dc.signals_schema.resolve("sys.rand")
77
+ except SignalResolvingError:
78
+ dc = dc.persist()
79
+
71
80
  rand_col = C("sys.rand")
72
81
  if seed is not None:
73
82
  uniform_seed = random.Random(seed).randrange(1, RESOLUTION) # noqa: S311
74
83
  rand_col = (rand_col % RESOLUTION) * uniform_seed # type: ignore[assignment]
75
84
  rand_col = rand_col % RESOLUTION # type: ignore[assignment]
76
85
 
77
- return [
78
- dc.filter(
79
- rand_col >= round(sum(weights_normalized[:index]) * (RESOLUTION - 1)),
80
- rand_col < round(sum(weights_normalized[: index + 1]) * (RESOLUTION - 1)),
81
- )
82
- for index, _ in enumerate(weights_normalized)
83
- ]
86
+ boundaries: list[int] = [0]
87
+ cumulative = 0.0
88
+ for weight in weights_normalized[:-1]:
89
+ cumulative += weight
90
+ boundary = round(cumulative * RESOLUTION)
91
+ boundaries.append(min(boundary, RESOLUTION))
92
+ boundaries.append(RESOLUTION)
93
+
94
+ splits: list[DataChain] = []
95
+ last_index = len(weights_normalized) - 1
96
+ for index in range(len(weights_normalized)):
97
+ lower = boundaries[index]
98
+ if index == last_index:
99
+ condition = rand_col >= lower
100
+ else:
101
+ upper = boundaries[index + 1]
102
+ condition = (rand_col >= lower) & (rand_col < upper)
103
+ splits.append(dc.filter(condition))
104
+
105
+ return splits
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.37.7
3
+ Version: 0.37.9
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -374,6 +374,7 @@ tests/func/test_query.py
374
374
  tests/func/test_read_database.py
375
375
  tests/func/test_read_dataset_remote.py
376
376
  tests/func/test_read_dataset_version_specifiers.py
377
+ tests/func/test_retrieval.py
377
378
  tests/func/test_retry.py
378
379
  tests/func/test_session.py
379
380
  tests/func/test_storage_pattern.py
@@ -2,6 +2,8 @@ from datetime import datetime
2
2
  from typing import Any
3
3
 
4
4
  import pytest
5
+ import ujson as json
6
+ from pydantic import BaseModel, ConfigDict
5
7
 
6
8
  from datachain.sql.types import (
7
9
  JSON,
@@ -93,14 +95,8 @@ def test_dir_expansion(cloud_test_catalog, version_aware, cloud_type):
93
95
  assert to_compare == expected
94
96
 
95
97
 
96
- @pytest.mark.parametrize(
97
- "cloud_type,version_aware",
98
- [("s3", True)],
99
- indirect=True,
100
- )
101
- def test_convert_type(cloud_test_catalog):
102
- ctc = cloud_test_catalog
103
- catalog = ctc.catalog
98
+ def test_convert_type(test_session):
99
+ catalog = test_session.catalog
104
100
  warehouse = catalog.warehouse
105
101
  now = datetime.now()
106
102
 
@@ -137,8 +133,47 @@ def test_convert_type(cloud_test_catalog):
137
133
  assert run_convert_type('{"a": 1}', JSON()) == '{"a": 1}'
138
134
  assert run_convert_type({"a": 1}, JSON()) == '{"a":1}'
139
135
  assert run_convert_type([{"a": 1}], JSON()) == '[{"a":1}]'
140
- with pytest.raises(ValueError):
141
- run_convert_type(0.5, JSON())
136
+ assert run_convert_type([[1, 2], [3, 4]], JSON()) == "[[1,2],[3,4]]"
137
+ assert run_convert_type(None, JSON()) == "null"
138
+ assert run_convert_type({"a": None}, JSON()) == '{"a":null}'
139
+ # primitives should serialize to valid JSON
140
+ assert run_convert_type(0.5, JSON()) == "0.5"
141
+
142
+ # JSON with Pydantic models (values and nested)
143
+ class MyFr(BaseModel):
144
+ model_config = ConfigDict(frozen=True)
145
+ nnn: str
146
+ count: int
147
+
148
+ fr1 = MyFr(nnn="x", count=1)
149
+ fr2 = MyFr(nnn="y", count=2)
150
+
151
+ # Pydantic as dict value
152
+ out = run_convert_type({"a": fr1}, JSON())
153
+ assert out == '{"a":{"nnn":"x","count":1}}'
154
+
155
+ # Pydantic in list
156
+ out = run_convert_type([fr1, fr2], JSON())
157
+ assert out == '[{"nnn":"x","count":1},{"nnn":"y","count":2}]'
158
+
159
+ # Nested structures with Pydantic
160
+ out = run_convert_type({"k": [{"inner": fr1}]}, JSON())
161
+ assert out == '{"k":[{"inner":{"nnn":"x","count":1}}]}'
162
+
163
+ # Complex dict key (tuple) becomes a JSON-encoded string key
164
+ out = run_convert_type({(1, "a"): 3}, JSON())
165
+ # Decode and compare to expected mapping using encoded key
166
+ loaded = json.loads(out)
167
+ assert loaded == {json.dumps([1, "a"]): 3}
168
+
169
+ # Pydantic model as dict key
170
+ key_model = MyFr(nnn="k", count=7)
171
+ d: dict[Any, Any] = {}
172
+ d[key_model] = "v"
173
+ out = run_convert_type(d, JSON())
174
+ loaded = json.loads(out)
175
+ expected_key = json.dumps({"nnn": "k", "count": 7})
176
+ assert loaded == {expected_key: "v"}
142
177
 
143
178
  # convert array to compatible type
144
179
  converted = run_convert_type([1, 2], Array(Float))