datachain 0.28.0__tar.gz → 0.28.1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (406) hide show
  1. {datachain-0.28.0 → datachain-0.28.1}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.28.0 → datachain-0.28.1}/PKG-INFO +1 -1
  3. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/datachain.py +9 -4
  4. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/file.py +53 -1
  5. datachain-0.28.1/src/datachain/lib/utils.py +155 -0
  6. {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/PKG-INFO +1 -1
  7. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_datachain.py +17 -6
  8. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_file.py +47 -1
  9. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_utils.py +70 -1
  10. datachain-0.28.0/src/datachain/lib/utils.py +0 -59
  11. {datachain-0.28.0 → datachain-0.28.1}/.cruft.json +0 -0
  12. {datachain-0.28.0 → datachain-0.28.1}/.gitattributes +0 -0
  13. {datachain-0.28.0 → datachain-0.28.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  14. {datachain-0.28.0 → datachain-0.28.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  15. {datachain-0.28.0 → datachain-0.28.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  16. {datachain-0.28.0 → datachain-0.28.1}/.github/codecov.yaml +0 -0
  17. {datachain-0.28.0 → datachain-0.28.1}/.github/dependabot.yml +0 -0
  18. {datachain-0.28.0 → datachain-0.28.1}/.github/workflows/benchmarks.yml +0 -0
  19. {datachain-0.28.0 → datachain-0.28.1}/.github/workflows/release.yml +0 -0
  20. {datachain-0.28.0 → datachain-0.28.1}/.github/workflows/tests-studio.yml +0 -0
  21. {datachain-0.28.0 → datachain-0.28.1}/.github/workflows/tests.yml +0 -0
  22. {datachain-0.28.0 → datachain-0.28.1}/.github/workflows/update-template.yaml +0 -0
  23. {datachain-0.28.0 → datachain-0.28.1}/.gitignore +0 -0
  24. {datachain-0.28.0 → datachain-0.28.1}/CODE_OF_CONDUCT.rst +0 -0
  25. {datachain-0.28.0 → datachain-0.28.1}/LICENSE +0 -0
  26. {datachain-0.28.0 → datachain-0.28.1}/README.rst +0 -0
  27. {datachain-0.28.0 → datachain-0.28.1}/docs/assets/captioned_cartoons.png +0 -0
  28. {datachain-0.28.0 → datachain-0.28.1}/docs/assets/datachain-white.svg +0 -0
  29. {datachain-0.28.0 → datachain-0.28.1}/docs/assets/datachain.svg +0 -0
  30. {datachain-0.28.0 → datachain-0.28.1}/docs/commands/auth/login.md +0 -0
  31. {datachain-0.28.0 → datachain-0.28.1}/docs/commands/auth/logout.md +0 -0
  32. {datachain-0.28.0 → datachain-0.28.1}/docs/commands/auth/team.md +0 -0
  33. {datachain-0.28.0 → datachain-0.28.1}/docs/commands/auth/token.md +0 -0
  34. {datachain-0.28.0 → datachain-0.28.1}/docs/commands/index.md +0 -0
  35. {datachain-0.28.0 → datachain-0.28.1}/docs/commands/job/cancel.md +0 -0
  36. {datachain-0.28.0 → datachain-0.28.1}/docs/commands/job/clusters.md +0 -0
  37. {datachain-0.28.0 → datachain-0.28.1}/docs/commands/job/logs.md +0 -0
  38. {datachain-0.28.0 → datachain-0.28.1}/docs/commands/job/ls.md +0 -0
  39. {datachain-0.28.0 → datachain-0.28.1}/docs/commands/job/run.md +0 -0
  40. {datachain-0.28.0 → datachain-0.28.1}/docs/contributing.md +0 -0
  41. {datachain-0.28.0 → datachain-0.28.1}/docs/css/github-permalink-style.css +0 -0
  42. {datachain-0.28.0 → datachain-0.28.1}/docs/examples.md +0 -0
  43. {datachain-0.28.0 → datachain-0.28.1}/docs/guide/db_migrations.md +0 -0
  44. {datachain-0.28.0 → datachain-0.28.1}/docs/guide/delta.md +0 -0
  45. {datachain-0.28.0 → datachain-0.28.1}/docs/guide/env.md +0 -0
  46. {datachain-0.28.0 → datachain-0.28.1}/docs/guide/index.md +0 -0
  47. {datachain-0.28.0 → datachain-0.28.1}/docs/guide/namespaces.md +0 -0
  48. {datachain-0.28.0 → datachain-0.28.1}/docs/guide/processing.md +0 -0
  49. {datachain-0.28.0 → datachain-0.28.1}/docs/guide/remotes.md +0 -0
  50. {datachain-0.28.0 → datachain-0.28.1}/docs/guide/retry.md +0 -0
  51. {datachain-0.28.0 → datachain-0.28.1}/docs/index.md +0 -0
  52. {datachain-0.28.0 → datachain-0.28.1}/docs/overrides/main.html +0 -0
  53. {datachain-0.28.0 → datachain-0.28.1}/docs/quick-start.md +0 -0
  54. {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/arrowrow.md +0 -0
  55. {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/bbox.md +0 -0
  56. {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/file.md +0 -0
  57. {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/imagefile.md +0 -0
  58. {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/index.md +0 -0
  59. {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/pose.md +0 -0
  60. {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/segment.md +0 -0
  61. {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/tarvfile.md +0 -0
  62. {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/textfile.md +0 -0
  63. {datachain-0.28.0 → datachain-0.28.1}/docs/references/data-types/videofile.md +0 -0
  64. {datachain-0.28.0 → datachain-0.28.1}/docs/references/datachain.md +0 -0
  65. {datachain-0.28.0 → datachain-0.28.1}/docs/references/func.md +0 -0
  66. {datachain-0.28.0 → datachain-0.28.1}/docs/references/index.md +0 -0
  67. {datachain-0.28.0 → datachain-0.28.1}/docs/references/toolkit.md +0 -0
  68. {datachain-0.28.0 → datachain-0.28.1}/docs/references/torch.md +0 -0
  69. {datachain-0.28.0 → datachain-0.28.1}/docs/references/udf.md +0 -0
  70. {datachain-0.28.0 → datachain-0.28.1}/docs/tutorials.md +0 -0
  71. {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  72. {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  73. {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/openimage-detect.py +0 -0
  74. {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
  75. {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/ultralytics-pose.py +0 -0
  76. {datachain-0.28.0 → datachain-0.28.1}/examples/computer_vision/ultralytics-segment.py +0 -0
  77. {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/common_sql_functions.py +0 -0
  78. {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/json-csv-reader.py +0 -0
  79. {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/torch-loader.py +0 -0
  80. {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/udfs/parallel.py +0 -0
  81. {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/udfs/simple.py +0 -0
  82. {datachain-0.28.0 → datachain-0.28.1}/examples/get_started/udfs/stateful.py +0 -0
  83. {datachain-0.28.0 → datachain-0.28.1}/examples/incremental_processing/delta.py +0 -0
  84. {datachain-0.28.0 → datachain-0.28.1}/examples/incremental_processing/retry.py +0 -0
  85. {datachain-0.28.0 → datachain-0.28.1}/examples/incremental_processing/utils.py +0 -0
  86. {datachain-0.28.0 → datachain-0.28.1}/examples/llm_and_nlp/claude-query.py +0 -0
  87. {datachain-0.28.0 → datachain-0.28.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  88. {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/audio-to-text.py +0 -0
  89. {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/clip_inference.py +0 -0
  90. {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/hf_pipeline.py +0 -0
  91. {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  92. {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/wds.py +0 -0
  93. {datachain-0.28.0 → datachain-0.28.1}/examples/multimodal/wds_filtered.py +0 -0
  94. {datachain-0.28.0 → datachain-0.28.1}/mkdocs.yml +0 -0
  95. {datachain-0.28.0 → datachain-0.28.1}/noxfile.py +0 -0
  96. {datachain-0.28.0 → datachain-0.28.1}/pyproject.toml +0 -0
  97. {datachain-0.28.0 → datachain-0.28.1}/setup.cfg +0 -0
  98. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/__init__.py +0 -0
  99. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/__main__.py +0 -0
  100. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/asyn.py +0 -0
  101. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cache.py +0 -0
  102. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/catalog/__init__.py +0 -0
  103. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/catalog/catalog.py +0 -0
  104. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/catalog/datasource.py +0 -0
  105. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/catalog/loader.py +0 -0
  106. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/__init__.py +0 -0
  107. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/__init__.py +0 -0
  108. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/datasets.py +0 -0
  109. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/du.py +0 -0
  110. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/index.py +0 -0
  111. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/ls.py +0 -0
  112. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/misc.py +0 -0
  113. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/query.py +0 -0
  114. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/commands/show.py +0 -0
  115. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/parser/__init__.py +0 -0
  116. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/parser/job.py +0 -0
  117. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/parser/studio.py +0 -0
  118. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/parser/utils.py +0 -0
  119. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/cli/utils.py +0 -0
  120. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/__init__.py +0 -0
  121. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/azure.py +0 -0
  122. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/fileslice.py +0 -0
  123. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/fsspec.py +0 -0
  124. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/gcs.py +0 -0
  125. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/hf.py +0 -0
  126. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/local.py +0 -0
  127. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/client/s3.py +0 -0
  128. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/config.py +0 -0
  129. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/__init__.py +0 -0
  130. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/db_engine.py +0 -0
  131. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/job.py +0 -0
  132. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/metastore.py +0 -0
  133. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/schema.py +0 -0
  134. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/serializer.py +0 -0
  135. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/sqlite.py +0 -0
  136. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/data_storage/warehouse.py +0 -0
  137. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/dataset.py +0 -0
  138. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/delta.py +0 -0
  139. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/diff/__init__.py +0 -0
  140. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/error.py +0 -0
  141. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/fs/__init__.py +0 -0
  142. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/fs/reference.py +0 -0
  143. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/fs/utils.py +0 -0
  144. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/__init__.py +0 -0
  145. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/aggregate.py +0 -0
  146. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/array.py +0 -0
  147. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/base.py +0 -0
  148. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/conditional.py +0 -0
  149. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/func.py +0 -0
  150. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/numeric.py +0 -0
  151. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/path.py +0 -0
  152. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/random.py +0 -0
  153. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/string.py +0 -0
  154. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/func/window.py +0 -0
  155. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/job.py +0 -0
  156. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/__init__.py +0 -0
  157. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/arrow.py +0 -0
  158. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/audio.py +0 -0
  159. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/clip.py +0 -0
  160. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/__init__.py +0 -0
  161. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/flatten.py +0 -0
  162. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  163. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
  164. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/unflatten.py +0 -0
  165. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  166. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/data_model.py +0 -0
  167. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dataset_info.py +0 -0
  168. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/__init__.py +0 -0
  169. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/csv.py +0 -0
  170. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/database.py +0 -0
  171. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/datasets.py +0 -0
  172. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/hf.py +0 -0
  173. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/json.py +0 -0
  174. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/listings.py +0 -0
  175. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/pandas.py +0 -0
  176. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/parquet.py +0 -0
  177. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/records.py +0 -0
  178. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/storage.py +0 -0
  179. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/utils.py +0 -0
  180. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/dc/values.py +0 -0
  181. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/hf.py +0 -0
  182. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/image.py +0 -0
  183. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/listing.py +0 -0
  184. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/listing_info.py +0 -0
  185. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/meta_formats.py +0 -0
  186. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/model_store.py +0 -0
  187. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/namespaces.py +0 -0
  188. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/projects.py +0 -0
  189. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/pytorch.py +0 -0
  190. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/settings.py +0 -0
  191. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/signal_schema.py +0 -0
  192. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/tar.py +0 -0
  193. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/text.py +0 -0
  194. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/udf.py +0 -0
  195. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/udf_signature.py +0 -0
  196. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/video.py +0 -0
  197. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/webdataset.py +0 -0
  198. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/lib/webdataset_laion.py +0 -0
  199. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/listing.py +0 -0
  200. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/__init__.py +0 -0
  201. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/bbox.py +0 -0
  202. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/pose.py +0 -0
  203. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/segment.py +0 -0
  204. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/ultralytics/__init__.py +0 -0
  205. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/ultralytics/bbox.py +0 -0
  206. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/ultralytics/pose.py +0 -0
  207. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/ultralytics/segment.py +0 -0
  208. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/model/utils.py +0 -0
  209. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/namespace.py +0 -0
  210. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/node.py +0 -0
  211. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/nodes_fetcher.py +0 -0
  212. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/nodes_thread_pool.py +0 -0
  213. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/progress.py +0 -0
  214. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/project.py +0 -0
  215. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/py.typed +0 -0
  216. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/__init__.py +0 -0
  217. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/batch.py +0 -0
  218. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/dataset.py +0 -0
  219. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/dispatch.py +0 -0
  220. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/metrics.py +0 -0
  221. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/params.py +0 -0
  222. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/queue.py +0 -0
  223. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/schema.py +0 -0
  224. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/session.py +0 -0
  225. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/udf.py +0 -0
  226. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/query/utils.py +0 -0
  227. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/remote/__init__.py +0 -0
  228. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/remote/studio.py +0 -0
  229. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/script_meta.py +0 -0
  230. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/semver.py +0 -0
  231. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/__init__.py +0 -0
  232. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/default/__init__.py +0 -0
  233. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/default/base.py +0 -0
  234. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/__init__.py +0 -0
  235. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/aggregate.py +0 -0
  236. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/array.py +0 -0
  237. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/conditional.py +0 -0
  238. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/numeric.py +0 -0
  239. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/path.py +0 -0
  240. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/random.py +0 -0
  241. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/functions/string.py +0 -0
  242. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/selectable.py +0 -0
  243. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  244. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/sqlite/base.py +0 -0
  245. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/sqlite/types.py +0 -0
  246. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/sqlite/vector.py +0 -0
  247. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/types.py +0 -0
  248. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/sql/utils.py +0 -0
  249. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/studio.py +0 -0
  250. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/telemetry.py +0 -0
  251. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/toolkit/__init__.py +0 -0
  252. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/toolkit/split.py +0 -0
  253. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/torch/__init__.py +0 -0
  254. {datachain-0.28.0 → datachain-0.28.1}/src/datachain/utils.py +0 -0
  255. {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/SOURCES.txt +0 -0
  256. {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  257. {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/entry_points.txt +0 -0
  258. {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/requires.txt +0 -0
  259. {datachain-0.28.0 → datachain-0.28.1}/src/datachain.egg-info/top_level.txt +0 -0
  260. {datachain-0.28.0 → datachain-0.28.1}/tests/__init__.py +0 -0
  261. {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/__init__.py +0 -0
  262. {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/conftest.py +0 -0
  263. {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  264. {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  265. {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/datasets/.gitignore +0 -0
  266. {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  267. {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/test_datachain.py +0 -0
  268. {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/test_ls.py +0 -0
  269. {datachain-0.28.0 → datachain-0.28.1}/tests/benchmarks/test_version.py +0 -0
  270. {datachain-0.28.0 → datachain-0.28.1}/tests/conftest.py +0 -0
  271. {datachain-0.28.0 → datachain-0.28.1}/tests/data.py +0 -0
  272. {datachain-0.28.0 → datachain-0.28.1}/tests/examples/__init__.py +0 -0
  273. {datachain-0.28.0 → datachain-0.28.1}/tests/examples/test_examples.py +0 -0
  274. {datachain-0.28.0 → datachain-0.28.1}/tests/examples/test_wds_e2e.py +0 -0
  275. {datachain-0.28.0 → datachain-0.28.1}/tests/examples/wds_data.py +0 -0
  276. {datachain-0.28.0 → datachain-0.28.1}/tests/func/__init__.py +0 -0
  277. {datachain-0.28.0 → datachain-0.28.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  278. {datachain-0.28.0 → datachain-0.28.1}/tests/func/data/lena.jpg +0 -0
  279. {datachain-0.28.0 → datachain-0.28.1}/tests/func/fake-service-account-credentials.json +0 -0
  280. {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/__init__.py +0 -0
  281. {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_aggregate.py +0 -0
  282. {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_array.py +0 -0
  283. {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_conditional.py +0 -0
  284. {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_numeric.py +0 -0
  285. {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_path.py +0 -0
  286. {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_random.py +0 -0
  287. {datachain-0.28.0 → datachain-0.28.1}/tests/func/functions/test_string.py +0 -0
  288. {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/__init__.py +0 -0
  289. {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/data/running-mask0.png +0 -0
  290. {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/data/running-mask1.png +0 -0
  291. {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/data/running.jpg +0 -0
  292. {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/data/ships.jpg +0 -0
  293. {datachain-0.28.0 → datachain-0.28.1}/tests/func/model/test_yolo.py +0 -0
  294. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_audio.py +0 -0
  295. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_batching.py +0 -0
  296. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_catalog.py +0 -0
  297. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_client.py +0 -0
  298. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_cloud_transfer.py +0 -0
  299. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_data_storage.py +0 -0
  300. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_datachain_merge.py +0 -0
  301. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_dataset_query.py +0 -0
  302. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_datasets.py +0 -0
  303. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_delta.py +0 -0
  304. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_feature_pickling.py +0 -0
  305. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_file.py +0 -0
  306. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_hf.py +0 -0
  307. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_hidden_field.py +0 -0
  308. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_image.py +0 -0
  309. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_listing.py +0 -0
  310. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_ls.py +0 -0
  311. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_meta_formats.py +0 -0
  312. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_metastore.py +0 -0
  313. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_metrics.py +0 -0
  314. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_pull.py +0 -0
  315. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_pytorch.py +0 -0
  316. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_query.py +0 -0
  317. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_read_database.py +0 -0
  318. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_read_dataset_remote.py +0 -0
  319. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_read_dataset_version_specifiers.py +0 -0
  320. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_retry.py +0 -0
  321. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_session.py +0 -0
  322. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_studio_datetime_parsing.py +0 -0
  323. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_toolkit.py +0 -0
  324. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_video.py +0 -0
  325. {datachain-0.28.0 → datachain-0.28.1}/tests/func/test_warehouse.py +0 -0
  326. {datachain-0.28.0 → datachain-0.28.1}/tests/scripts/feature_class.py +0 -0
  327. {datachain-0.28.0 → datachain-0.28.1}/tests/scripts/feature_class_exception.py +0 -0
  328. {datachain-0.28.0 → datachain-0.28.1}/tests/scripts/feature_class_parallel.py +0 -0
  329. {datachain-0.28.0 → datachain-0.28.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  330. {datachain-0.28.0 → datachain-0.28.1}/tests/scripts/name_len_slow.py +0 -0
  331. {datachain-0.28.0 → datachain-0.28.1}/tests/test_atomicity.py +0 -0
  332. {datachain-0.28.0 → datachain-0.28.1}/tests/test_cli_e2e.py +0 -0
  333. {datachain-0.28.0 → datachain-0.28.1}/tests/test_cli_studio.py +0 -0
  334. {datachain-0.28.0 → datachain-0.28.1}/tests/test_import_time.py +0 -0
  335. {datachain-0.28.0 → datachain-0.28.1}/tests/test_query_e2e.py +0 -0
  336. {datachain-0.28.0 → datachain-0.28.1}/tests/test_telemetry.py +0 -0
  337. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/__init__.py +0 -0
  338. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/__init__.py +0 -0
  339. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/conftest.py +0 -0
  340. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_arrow.py +0 -0
  341. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_audio.py +0 -0
  342. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_clip.py +0 -0
  343. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_datachain.py +0 -0
  344. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  345. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  346. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_diff.py +0 -0
  347. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_feature.py +0 -0
  348. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_feature_utils.py +0 -0
  349. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_hf.py +0 -0
  350. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_image.py +0 -0
  351. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_listing_info.py +0 -0
  352. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_namespace.py +0 -0
  353. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_partition_by.py +0 -0
  354. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_project.py +0 -0
  355. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_python_to_sql.py +0 -0
  356. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_schema.py +0 -0
  357. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_signal_schema.py +0 -0
  358. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_sql_to_python.py +0 -0
  359. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_text.py +0 -0
  360. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_udf.py +0 -0
  361. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_udf_signature.py +0 -0
  362. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/lib/test_webdataset.py +0 -0
  363. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/model/__init__.py +0 -0
  364. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/model/test_bbox.py +0 -0
  365. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/model/test_pose.py +0 -0
  366. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/model/test_segment.py +0 -0
  367. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/model/test_utils.py +0 -0
  368. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/__init__.py +0 -0
  369. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  370. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/sqlite/test_types.py +0 -0
  371. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  372. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_array.py +0 -0
  373. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_conditional.py +0 -0
  374. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_path.py +0 -0
  375. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_random.py +0 -0
  376. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_selectable.py +0 -0
  377. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/sql/test_string.py +0 -0
  378. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_asyn.py +0 -0
  379. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_cache.py +0 -0
  380. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_catalog.py +0 -0
  381. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_catalog_loader.py +0 -0
  382. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_cli_parsing.py +0 -0
  383. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_client.py +0 -0
  384. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_client_gcs.py +0 -0
  385. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_client_s3.py +0 -0
  386. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_config.py +0 -0
  387. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_data_storage.py +0 -0
  388. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_database_engine.py +0 -0
  389. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_dataset.py +0 -0
  390. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_dispatch.py +0 -0
  391. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_fileslice.py +0 -0
  392. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_func.py +0 -0
  393. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_listing.py +0 -0
  394. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_metastore.py +0 -0
  395. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_module_exports.py +0 -0
  396. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_pytorch.py +0 -0
  397. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_query.py +0 -0
  398. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_query_metrics.py +0 -0
  399. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_query_params.py +0 -0
  400. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_script_meta.py +0 -0
  401. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_semver.py +0 -0
  402. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_serializer.py +0 -0
  403. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_session.py +0 -0
  404. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_utils.py +0 -0
  405. {datachain-0.28.0 → datachain-0.28.1}/tests/unit/test_warehouse.py +0 -0
  406. {datachain-0.28.0 → datachain-0.28.1}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.12.4'
27
+ rev: 'v0.12.5'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.28.0
3
+ Version: 0.28.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -2419,9 +2419,11 @@ class DataChain:
2419
2419
  ds.to_storage("gs://mybucket", placement="filename")
2420
2420
  ```
2421
2421
  """
2422
+ chain = self.persist()
2423
+ count = chain.count()
2424
+
2422
2425
  if placement == "filename" and (
2423
- self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
2424
- != self._query.count()
2426
+ chain._query.distinct(pathfunc.name(C(f"{signal}__path"))).count() != count
2425
2427
  ):
2426
2428
  raise ValueError("Files with the same name found")
2427
2429
 
@@ -2433,7 +2435,7 @@ class DataChain:
2433
2435
  unit=" files",
2434
2436
  unit_scale=True,
2435
2437
  unit_divisor=10,
2436
- total=self.count(),
2438
+ total=count,
2437
2439
  leave=False,
2438
2440
  )
2439
2441
  file_exporter = FileExporter(
@@ -2444,7 +2446,10 @@ class DataChain:
2444
2446
  max_threads=num_threads or 1,
2445
2447
  client_config=client_config,
2446
2448
  )
2447
- file_exporter.run(self.to_values(signal), progress_bar)
2449
+ file_exporter.run(
2450
+ (rows[0] for rows in chain.to_iter(signal)),
2451
+ progress_bar,
2452
+ )
2448
2453
 
2449
2454
  def shuffle(self) -> "Self":
2450
2455
  """Shuffle the rows of the chain deterministically."""
@@ -23,7 +23,7 @@ from pydantic import Field, field_validator
23
23
 
24
24
  from datachain.client.fileslice import FileSlice
25
25
  from datachain.lib.data_model import DataModel
26
- from datachain.lib.utils import DataChainError
26
+ from datachain.lib.utils import DataChainError, rebase_path
27
27
  from datachain.nodes_thread_pool import NodesThreadPool
28
28
  from datachain.sql.types import JSON, Boolean, DateTime, Int, String
29
29
  from datachain.utils import TIME_ZERO
@@ -634,6 +634,40 @@ class File(DataModel):
634
634
  location=self.location,
635
635
  )
636
636
 
637
def rebase(
    self,
    old_base: str,
    new_base: str,
    suffix: str = "",
    extension: str = "",
) -> str:
    """
    Rebase the file's URI from one base directory to another.

    The URI returned by `self.get_uri()` is passed to `rebase_path` together
    with the arguments below; the file object itself is not modified.

    Args:
        old_base: Base directory to remove from the file's URI
        new_base: New base directory to prepend
        suffix: Optional suffix to add before file extension
        extension: Optional new file extension (without dot)

    Returns:
        str: Rebased URI with new base directory

    Raises:
        ValueError: If old_base is not found in the file's URI

    Examples:
        >>> file = File(source="s3://bucket", path="data/2025-05-27/file.wav")
        >>> file.rebase("s3://bucket/data", "s3://output-bucket/processed",
        ...             extension="mp3")
        's3://output-bucket/processed/2025-05-27/file.mp3'

        >>> file.rebase("data/2025-05-27", "/local/output", suffix="_ch1",
        ...             extension="npy")
        '/local/output/file_ch1.npy'
    """
    return rebase_path(self.get_uri(), old_base, new_base, suffix, extension)
670
+
637
671
 
638
672
  def resolve(file: File) -> File:
639
673
  """
@@ -1219,6 +1253,24 @@ class Audio(DataModel):
1219
1253
  codec: str = Field(default="")
1220
1254
  bit_rate: int = Field(default=-1)
1221
1255
 
1256
@staticmethod
def get_channel_name(num_channels: int, channel_idx: int) -> str:
    """Return a human-readable label for one channel of an audio stream.

    Args:
        num_channels: Total number of channels in the stream; selects the
            naming layout (1, 2, 4, 6 and 8 have dedicated layouts).
        channel_idx: Zero-based index of the channel to name.

    Returns:
        str: The layout-specific name when the channel count has a known
        layout and the index falls inside it; otherwise the generic label
        ``Ch<channel_idx + 1>``.
    """
    layouts = {
        1: ("Mono",),
        2: ("Left", "Right"),
        # First-order Ambisonics
        4: ("W", "X", "Y", "Z"),
        # 5.1 surround
        6: ("FL", "FR", "FC", "LFE", "BL", "BR"),
        # 7.1 surround
        8: ("FL", "FR", "FC", "LFE", "BL", "BR", "SL", "SR"),
    }

    # An unknown channel count yields an empty layout, so the bounds check
    # below fails and the generic label is used.
    names = layouts.get(num_channels, ())
    if 0 <= channel_idx < len(names):
        return names[channel_idx]
    return f"Ch{channel_idx + 1}"
1273
+
1222
1274
 
1223
1275
  class ArrowRow(DataModel):
1224
1276
  """`DataModel` for reading row from Arrow-supported file."""
@@ -0,0 +1,155 @@
1
+ import re
2
+ from abc import ABC, abstractmethod
3
+ from collections.abc import Sequence
4
+ from pathlib import PurePosixPath
5
+ from urllib.parse import urlparse
6
+
7
+
8
class AbstractUDF(ABC):
    """Abstract interface declaring the three methods a UDF must implement.

    Subclasses cannot be instantiated until `process`, `setup` and
    `teardown` are all overridden.
    """

    @abstractmethod
    def process(self, *args, **kwargs):
        """Do the UDF's work; accepts arbitrary positional and keyword args."""
        pass

    @abstractmethod
    def setup(self):
        """Hook taking no arguments.

        NOTE(review): presumably run before processing — confirm with callers.
        """
        pass

    @abstractmethod
    def teardown(self):
        """Hook taking no arguments.

        NOTE(review): presumably run after processing — confirm with callers.
        """
        pass
20
+
21
+
22
class DataChainError(Exception):
    """Base exception of this module; `DataChainParamsError` derives from it."""

    pass
24
+
25
+
26
class DataChainParamsError(DataChainError):
    """`DataChainError` subclass; parent of `DataChainColumnError`.

    NOTE(review): presumably raised for invalid parameters — confirm usage.
    """

    pass
28
+
29
+
30
class DataChainColumnError(DataChainParamsError):
    """Error tied to a specific column.

    The exception message is formatted as
    ``Error for column <col_name>: <msg>``.
    """

    def __init__(self, col_name: str, msg: str):
        # Prefix the caller's message with the offending column name.
        super().__init__(f"Error for column {col_name}: {msg}")
33
+
34
+
35
def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
    """Map normalized column names back to the original names.

    Each name is lower-cased, every run of characters outside ``[0-9a-z]``
    is replaced by a single underscore, and leading/trailing underscores are
    stripped. When the result is not a valid identifier, is already used, or
    collides with a *different* original name, a ``c<N>_`` prefix (or a bare
    ``c<N>`` for an empty name) is tried with an ever-increasing ``N`` until
    the name is free.

    Args:
        col_names: Original column names, processed in order.

    Returns:
        dict[str, str]: ``normalized_name -> original_name``.
    """
    originals = set(col_names)
    normalized: dict[str, str] = {}
    next_index = 0  # shared across all columns, never reset

    def is_unusable(candidate: str, source: str) -> bool:
        # A candidate is rejected when it cannot be an identifier, was already
        # handed out, or would shadow another column's original name.
        if not candidate.isidentifier():
            return True
        if candidate in normalized:
            return True
        return candidate != source and candidate in originals

    for source in col_names:
        base = re.sub("[^0-9a-z]+", "_", source.lower()).strip("_")

        candidate = base
        while is_unusable(candidate, source):
            candidate = f"c{next_index}_{base}" if base else f"c{next_index}"
            next_index += 1

        normalized[candidate] = source

    return normalized
62
+
63
+
64
def _split_scheme(location: str) -> tuple[str, str]:
    """Split ``location`` into ``(scheme, rest)``; scheme is "" when absent."""
    scheme = urlparse(location).scheme
    if not scheme:
        return "", location
    # Strip the scheme textually instead of using ``netloc + path``: urlparse
    # would cut the path at the first '?', '#' or ';', silently truncating
    # object keys that contain those characters.
    _, _, rest = location.partition(":")
    if rest.startswith("//"):
        rest = rest[2:]
    return scheme, rest


def _find_base_end(path: str, base: str) -> int:
    """Return the index just past the first whole-component match, or -1."""
    start = path.find(base)
    while start != -1:
        end = start + len(base)
        # The match must begin and end on a path-component boundary, so that
        # "/data/audio" does not match inside "/data/audio2".
        starts_ok = start == 0 or base.startswith("/") or path[start - 1] == "/"
        ends_ok = end == len(path) or base.endswith("/") or path[end] == "/"
        if starts_ok and ends_ok:
            return end
        start = path.find(base, start + 1)
    return -1


def rebase_path(
    src_path: str,
    old_base: str,
    new_base: str,
    suffix: str = "",
    extension: str = "",
) -> str:
    """
    Rebase a file path from one base directory to another.

    URI schemes on ``src_path`` and ``old_base`` are ignored when locating the
    base (the bucket/host is treated as the first path component), so
    ``old_base`` may be a full URI, an absolute path, or a run of directory
    names from the middle of ``src_path``. It must match whole path
    components. The result carries the scheme of ``new_base``, if any.

    Args:
        src_path: Source file path (can include URI scheme like s3://)
        old_base: Base directory to remove from src_path
        new_base: New base directory to prepend
        suffix: Optional suffix to add before file extension
        extension: Optional new file extension (without dot)

    Returns:
        str: Rebased path with new base directory

    Raises:
        ValueError: If old_base is not found in src_path
    """
    _, src_location = _split_scheme(src_path)
    _, old_location = _split_scheme(old_base)
    new_scheme, new_location = _split_scheme(new_base)

    # Normalize so redundant and trailing slashes cannot defeat the match.
    src_norm = PurePosixPath(src_location).as_posix()
    old_norm = PurePosixPath(old_location).as_posix()

    base_end = _find_base_end(src_norm, old_norm)
    if base_end == -1:
        raise ValueError(f"old_base '{old_base}' not found in src_path")

    # Everything after the base, without the separating slash.
    relative = PurePosixPath(src_norm[base_end:].lstrip("/"))

    # An explicit extension replaces the current one; otherwise keep it.
    new_ext = f".{extension}" if extension else relative.suffix
    new_name = relative.stem + suffix + new_ext

    rebased = PurePosixPath(new_location) / relative.parent / new_name
    if new_scheme:
        return f"{new_scheme}://{rebased}"
    return str(rebased)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.28.0
3
+ Version: 0.28.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -9,7 +9,7 @@ import uuid
9
9
  from collections.abc import Iterator
10
10
  from datetime import datetime, timedelta, timezone
11
11
  from pathlib import Path, PurePosixPath
12
- from unittest.mock import patch
12
+ from unittest.mock import Mock, patch
13
13
 
14
14
  import numpy as np
15
15
  import pandas as pd
@@ -358,15 +358,24 @@ def test_to_storage(
358
358
  file_type,
359
359
  num_threads,
360
360
  ):
361
+ mapper = Mock(side_effect=lambda file_path: len(file_path))
362
+
361
363
  ctc = cloud_test_catalog
362
364
  df = dc.read_storage(ctc.src_uri, type=file_type, session=test_session)
363
365
  if use_map:
364
- df.settings(cache=use_cache).map(
365
- res=lambda file: file.export(tmp_dir / "output", placement=placement)
366
- ).exec()
366
+ (
367
+ df.settings(cache=use_cache)
368
+ .map(mapper, params=["file.path"], output={"path_len": int})
369
+ .map(res=lambda file: file.export(tmp_dir / "output", placement=placement))
370
+ .exec()
371
+ )
367
372
  else:
368
- df.settings(cache=use_cache).to_storage(
369
- tmp_dir / "output", placement=placement, num_threads=num_threads
373
+ (
374
+ df.settings(cache=use_cache)
375
+ .map(mapper, params=["file.path"], output={"path_len": int})
376
+ .to_storage(
377
+ tmp_dir / "output", placement=placement, num_threads=num_threads
378
+ )
370
379
  )
371
380
 
372
381
  expected = {
@@ -387,6 +396,8 @@ def test_to_storage(
387
396
  with open(tmp_dir / "output" / file_path) as f:
388
397
  assert f.read() == expected[file.name]
389
398
 
399
+ assert mapper.call_count == len(expected)
400
+
390
401
 
391
402
  @pytest.mark.parametrize("use_cache", [True, False])
392
403
  def test_export_images_files(test_session, tmp_dir, tmp_path, use_cache):
@@ -7,7 +7,7 @@ from fsspec.implementations.local import LocalFileSystem
7
7
  from PIL import Image
8
8
 
9
9
  from datachain.catalog import Catalog
10
- from datachain.lib.file import File, FileError, ImageFile, TextFile, resolve
10
+ from datachain.lib.file import Audio, File, FileError, ImageFile, TextFile, resolve
11
11
 
12
12
 
13
13
  def create_file(source: str):
@@ -409,3 +409,49 @@ def test_path_normalized(path, expected, raises):
409
409
  file.get_path_normalized()
410
410
  else:
411
411
  assert file.get_path_normalized() == expected
412
+
413
+
414
+ def test_file_rebase_method():
415
+ """Test File.rebase() method"""
416
+ file = File(source="s3://bucket", path="data/audio/file.wav")
417
+
418
+ # Basic rebase
419
+ result = file.rebase("s3://bucket/data/audio", "s3://output-bucket/waveforms")
420
+ assert result == "s3://output-bucket/waveforms/file.wav"
421
+
422
+ # With suffix and extension
423
+ result = file.rebase(
424
+ "s3://bucket/data/audio",
425
+ "s3://output-bucket/processed",
426
+ suffix="_ch1",
427
+ extension="npy",
428
+ )
429
+ assert result == "s3://output-bucket/processed/file_ch1.npy"
430
+
431
+
432
+ def test_file_rebase_local_path():
433
+ """Test File.rebase() with local file paths"""
434
+ file = File(source="file://", path="/data/audio/folder/file.mp3")
435
+
436
+ result = file.rebase("file:///data/audio", "/output/processed")
437
+ assert result == "/output/processed/folder/file.mp3"
438
+
439
+
440
+ def test_audio_get_channel_name():
441
+ # Test known channel configurations
442
+ assert Audio.get_channel_name(1, 0) == "Mono"
443
+ assert Audio.get_channel_name(2, 0) == "Left"
444
+ assert Audio.get_channel_name(2, 1) == "Right"
445
+ assert Audio.get_channel_name(4, 2) == "Y" # Ambisonics
446
+ assert Audio.get_channel_name(6, 3) == "LFE" # 5.1 surround
447
+ assert Audio.get_channel_name(8, 7) == "SR" # 7.1 surround
448
+
449
+ # Test fallback for unknown configurations
450
+ assert Audio.get_channel_name(-1, 0) == "Ch1"
451
+ assert Audio.get_channel_name(3, 0) == "Ch1"
452
+ assert Audio.get_channel_name(5, 4) == "Ch5"
453
+ assert Audio.get_channel_name(10, 9) == "Ch10"
454
+
455
+ # Test out of range indices
456
+ assert Audio.get_channel_name(2, 5) == "Ch6"
457
+ assert Audio.get_channel_name(1, 1) == "Ch2"
@@ -5,7 +5,7 @@ import pytest
5
5
  from pydantic import BaseModel
6
6
 
7
7
  from datachain.lib.convert.python_to_sql import python_to_sql
8
- from datachain.lib.utils import normalize_col_names
8
+ from datachain.lib.utils import normalize_col_names, rebase_path
9
9
  from datachain.sql.types import Array, String
10
10
 
11
11
 
@@ -110,3 +110,72 @@ def test_normalize_column_names_repeat_generated_after_normalize():
110
110
  res = normalize_col_names(["c0_CoLuMn", "_column", "column"])
111
111
 
112
112
  assert res == {"c0_column": "c0_CoLuMn", "c1_column": "_column", "column": "column"}
113
+
114
+
115
+ def test_rebase_path_basic():
116
+ result = rebase_path(
117
+ "/data/audio/folder1/file.wav", "/data/audio", "/output/waveforms"
118
+ )
119
+ assert result == "/output/waveforms/folder1/file.wav"
120
+
121
+
122
+ def test_rebase_path_with_s3_uri():
123
+ result = rebase_path(
124
+ "s3://bucket/data/audio/folder/file.wav",
125
+ "data/audio",
126
+ "s3://output-bucket/waveforms",
127
+ )
128
+ assert result == "s3://output-bucket/waveforms/folder/file.wav"
129
+
130
+
131
+ def test_rebase_path_mixed_uri_schemes():
132
+ result = rebase_path(
133
+ "/local/data/audio/file.mp3", "/local/data/audio", "s3://bucket/output"
134
+ )
135
+ assert result == "s3://bucket/output/file.mp3"
136
+
137
+
138
+ def test_rebase_path_with_suffix():
139
+ result = rebase_path(
140
+ "/data/audio/file.wav", "/data/audio", "/output", suffix="_processed"
141
+ )
142
+ assert result == "/output/file_processed.wav"
143
+
144
+
145
+ def test_rebase_path_with_extension_change():
146
+ result = rebase_path("/data/audio/file.wav", "audio", "/output", extension="npy")
147
+ assert result == "/output/file.npy"
148
+
149
+
150
+ def test_rebase_path_base_dir_not_in_path():
151
+ with pytest.raises(
152
+ ValueError, match="old_base '/data/audio' not found in src_path"
153
+ ):
154
+ rebase_path("/different/path/file.wav", "/data/audio", "/output")
155
+
156
+
157
+ def test_rebase_path_partial_match_base_dir():
158
+ result = rebase_path("/home/user/data/audio/file.wav", "data/audio", "/output")
159
+ assert result == "/output/file.wav"
160
+
161
+
162
+ def test_rebase_path_complex_s3_paths():
163
+ result = rebase_path(
164
+ "s3://bucket/balanced_train_segments/audio/folder/file.flac",
165
+ "s3://bucket/balanced_train_segments",
166
+ "s3://output-bucket/waveforms",
167
+ suffix="_ch1",
168
+ extension="npy",
169
+ )
170
+ assert result == "s3://output-bucket/waveforms/audio/folder/file_ch1.npy"
171
+
172
+
173
+ def test_rebase_path_file_without_extension():
174
+ result = rebase_path("/data/audio/file_no_ext", "/data/audio", "/output")
175
+ assert result == "/output/file_no_ext"
176
+
177
+ # With new extension
178
+ result = rebase_path(
179
+ "/data/audio/file_no_ext", "/data/audio", "/output", extension="txt"
180
+ )
181
+ assert result == "/output/file_no_ext.txt"
@@ -1,59 +0,0 @@
1
- import re
2
- from abc import ABC, abstractmethod
3
- from collections.abc import Sequence
4
-
5
-
6
- class AbstractUDF(ABC):
7
- @abstractmethod
8
- def process(self, *args, **kwargs):
9
- pass
10
-
11
- @abstractmethod
12
- def setup(self):
13
- pass
14
-
15
- @abstractmethod
16
- def teardown(self):
17
- pass
18
-
19
-
20
- class DataChainError(Exception):
21
- pass
22
-
23
-
24
- class DataChainParamsError(DataChainError):
25
- pass
26
-
27
-
28
- class DataChainColumnError(DataChainParamsError):
29
- def __init__(self, col_name: str, msg: str):
30
- super().__init__(f"Error for column {col_name}: {msg}")
31
-
32
-
33
- def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
34
- """Returns normalized_name -> original_name dict."""
35
- gen_col_counter = 0
36
- new_col_names = {}
37
- org_col_names = set(col_names)
38
-
39
- for org_column in col_names:
40
- new_column = org_column.lower()
41
- new_column = re.sub("[^0-9a-z]+", "_", new_column)
42
- new_column = new_column.strip("_")
43
-
44
- generated_column = new_column
45
-
46
- while (
47
- not generated_column.isidentifier()
48
- or generated_column in new_col_names
49
- or (generated_column != org_column and generated_column in org_col_names)
50
- ):
51
- if new_column:
52
- generated_column = f"c{gen_col_counter}_{new_column}"
53
- else:
54
- generated_column = f"c{gen_col_counter}"
55
- gen_col_counter += 1
56
-
57
- new_col_names[generated_column] = org_column
58
-
59
- return new_col_names
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes