datachain 0.18.0__tar.gz → 0.18.2__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (374)
  1. {datachain-0.18.0 → datachain-0.18.2}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.18.0/src/datachain.egg-info → datachain-0.18.2}/PKG-INFO +1 -1
  3. datachain-0.18.2/examples/incremental_processing/delta.py +64 -0
  4. datachain-0.18.2/examples/incremental_processing/utils.py +41 -0
  5. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/data_storage/metastore.py +1 -1
  6. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/func/array.py +120 -0
  7. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/func/func.py +14 -12
  8. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/file.py +1 -1
  9. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/query/session.py +8 -2
  10. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/functions/array.py +22 -1
  11. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/sqlite/base.py +33 -0
  12. {datachain-0.18.0 → datachain-0.18.2/src/datachain.egg-info}/PKG-INFO +1 -1
  13. {datachain-0.18.0 → datachain-0.18.2}/src/datachain.egg-info/SOURCES.txt +2 -0
  14. {datachain-0.18.0 → datachain-0.18.2}/tests/examples/test_examples.py +11 -0
  15. datachain-0.18.2/tests/func/test_func.py +223 -0
  16. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_datachain.py +10 -0
  17. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/sql/test_array.py +42 -0
  18. datachain-0.18.0/tests/func/test_func.py +0 -124
  19. {datachain-0.18.0 → datachain-0.18.2}/.cruft.json +0 -0
  20. {datachain-0.18.0 → datachain-0.18.2}/.gitattributes +0 -0
  21. {datachain-0.18.0 → datachain-0.18.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  22. {datachain-0.18.0 → datachain-0.18.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  23. {datachain-0.18.0 → datachain-0.18.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  24. {datachain-0.18.0 → datachain-0.18.2}/.github/codecov.yaml +0 -0
  25. {datachain-0.18.0 → datachain-0.18.2}/.github/dependabot.yml +0 -0
  26. {datachain-0.18.0 → datachain-0.18.2}/.github/workflows/benchmarks.yml +0 -0
  27. {datachain-0.18.0 → datachain-0.18.2}/.github/workflows/release.yml +0 -0
  28. {datachain-0.18.0 → datachain-0.18.2}/.github/workflows/tests-studio.yml +0 -0
  29. {datachain-0.18.0 → datachain-0.18.2}/.github/workflows/tests.yml +0 -0
  30. {datachain-0.18.0 → datachain-0.18.2}/.github/workflows/update-template.yaml +0 -0
  31. {datachain-0.18.0 → datachain-0.18.2}/.gitignore +0 -0
  32. {datachain-0.18.0 → datachain-0.18.2}/CODE_OF_CONDUCT.rst +0 -0
  33. {datachain-0.18.0 → datachain-0.18.2}/LICENSE +0 -0
  34. {datachain-0.18.0 → datachain-0.18.2}/README.rst +0 -0
  35. {datachain-0.18.0 → datachain-0.18.2}/docs/assets/captioned_cartoons.png +0 -0
  36. {datachain-0.18.0 → datachain-0.18.2}/docs/assets/datachain-white.svg +0 -0
  37. {datachain-0.18.0 → datachain-0.18.2}/docs/assets/datachain.svg +0 -0
  38. {datachain-0.18.0 → datachain-0.18.2}/docs/commands/auth/login.md +0 -0
  39. {datachain-0.18.0 → datachain-0.18.2}/docs/commands/auth/logout.md +0 -0
  40. {datachain-0.18.0 → datachain-0.18.2}/docs/commands/auth/team.md +0 -0
  41. {datachain-0.18.0 → datachain-0.18.2}/docs/commands/auth/token.md +0 -0
  42. {datachain-0.18.0 → datachain-0.18.2}/docs/commands/index.md +0 -0
  43. {datachain-0.18.0 → datachain-0.18.2}/docs/commands/job/cancel.md +0 -0
  44. {datachain-0.18.0 → datachain-0.18.2}/docs/commands/job/logs.md +0 -0
  45. {datachain-0.18.0 → datachain-0.18.2}/docs/commands/job/ls.md +0 -0
  46. {datachain-0.18.0 → datachain-0.18.2}/docs/commands/job/run.md +0 -0
  47. {datachain-0.18.0 → datachain-0.18.2}/docs/contributing.md +0 -0
  48. {datachain-0.18.0 → datachain-0.18.2}/docs/css/github-permalink-style.css +0 -0
  49. {datachain-0.18.0 → datachain-0.18.2}/docs/examples.md +0 -0
  50. {datachain-0.18.0 → datachain-0.18.2}/docs/index.md +0 -0
  51. {datachain-0.18.0 → datachain-0.18.2}/docs/overrides/main.html +0 -0
  52. {datachain-0.18.0 → datachain-0.18.2}/docs/quick-start.md +0 -0
  53. {datachain-0.18.0 → datachain-0.18.2}/docs/references/data-types/arrowrow.md +0 -0
  54. {datachain-0.18.0 → datachain-0.18.2}/docs/references/data-types/bbox.md +0 -0
  55. {datachain-0.18.0 → datachain-0.18.2}/docs/references/data-types/file.md +0 -0
  56. {datachain-0.18.0 → datachain-0.18.2}/docs/references/data-types/imagefile.md +0 -0
  57. {datachain-0.18.0 → datachain-0.18.2}/docs/references/data-types/index.md +0 -0
  58. {datachain-0.18.0 → datachain-0.18.2}/docs/references/data-types/pose.md +0 -0
  59. {datachain-0.18.0 → datachain-0.18.2}/docs/references/data-types/segment.md +0 -0
  60. {datachain-0.18.0 → datachain-0.18.2}/docs/references/data-types/tarvfile.md +0 -0
  61. {datachain-0.18.0 → datachain-0.18.2}/docs/references/data-types/textfile.md +0 -0
  62. {datachain-0.18.0 → datachain-0.18.2}/docs/references/data-types/videofile.md +0 -0
  63. {datachain-0.18.0 → datachain-0.18.2}/docs/references/datachain.md +0 -0
  64. {datachain-0.18.0 → datachain-0.18.2}/docs/references/func.md +0 -0
  65. {datachain-0.18.0 → datachain-0.18.2}/docs/references/index.md +0 -0
  66. {datachain-0.18.0 → datachain-0.18.2}/docs/references/remotes.md +0 -0
  67. {datachain-0.18.0 → datachain-0.18.2}/docs/references/toolkit.md +0 -0
  68. {datachain-0.18.0 → datachain-0.18.2}/docs/references/torch.md +0 -0
  69. {datachain-0.18.0 → datachain-0.18.2}/docs/references/udf.md +0 -0
  70. {datachain-0.18.0 → datachain-0.18.2}/docs/tutorials.md +0 -0
  71. {datachain-0.18.0 → datachain-0.18.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  72. {datachain-0.18.0 → datachain-0.18.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  73. {datachain-0.18.0 → datachain-0.18.2}/examples/computer_vision/openimage-detect.py +0 -0
  74. {datachain-0.18.0 → datachain-0.18.2}/examples/computer_vision/ultralytics-bbox.py +0 -0
  75. {datachain-0.18.0 → datachain-0.18.2}/examples/computer_vision/ultralytics-pose.py +0 -0
  76. {datachain-0.18.0 → datachain-0.18.2}/examples/computer_vision/ultralytics-segment.py +0 -0
  77. {datachain-0.18.0 → datachain-0.18.2}/examples/get_started/common_sql_functions.py +0 -0
  78. {datachain-0.18.0 → datachain-0.18.2}/examples/get_started/json-csv-reader.py +0 -0
  79. {datachain-0.18.0 → datachain-0.18.2}/examples/get_started/torch-loader.py +0 -0
  80. {datachain-0.18.0 → datachain-0.18.2}/examples/get_started/udfs/parallel.py +0 -0
  81. {datachain-0.18.0 → datachain-0.18.2}/examples/get_started/udfs/simple.py +0 -0
  82. {datachain-0.18.0 → datachain-0.18.2}/examples/get_started/udfs/stateful.py +0 -0
  83. {datachain-0.18.0 → datachain-0.18.2}/examples/llm_and_nlp/claude-query.py +0 -0
  84. {datachain-0.18.0 → datachain-0.18.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  85. {datachain-0.18.0 → datachain-0.18.2}/examples/multimodal/clip_inference.py +0 -0
  86. {datachain-0.18.0 → datachain-0.18.2}/examples/multimodal/hf_pipeline.py +0 -0
  87. {datachain-0.18.0 → datachain-0.18.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
  88. {datachain-0.18.0 → datachain-0.18.2}/examples/multimodal/wds.py +0 -0
  89. {datachain-0.18.0 → datachain-0.18.2}/examples/multimodal/wds_filtered.py +0 -0
  90. {datachain-0.18.0 → datachain-0.18.2}/mkdocs.yml +0 -0
  91. {datachain-0.18.0 → datachain-0.18.2}/noxfile.py +0 -0
  92. {datachain-0.18.0 → datachain-0.18.2}/pyproject.toml +0 -0
  93. {datachain-0.18.0 → datachain-0.18.2}/setup.cfg +0 -0
  94. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/__init__.py +0 -0
  95. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/__main__.py +0 -0
  96. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/asyn.py +0 -0
  97. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cache.py +0 -0
  98. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/catalog/__init__.py +0 -0
  99. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/catalog/catalog.py +0 -0
  100. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/catalog/datasource.py +0 -0
  101. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/catalog/loader.py +0 -0
  102. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/__init__.py +0 -0
  103. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/commands/__init__.py +0 -0
  104. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/commands/datasets.py +0 -0
  105. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/commands/du.py +0 -0
  106. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/commands/index.py +0 -0
  107. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/commands/ls.py +0 -0
  108. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/commands/misc.py +0 -0
  109. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/commands/query.py +0 -0
  110. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/commands/show.py +0 -0
  111. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/parser/__init__.py +0 -0
  112. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/parser/job.py +0 -0
  113. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/parser/studio.py +0 -0
  114. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/parser/utils.py +0 -0
  115. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/cli/utils.py +0 -0
  116. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/client/__init__.py +0 -0
  117. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/client/azure.py +0 -0
  118. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/client/fileslice.py +0 -0
  119. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/client/fsspec.py +0 -0
  120. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/client/gcs.py +0 -0
  121. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/client/hf.py +0 -0
  122. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/client/local.py +0 -0
  123. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/client/s3.py +0 -0
  124. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/config.py +0 -0
  125. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/data_storage/__init__.py +0 -0
  126. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/data_storage/db_engine.py +0 -0
  127. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/data_storage/job.py +0 -0
  128. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/data_storage/schema.py +0 -0
  129. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/data_storage/serializer.py +0 -0
  130. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/data_storage/sqlite.py +0 -0
  131. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/data_storage/warehouse.py +0 -0
  132. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/dataset.py +0 -0
  133. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/delta.py +0 -0
  134. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/diff/__init__.py +0 -0
  135. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/error.py +0 -0
  136. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/fs/__init__.py +0 -0
  137. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/fs/reference.py +0 -0
  138. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/fs/utils.py +0 -0
  139. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/func/__init__.py +0 -0
  140. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/func/aggregate.py +0 -0
  141. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/func/base.py +0 -0
  142. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/func/conditional.py +0 -0
  143. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/func/numeric.py +0 -0
  144. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/func/path.py +0 -0
  145. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/func/random.py +0 -0
  146. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/func/string.py +0 -0
  147. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/func/window.py +0 -0
  148. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/job.py +0 -0
  149. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/__init__.py +0 -0
  150. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/arrow.py +0 -0
  151. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/clip.py +0 -0
  152. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/convert/__init__.py +0 -0
  153. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/convert/flatten.py +0 -0
  154. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
  155. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
  156. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/convert/unflatten.py +0 -0
  157. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  158. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/data_model.py +0 -0
  159. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dataset_info.py +0 -0
  160. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/__init__.py +0 -0
  161. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/csv.py +0 -0
  162. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/database.py +0 -0
  163. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/datachain.py +0 -0
  164. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/datasets.py +0 -0
  165. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/hf.py +0 -0
  166. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/json.py +0 -0
  167. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/listings.py +0 -0
  168. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/pandas.py +0 -0
  169. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/parquet.py +0 -0
  170. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/records.py +0 -0
  171. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/storage.py +0 -0
  172. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/utils.py +0 -0
  173. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/dc/values.py +0 -0
  174. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/hf.py +0 -0
  175. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/image.py +0 -0
  176. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/listing.py +0 -0
  177. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/listing_info.py +0 -0
  178. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/meta_formats.py +0 -0
  179. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/model_store.py +0 -0
  180. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/pytorch.py +0 -0
  181. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/settings.py +0 -0
  182. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/signal_schema.py +0 -0
  183. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/tar.py +0 -0
  184. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/text.py +0 -0
  185. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/udf.py +0 -0
  186. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/udf_signature.py +0 -0
  187. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/utils.py +0 -0
  188. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/video.py +0 -0
  189. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/webdataset.py +0 -0
  190. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/lib/webdataset_laion.py +0 -0
  191. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/listing.py +0 -0
  192. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/model/__init__.py +0 -0
  193. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/model/bbox.py +0 -0
  194. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/model/pose.py +0 -0
  195. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/model/segment.py +0 -0
  196. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/model/ultralytics/__init__.py +0 -0
  197. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/model/ultralytics/bbox.py +0 -0
  198. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/model/ultralytics/pose.py +0 -0
  199. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/model/ultralytics/segment.py +0 -0
  200. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/model/utils.py +0 -0
  201. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/node.py +0 -0
  202. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/nodes_fetcher.py +0 -0
  203. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/nodes_thread_pool.py +0 -0
  204. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/progress.py +0 -0
  205. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/py.typed +0 -0
  206. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/query/__init__.py +0 -0
  207. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/query/batch.py +0 -0
  208. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/query/dataset.py +0 -0
  209. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/query/dispatch.py +0 -0
  210. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/query/metrics.py +0 -0
  211. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/query/params.py +0 -0
  212. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/query/queue.py +0 -0
  213. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/query/schema.py +0 -0
  214. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/query/udf.py +0 -0
  215. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/query/utils.py +0 -0
  216. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/remote/__init__.py +0 -0
  217. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/remote/studio.py +0 -0
  218. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/script_meta.py +0 -0
  219. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/semver.py +0 -0
  220. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/__init__.py +0 -0
  221. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/default/__init__.py +0 -0
  222. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/default/base.py +0 -0
  223. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/functions/__init__.py +0 -0
  224. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/functions/aggregate.py +0 -0
  225. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/functions/conditional.py +0 -0
  226. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/functions/numeric.py +0 -0
  227. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/functions/path.py +0 -0
  228. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/functions/random.py +0 -0
  229. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/functions/string.py +0 -0
  230. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/selectable.py +0 -0
  231. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/sqlite/__init__.py +0 -0
  232. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/sqlite/types.py +0 -0
  233. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/sqlite/vector.py +0 -0
  234. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/types.py +0 -0
  235. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/sql/utils.py +0 -0
  236. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/studio.py +0 -0
  237. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/telemetry.py +0 -0
  238. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/toolkit/__init__.py +0 -0
  239. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/toolkit/split.py +0 -0
  240. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/torch/__init__.py +0 -0
  241. {datachain-0.18.0 → datachain-0.18.2}/src/datachain/utils.py +0 -0
  242. {datachain-0.18.0 → datachain-0.18.2}/src/datachain.egg-info/dependency_links.txt +0 -0
  243. {datachain-0.18.0 → datachain-0.18.2}/src/datachain.egg-info/entry_points.txt +0 -0
  244. {datachain-0.18.0 → datachain-0.18.2}/src/datachain.egg-info/requires.txt +0 -0
  245. {datachain-0.18.0 → datachain-0.18.2}/src/datachain.egg-info/top_level.txt +0 -0
  246. {datachain-0.18.0 → datachain-0.18.2}/tests/__init__.py +0 -0
  247. {datachain-0.18.0 → datachain-0.18.2}/tests/benchmarks/__init__.py +0 -0
  248. {datachain-0.18.0 → datachain-0.18.2}/tests/benchmarks/conftest.py +0 -0
  249. {datachain-0.18.0 → datachain-0.18.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  250. {datachain-0.18.0 → datachain-0.18.2}/tests/benchmarks/datasets/.dvc/config +0 -0
  251. {datachain-0.18.0 → datachain-0.18.2}/tests/benchmarks/datasets/.gitignore +0 -0
  252. {datachain-0.18.0 → datachain-0.18.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  253. {datachain-0.18.0 → datachain-0.18.2}/tests/benchmarks/test_datachain.py +0 -0
  254. {datachain-0.18.0 → datachain-0.18.2}/tests/benchmarks/test_ls.py +0 -0
  255. {datachain-0.18.0 → datachain-0.18.2}/tests/benchmarks/test_version.py +0 -0
  256. {datachain-0.18.0 → datachain-0.18.2}/tests/conftest.py +0 -0
  257. {datachain-0.18.0 → datachain-0.18.2}/tests/data.py +0 -0
  258. {datachain-0.18.0 → datachain-0.18.2}/tests/examples/__init__.py +0 -0
  259. {datachain-0.18.0 → datachain-0.18.2}/tests/examples/test_wds_e2e.py +0 -0
  260. {datachain-0.18.0 → datachain-0.18.2}/tests/examples/wds_data.py +0 -0
  261. {datachain-0.18.0 → datachain-0.18.2}/tests/func/__init__.py +0 -0
  262. {datachain-0.18.0 → datachain-0.18.2}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  263. {datachain-0.18.0 → datachain-0.18.2}/tests/func/data/lena.jpg +0 -0
  264. {datachain-0.18.0 → datachain-0.18.2}/tests/func/fake-service-account-credentials.json +0 -0
  265. {datachain-0.18.0 → datachain-0.18.2}/tests/func/model/__init__.py +0 -0
  266. {datachain-0.18.0 → datachain-0.18.2}/tests/func/model/data/running-mask0.png +0 -0
  267. {datachain-0.18.0 → datachain-0.18.2}/tests/func/model/data/running-mask1.png +0 -0
  268. {datachain-0.18.0 → datachain-0.18.2}/tests/func/model/data/running.jpg +0 -0
  269. {datachain-0.18.0 → datachain-0.18.2}/tests/func/model/data/ships.jpg +0 -0
  270. {datachain-0.18.0 → datachain-0.18.2}/tests/func/model/test_yolo.py +0 -0
  271. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_batching.py +0 -0
  272. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_catalog.py +0 -0
  273. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_client.py +0 -0
  274. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_cloud_transfer.py +0 -0
  275. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_data_storage.py +0 -0
  276. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_datachain.py +0 -0
  277. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_datachain_merge.py +0 -0
  278. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_dataset_query.py +0 -0
  279. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_datasets.py +0 -0
  280. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_delta.py +0 -0
  281. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_feature_pickling.py +0 -0
  282. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_file.py +0 -0
  283. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_hf.py +0 -0
  284. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_hidden_field.py +0 -0
  285. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_image.py +0 -0
  286. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_listing.py +0 -0
  287. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_ls.py +0 -0
  288. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_meta_formats.py +0 -0
  289. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_metrics.py +0 -0
  290. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_pull.py +0 -0
  291. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_pytorch.py +0 -0
  292. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_query.py +0 -0
  293. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_read_database.py +0 -0
  294. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_session.py +0 -0
  295. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_toolkit.py +0 -0
  296. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_video.py +0 -0
  297. {datachain-0.18.0 → datachain-0.18.2}/tests/func/test_warehouse.py +0 -0
  298. {datachain-0.18.0 → datachain-0.18.2}/tests/scripts/feature_class.py +0 -0
  299. {datachain-0.18.0 → datachain-0.18.2}/tests/scripts/feature_class_exception.py +0 -0
  300. {datachain-0.18.0 → datachain-0.18.2}/tests/scripts/feature_class_parallel.py +0 -0
  301. {datachain-0.18.0 → datachain-0.18.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  302. {datachain-0.18.0 → datachain-0.18.2}/tests/scripts/name_len_slow.py +0 -0
  303. {datachain-0.18.0 → datachain-0.18.2}/tests/test_atomicity.py +0 -0
  304. {datachain-0.18.0 → datachain-0.18.2}/tests/test_cli_e2e.py +0 -0
  305. {datachain-0.18.0 → datachain-0.18.2}/tests/test_cli_studio.py +0 -0
  306. {datachain-0.18.0 → datachain-0.18.2}/tests/test_import_time.py +0 -0
  307. {datachain-0.18.0 → datachain-0.18.2}/tests/test_query_e2e.py +0 -0
  308. {datachain-0.18.0 → datachain-0.18.2}/tests/test_telemetry.py +0 -0
  309. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/__init__.py +0 -0
  310. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/__init__.py +0 -0
  311. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/conftest.py +0 -0
  312. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_arrow.py +0 -0
  313. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_clip.py +0 -0
  314. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  315. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_datachain_merge.py +0 -0
  316. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_diff.py +0 -0
  317. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_feature.py +0 -0
  318. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_feature_utils.py +0 -0
  319. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_file.py +0 -0
  320. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_hf.py +0 -0
  321. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_image.py +0 -0
  322. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_listing_info.py +0 -0
  323. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_python_to_sql.py +0 -0
  324. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_schema.py +0 -0
  325. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_signal_schema.py +0 -0
  326. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_sql_to_python.py +0 -0
  327. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_text.py +0 -0
  328. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_udf.py +0 -0
  329. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_udf_signature.py +0 -0
  330. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_utils.py +0 -0
  331. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/lib/test_webdataset.py +0 -0
  332. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/model/__init__.py +0 -0
  333. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/model/test_bbox.py +0 -0
  334. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/model/test_pose.py +0 -0
  335. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/model/test_segment.py +0 -0
  336. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/model/test_utils.py +0 -0
  337. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/sql/__init__.py +0 -0
  338. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/sql/sqlite/__init__.py +0 -0
  339. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/sql/sqlite/test_types.py +0 -0
  340. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
  341. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/sql/test_conditional.py +0 -0
  342. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/sql/test_path.py +0 -0
  343. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/sql/test_random.py +0 -0
  344. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/sql/test_selectable.py +0 -0
  345. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/sql/test_string.py +0 -0
  346. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_asyn.py +0 -0
  347. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_cache.py +0 -0
  348. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_catalog.py +0 -0
  349. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_catalog_loader.py +0 -0
  350. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_cli_parsing.py +0 -0
  351. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_client.py +0 -0
  352. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_client_gcs.py +0 -0
  353. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_client_s3.py +0 -0
  354. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_config.py +0 -0
  355. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_data_storage.py +0 -0
  356. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_database_engine.py +0 -0
  357. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_dataset.py +0 -0
  358. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_dispatch.py +0 -0
  359. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_fileslice.py +0 -0
  360. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_func.py +0 -0
  361. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_listing.py +0 -0
  362. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_metastore.py +0 -0
  363. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_module_exports.py +0 -0
  364. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_pytorch.py +0 -0
  365. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_query.py +0 -0
  366. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_query_metrics.py +0 -0
  367. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_query_params.py +0 -0
  368. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_script_meta.py +0 -0
  369. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_semver.py +0 -0
  370. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_serializer.py +0 -0
  371. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_session.py +0 -0
  372. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_utils.py +0 -0
  373. {datachain-0.18.0 → datachain-0.18.2}/tests/unit/test_warehouse.py +0 -0
  374. {datachain-0.18.0 → datachain-0.18.2}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.11.9'
27
+ rev: 'v0.11.10'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.18.0
3
+ Version: 0.18.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ File Generator Script using DataChain Delta
4
+
5
+ This script demonstrates:
6
+ 1. Creating numbered text files in a 'test' directory
7
+ 2. Using DataChain's delta flag for incremental dataset processing
8
+
9
+ Each execution:
10
+ - Creates a new numbered file in the 'test' directory
11
+ - Updates a DataChain dataset to track these files incrementally
12
+ """
13
+
14
+ import re
15
+ import time
16
+
17
+ from utils import generate_next_file
18
+
19
+ import datachain as dc
20
+ from datachain import C, File
21
+
22
+
23
def extract_file_number(file: File) -> int:
    """Return the number embedded in a ``file-<N>.txt`` name, or -1 if none is found."""
    found = re.search(r"file-(\d+)\.txt", file.name)
    return int(found.group(1)) if found else -1
29
+
30
+
31
def process_files_with_delta():
    """
    Run the delta-enabled DataChain pipeline over the ``test/`` directory.

    Reads the directory with ``delta=True`` keyed on ``file.path``, keeps the
    ``*.txt`` entries, attaches the file number, the file content and a
    processing timestamp to every record, and saves the result as the
    ``test_files`` dataset. Afterwards a short report is printed: the record
    count, the known dataset versions and the three highest-numbered files.
    """
    source = dc.read_storage(
        "test/", update=True, delta=True, delta_on="file.path"
    )
    text_files = source.filter(C("file.path").glob("*.txt"))
    enriched = (
        text_files.map(file_number=extract_file_number)
        .map(content=lambda file: file.read_text())
        .map(processed_at=lambda: time.strftime("%Y-%m-%d %H:%M:%S"))
    )
    saved = enriched.save(name="test_files")

    # Summary of what ended up in the dataset.
    print(f"\nProcessed files. Total records: {saved.count()}")
    print("\nDataset versions:")
    versions = dc.datasets().filter(C("name") == "test_files")

    for version in versions.collect("version"):
        print(f"- Version: {version}")

    # The three highest file numbers are the most recently generated files.
    print("\nLatest files processed:")
    newest_first = saved.order_by("file_number", descending=True)
    newest_first.limit(3).show()
56
+
57
+
58
if __name__ == "__main__":
    # Add one more numbered file so this run has something new to pick up.
    created = generate_next_file()
    print(f"Created new file: {created}")

    # Process the directory again using the delta-enabled chain.
    process_files_with_delta()
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ File Generator Helper
4
+
5
+ This helper creates numbered text files in a 'test' directory each time it runs.
6
+ The files follow the naming pattern: file-0.txt, file-1.txt, file-2.txt, etc.
7
+
8
+ Each execution, the script:
9
+
10
+ 1. Creates the 'test' directory if it doesn't exist
11
+ 2. Finds the highest numbered file currently present
12
+ 3. Creates a new file with the next number in sequence
13
+ 4. Adds timestamped content to the file
14
+ """
15
+
16
+ import re
17
+ import time
18
+ from pathlib import Path
19
+
20
+
21
def generate_next_file() -> Path:
    """
    Create the next ``file-<N>.txt`` in the 'test' directory and return its path.

    ``N`` is one more than the highest number found among the existing
    ``file-*.txt`` files, or 0 when there are none. The directory is created
    if it is missing.
    """
    test_dir = Path("test")
    test_dir.mkdir(exist_ok=True)

    numbered = re.compile(r"file-(\d+)\.txt")
    existing_numbers = [
        int(found.group(1))
        for candidate in test_dir.glob("file-*.txt")
        if candidate.is_file() and (found := numbered.search(candidate.name))
    ]
    next_num = max(existing_numbers, default=-1) + 1

    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    new_file_path = test_dir / f"file-{next_num}.txt"
    new_file_path.write_text(
        f"This is file number {next_num}\nCreated at: {timestamp}\n"
    )

    return new_file_path
@@ -674,7 +674,7 @@ class AbstractDBMetastore(AbstractMetastore):
674
674
  dv = self._datasets_versions
675
675
  self.db.execute(
676
676
  self._datasets_versions_update()
677
- .where(dv.c.dataset_id == dataset.id and dv.c.version == version)
677
+ .where(dv.c.dataset_id == dataset.id, dv.c.version == version)
678
678
  .values(values),
679
679
  conn=conn,
680
680
  ) # type: ignore [attr-defined]
@@ -178,6 +178,126 @@ def contains(arr: Union[str, Sequence, Func], elem: Any) -> Func:
178
178
  return Func("contains", inner=inner, cols=cols, args=args, result_type=int)
179
179
 
180
180
 
181
def slice(
    arr: Union[str, Sequence, Func],
    offset: int,
    length: Optional[int] = None,
) -> Func:
    """
    Returns a slice of the array.

    Args:
        arr (str | Sequence | Func): Array to take the slice from.
            If a string is provided, it is assumed to be the name of the array column.
            If a sequence is provided, it is assumed to be an array of values.
            If a Func is provided, it is assumed to be a function returning an array.
        offset (int): Offset to start the slice from.
        length (int, optional): Length of the slice. If not provided, the slice will
            continue to the end of the array.

    Returns:
        Func: A Func object that represents the slice function. Result of the
            function will be a slice of the array starting from the offset
            and with the given length.

    Example:
        ```py
        dc.mutate(
            slice1=func.array.slice("signal.values", 3),
            slice2=func.array.slice([1, 2, 3, 4, 5], 1, 3),
        )
        ```
    """

    def inner(arg):
        # Only forward `length` when the caller actually supplied one.
        if length is None:
            return array.slice(arg, offset)
        return array.slice(arg, offset, length)

    def element_type(el):
        # Non-list values map to their own Python type.
        if not isinstance(el, list):
            return type(el)
        # An empty list carries no element to inspect; default to list[str].
        if not el:
            return list[str]
        # The first element decides the element type (recursing for nesting).
        return list[element_type(el[0])]

    def type_from_args(arr, *_):
        # Only literal lists yield a result type here; anything else gives None.
        return element_type(arr) if isinstance(arr, list) else None

    # Column names and Funcs are passed as columns, literal arrays as arguments.
    from_column = isinstance(arr, (str, Func))
    cols = [arr] if from_column else None
    args = None if from_column else [arr]

    return Func(
        "slice",
        inner=inner,
        cols=cols,
        args=args,
        from_array=True,
        is_array=True,
        type_from_args=type_from_args,
    )
251
+
252
+
253
def join(
    arr: Union[str, Sequence, Func],
    sep: str = "",
) -> Func:
    """
    Returns a string that is the concatenation of the elements of the array.

    Args:
        arr (str | Sequence | Func): Array whose elements are concatenated.
            If a string is provided, it is assumed to be the name of the array column.
            If a sequence is provided, it is assumed to be an array of values.
            If a Func is provided, it is assumed to be a function returning an array.
        sep (str): Separator to use for the concatenation. Default is an empty string.

    Returns:
        Func: A Func object that represents the join function. Result of the
            function will be a string that is the concatenation of the elements
            of the array, separated by the given separator.

    Example:
        ```py
        dc.mutate(
            joined1=func.array.join("signal.values", ":"),
            joined2=func.array.join(["1", "2", "3", "4", "5"], "/"),
        )
        ```
    """

    def inner(arg):
        return array.join(arg, sep)

    # Column names and Funcs are passed as columns, literal arrays as arguments.
    from_column = isinstance(arr, (str, Func))
    cols = [arr] if from_column else None
    args = None if from_column else [arr]

    return Func(
        "join",
        inner=inner,
        cols=cols,
        args=args,
        from_array=True,
        result_type=str,
    )
299
+
300
+
181
301
  def get_element(arg: Union[str, Sequence, Func], index: int) -> Func:
182
302
  """
183
303
  Returns the element at the given index from the array.
@@ -108,18 +108,20 @@ class Func(Function):
108
108
  )
109
109
 
110
110
  if self.from_array:
111
- if get_origin(col_type) is list:
112
- col_args = get_args(col_type)
113
- if len(col_args) != 1:
114
- raise DataChainColumnError(
115
- str(self),
116
- "Array column must have a single type argument",
117
- )
118
- return col_args[0]
119
- raise DataChainColumnError(
120
- str(self),
121
- "Array column must be of type list",
122
- )
111
+ if get_origin(col_type) is not list:
112
+ raise DataChainColumnError(
113
+ str(self),
114
+ "Array column must be of type list",
115
+ )
116
+ if self.is_array:
117
+ return col_type
118
+ col_args = get_args(col_type)
119
+ if len(col_args) != 1:
120
+ raise DataChainColumnError(
121
+ str(self),
122
+ "Array column must have a single type argument",
123
+ )
124
+ return col_args[0]
123
125
 
124
126
  return list[col_type] if self.is_array else col_type # type: ignore[valid-type]
125
127
 
@@ -237,7 +237,7 @@ class File(DataModel):
237
237
  @field_validator("path", mode="before")
238
238
  @classmethod
239
239
  def validate_path(cls, path):
240
- return Path(path).as_posix()
240
+ return Path(path).as_posix() if path else ""
241
241
 
242
242
  def model_dump_custom(self):
243
243
  res = self.model_dump()
@@ -195,5 +195,11 @@ class Session:
195
195
  Session.GLOBAL_SESSION_CTX.__exit__(None, None, None)
196
196
 
197
197
  for obj in gc.get_objects(): # Get all tracked objects
198
- if isinstance(obj, Session): # Cleanup temp dataset for session variables.
199
- obj.__exit__(None, None, None)
198
+ try:
199
+ if isinstance(obj, Session):
200
+ # Cleanup temp dataset for session variables.
201
+ obj.__exit__(None, None, None)
202
+ except ReferenceError:
203
+ continue # Object has been finalized already
204
+ except Exception as e: # noqa: BLE001
205
+ logger.error(f"Exception while cleaning up session: {e}") # noqa: G004
@@ -1,6 +1,6 @@
1
1
  from sqlalchemy.sql.functions import GenericFunction
2
2
 
3
- from datachain.sql.types import Boolean, Float, Int64
3
+ from datachain.sql.types import Boolean, Float, Int64, String
4
4
  from datachain.sql.utils import compiler_not_implemented
5
5
 
6
6
 
@@ -48,6 +48,27 @@ class contains(GenericFunction): # noqa: N801
48
48
  inherit_cache = True
49
49
 
50
50
 
51
class slice(GenericFunction):  # noqa: N801
    """
    Returns a slice of the array.

    Registered as ``slice`` in the ``array`` package. No ``type`` is declared
    on this class.
    """

    name = "slice"
    package = "array"
    inherit_cache = True
59
+
60
+
61
class join(GenericFunction):  # noqa: N801
    """
    Returns the concatenation of the array elements.

    Registered as ``join`` in the ``array`` package; the result is a string.
    """

    name = "join"
    package = "array"
    type = String()
    inherit_cache = True
70
+
71
+
51
72
  class get_element(GenericFunction): # noqa: N801
52
73
  """
53
74
  Returns the element at the given index in the array.
@@ -88,6 +88,8 @@ def setup():
88
88
  compiles(sql_path.file_ext, "sqlite")(compile_path_file_ext)
89
89
  compiles(array.length, "sqlite")(compile_array_length)
90
90
  compiles(array.contains, "sqlite")(compile_array_contains)
91
+ compiles(array.slice, "sqlite")(compile_array_slice)
92
+ compiles(array.join, "sqlite")(compile_array_join)
91
93
  compiles(array.get_element, "sqlite")(compile_array_get_element)
92
94
  compiles(string.length, "sqlite")(compile_string_length)
93
95
  compiles(string.split, "sqlite")(compile_string_split)
@@ -275,6 +277,15 @@ def register_user_defined_sql_functions() -> None:
275
277
  conn.create_function(
276
278
  "json_array_get_element", 2, py_json_array_get_element, deterministic=True
277
279
  )
280
+ conn.create_function(
281
+ "json_array_slice", 2, py_json_array_slice, deterministic=True
282
+ )
283
+ conn.create_function(
284
+ "json_array_slice", 3, py_json_array_slice, deterministic=True
285
+ )
286
+ conn.create_function(
287
+ "json_array_join", 2, py_json_array_join, deterministic=True
288
+ )
278
289
 
279
290
  _registered_function_creators["array_functions"] = create_array_functions
280
291
 
@@ -454,6 +465,20 @@ def py_json_array_get_element(val, idx):
454
465
  return None
455
466
 
456
467
 
468
def py_json_array_slice(val, offset: int, length: Optional[int] = None):
    """
    Slice a JSON-encoded array and return the slice as a JSON string.

    Args:
        val: JSON text of the array to slice.
        offset: Index to start from. A negative offset counts from the end
            of the array, as in Python slicing.
        length: Number of elements to take. When omitted, the slice runs to
            the end of the array.

    Returns:
        The sliced array serialized back to a UTF-8 JSON string. An offset
        beyond the end of the array yields ``"[]"``.
    """
    arr = orjson.loads(val)
    if length is None:
        sliced = arr[offset:]
    else:
        stop: Optional[int] = offset + length
        # A negative offset whose window reaches (or passes) the end of the
        # array would give a stop index >= 0, which Python reads as an
        # absolute position (e.g. arr[-2:0] == []). Slice to the end instead.
        if offset < 0 and stop >= 0:
            stop = None
        sliced = arr[offset:stop]
    # Slicing never raises IndexError, so no exception handling is needed.
    return orjson.dumps(list(sliced)).decode("utf-8")
476
+
477
+
478
def py_json_array_join(val, sep: str):
    """Decode the JSON array in ``val`` and join its items with ``sep``."""
    items = orjson.loads(val)
    return sep.join(items)
480
+
481
+
457
482
  def compile_array_get_element(element, compiler, **kwargs):
458
483
  return compiler.process(
459
484
  func.json_array_get_element(*element.clauses.clauses), **kwargs
@@ -470,6 +495,14 @@ def compile_array_contains(element, compiler, **kwargs):
470
495
  )
471
496
 
472
497
 
498
def compile_array_slice(element, compiler, **kwargs):
    """Compile ``array.slice`` as a call to the ``json_array_slice`` SQL function."""
    operands = element.clauses.clauses
    return compiler.process(func.json_array_slice(*operands), **kwargs)
500
+
501
+
502
def compile_array_join(element, compiler, **kwargs):
    """Compile ``array.join`` as a call to the ``json_array_join`` SQL function."""
    operands = element.clauses.clauses
    return compiler.process(func.json_array_join(*operands), **kwargs)
504
+
505
+
473
506
  def compile_string_length(element, compiler, **kwargs):
474
507
  return compiler.process(func.length(*element.clauses.clauses), **kwargs)
475
508
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.18.0
3
+ Version: 0.18.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -66,6 +66,8 @@ examples/get_started/torch-loader.py
66
66
  examples/get_started/udfs/parallel.py
67
67
  examples/get_started/udfs/simple.py
68
68
  examples/get_started/udfs/stateful.py
69
+ examples/incremental_processing/delta.py
70
+ examples/incremental_processing/utils.py
69
71
  examples/llm_and_nlp/claude-query.py
70
72
  examples/llm_and_nlp/hf-dataset-llm-eval.py
71
73
  examples/multimodal/clip_inference.py
@@ -12,6 +12,10 @@ llm_and_nlp_examples = sorted(glob.glob("examples/llm_and_nlp/**/*.py", recursiv
12
12
 
13
13
  multimodal_examples = sorted(glob.glob("examples/multimodal/**/*.py", recursive=True))
14
14
 
15
# Only delta.py is listed: the pattern names that one script explicitly, so the
# sibling utils.py helper module it imports from is not picked up as an example.
incremental_processing_examples = sorted(
    glob.glob("examples/incremental_processing/delta.py", recursive=True)
)
18
+
15
19
  computer_vision_examples = sorted(
16
20
  [
17
21
  filename
@@ -86,6 +90,13 @@ def test_multimodal(example):
86
90
  )
87
91
 
88
92
 
93
@pytest.mark.examples
@pytest.mark.incremental_processing
@pytest.mark.parametrize("example", incremental_processing_examples)
def test_incremental_processing_examples(example):
    """Smoke-test each script listed in ``incremental_processing_examples``."""
    smoke_test(example)
98
+
99
+
89
100
  @pytest.mark.examples
90
101
  @pytest.mark.computer_vision
91
102
  @pytest.mark.parametrize("example", computer_vision_examples)
@@ -0,0 +1,223 @@
1
+ import math
2
+
3
+ import datachain as dc
4
+ from datachain import func
5
+ from datachain.sql.types import Float, Int, String
6
+
7
+
8
def values_almost_equal(a, b):
    """Return True when ``a == b``, or when both are float NaN."""
    both_floats = isinstance(a, float) and isinstance(b, float)
    if both_floats and math.isnan(a) and math.isnan(b):
        return True
    return a == b
18
+
19
+
20
def tuples_almost_equal(t1, t2, path=""):
    """
    Assert that two (possibly nested) tuples match, treating NaN floats as equal.

    Raises:
        AssertionError: On a length mismatch or on the first differing element;
            the message names the position via ``path``.
    """
    if len(t1) != len(t2):
        raise AssertionError(
            f"Tuple length mismatch at {path or 'root'}: {len(t1)} != {len(t2)}\n"
            f"  Left  ({type(t1)}): {t1}\n"
            f"  Right ({type(t2)}): {t2}"
        )

    for i, (x, y) in enumerate(zip(t1, t2)):
        subpath = f"{path}[{i}]"

        # Nested tuples are compared element by element.
        if isinstance(x, tuple) and isinstance(y, tuple):
            tuples_almost_equal(x, y, path=subpath)
            continue

        if values_almost_equal(x, y):
            continue

        raise AssertionError(
            f"Mismatch at {subpath}:\n"
            f"  Left  ({type(x)}): {x}\n"
            f"  Right ({type(y)}): {y}"
        )
39
+
40
+
41
def test_array_slice(test_session):
    """Check ``func.array.slice`` on array columns and on literal arrays."""

    class Arr(dc.DataModel):
        i: list[int]
        f: list[float]
        s: list[str]

    rows = (
        Arr(i=[10, 20, 30], f=[1.0, 2.0, 3.0], s=["a", "b", "c"]),
        Arr(i=[40, 50, 60], f=[4.0, 5.0, 6.0], s=["d", "e", "f"]),
        Arr(i=[50], f=[5.0], s=["g"]),
    )
    columns = ("t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "t10", "t11")

    chain = (
        dc.read_values(id=[1, 2, 3], arr=rows, session=test_session)
        .mutate(
            t1=func.array.slice("arr.i", 1),
            t2=func.array.slice("arr.i", 100),
            t3=func.array.slice("arr.f", 0),
            t4=func.array.slice("arr.f", 1, 1),
            t5=func.array.slice("arr.s", 2),
            t6=func.array.slice("arr.s", 1, 10),
            t7=func.array.slice([9.0], 0),
            t8=func.array.slice([17], 5),
            t9=func.array.slice(["a", "b", "c", "d"], 1, 5),
            t10=func.array.slice(["a", "b", "c", "d"], 100),
            t11=func.array.slice([], 0),
        )
        .order_by("id")
    )
    ds = list(chain.collect(*columns))

    # t7..t11 slice literal arrays, so they are identical for every row.
    literal_results = ([9.0], [], ["b", "c", "d"], [], [])

    assert tuple(ds) == (
        ([20, 30], [], [1.0, 2.0, 3.0], [2.0], ["c"], ["b", "c"], *literal_results),
        ([50, 60], [], [4.0, 5.0, 6.0], [5.0], ["f"], ["e", "f"], *literal_results),
        ([], [], [5.0], [], [], [], *literal_results),
    )
115
+
116
+
117
def test_array_join(test_session):
    """Check ``func.array.join`` on a string-array column and on literal arrays."""

    class Arr(dc.DataModel):
        s: list[str]

    rows = (
        Arr(s=["a", "b", "c"]),
        Arr(s=["d"]),
        Arr(s=[]),
    )
    columns = ("t1", "t2", "t3", "t4", "t5", "t6")

    chain = (
        dc.read_values(id=[1, 2, 3], arr=rows, session=test_session)
        .mutate(
            t1=func.array.join("arr.s", "/"),
            t2=func.array.join("arr.s", ","),
            t3=func.array.join("arr.s"),
            t4=func.array.join(["a", "b", "c", "d"], ":"),
            t5=func.array.join(["1", "2"], ","),
            t6=func.array.join([]),
        )
        .order_by("id")
    )
    ds = list(chain.collect(*columns))

    # t4..t6 join literal arrays, so they are identical for every row.
    literal_results = ("a:b:c:d", "1,2", "")

    assert tuple(ds) == (
        ("a/b/c", "a,b,c", "abc", *literal_results),
        ("d", "d", "d", *literal_results),
        ("", "", "", *literal_results),
    )
148
+
149
+
150
def test_array_get_element(test_session):
    """
    Check ``func.array.get_element`` on array columns and on literal arrays.

    Lookups that are expected to miss are compared against the dialect's
    default value for the column type.
    """
    db_dialect = test_session.catalog.warehouse.db.dialect

    class Arr(dc.DataModel):
        i: list[int]
        f: list[float]

    rows = (
        Arr(i=[10, 20, 30], f=[1.0, 2.0, 3.0]),
        Arr(i=[40, 50, 60], f=[4.0, 5.0, 6.0]),
        Arr(i=[50], f=[5.0]),
    )
    columns = ("t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "t10")

    chain = (
        dc.read_values(id=[1, 2, 3], arr=rows, session=test_session)
        .mutate(
            t1=func.array.get_element("arr.i", 0),
            t2=func.array.get_element("arr.i", 1),
            t3=func.array.get_element("arr.i", 100),
            t4=func.array.get_element("arr.f", 0),
            t5=func.array.get_element("arr.f", 1),
            t6=func.array.get_element([9.0], 0),
            t7=func.array.get_element(["a", "b", "c", "d"], 0),
            t8=func.array.get_element(["a", "b", "c", "d"], 1),
            t9=func.array.get_element(["a", "b", "c", "d"], 100),
            t10=func.array.get_element([], 0),
        )
        .order_by("id")
    )
    ds = list(chain.collect(*columns))

    int_default = Int.default_value(db_dialect)
    float_default = Float.default_value(db_dialect)
    str_default = String.default_value(db_dialect)

    # t6..t10 index literal arrays, so they are identical for every row.
    literal_results = (9.0, "a", "b", str_default, str_default)

    expected = (
        (10, 20, int_default, 1.0, 2.0, *literal_results),
        (40, 50, int_default, 4.0, 5.0, *literal_results),
        (50, int_default, int_default, 5.0, float_default, *literal_results),
    )
    tuples_almost_equal(tuple(ds), expected)