datachain 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (257)
  1. {datachain-0.3.0 → datachain-0.3.1}/.github/workflows/tests.yml +20 -1
  2. {datachain-0.3.0/src/datachain.egg-info → datachain-0.3.1}/PKG-INFO +1 -1
  3. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/catalog/catalog.py +1 -1
  4. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/client/fsspec.py +1 -4
  5. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/client/local.py +2 -7
  6. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/data_storage/warehouse.py +8 -14
  7. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/dc.py +1 -1
  8. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/udf.py +21 -14
  9. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/query/batch.py +45 -41
  10. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/query/dataset.py +13 -6
  11. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/query/dispatch.py +53 -68
  12. datachain-0.3.1/src/datachain/query/queue.py +120 -0
  13. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/query/udf.py +23 -8
  14. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/utils.py +17 -2
  15. {datachain-0.3.0 → datachain-0.3.1/src/datachain.egg-info}/PKG-INFO +1 -1
  16. {datachain-0.3.0 → datachain-0.3.1}/src/datachain.egg-info/SOURCES.txt +1 -0
  17. {datachain-0.3.0 → datachain-0.3.1}/tests/conftest.py +2 -0
  18. {datachain-0.3.0 → datachain-0.3.1}/tests/func/test_catalog.py +1 -1
  19. {datachain-0.3.0 → datachain-0.3.1}/tests/func/test_pull.py +7 -7
  20. {datachain-0.3.0 → datachain-0.3.1}/tests/test_query_e2e.py +1 -1
  21. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_datachain.py +3 -3
  22. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_client.py +1 -1
  23. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_database_engine.py +2 -0
  24. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_id_generator.py +2 -0
  25. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_listing.py +1 -1
  26. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_storage.py +1 -1
  27. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_udf.py +2 -2
  28. {datachain-0.3.0 → datachain-0.3.1}/tests/utils.py +5 -3
  29. {datachain-0.3.0 → datachain-0.3.1}/.cruft.json +0 -0
  30. {datachain-0.3.0 → datachain-0.3.1}/.gitattributes +0 -0
  31. {datachain-0.3.0 → datachain-0.3.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  32. {datachain-0.3.0 → datachain-0.3.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  33. {datachain-0.3.0 → datachain-0.3.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  34. {datachain-0.3.0 → datachain-0.3.1}/.github/codecov.yaml +0 -0
  35. {datachain-0.3.0 → datachain-0.3.1}/.github/dependabot.yml +0 -0
  36. {datachain-0.3.0 → datachain-0.3.1}/.github/workflows/benchmarks.yml +0 -0
  37. {datachain-0.3.0 → datachain-0.3.1}/.github/workflows/release.yml +0 -0
  38. {datachain-0.3.0 → datachain-0.3.1}/.github/workflows/update-template.yaml +0 -0
  39. {datachain-0.3.0 → datachain-0.3.1}/.gitignore +0 -0
  40. {datachain-0.3.0 → datachain-0.3.1}/.pre-commit-config.yaml +0 -0
  41. {datachain-0.3.0 → datachain-0.3.1}/CODE_OF_CONDUCT.rst +0 -0
  42. {datachain-0.3.0 → datachain-0.3.1}/CONTRIBUTING.rst +0 -0
  43. {datachain-0.3.0 → datachain-0.3.1}/LICENSE +0 -0
  44. {datachain-0.3.0 → datachain-0.3.1}/README.rst +0 -0
  45. {datachain-0.3.0 → datachain-0.3.1}/docs/assets/captioned_cartoons.png +0 -0
  46. {datachain-0.3.0 → datachain-0.3.1}/docs/assets/datachain.png +0 -0
  47. {datachain-0.3.0 → datachain-0.3.1}/docs/assets/flowchart.png +0 -0
  48. {datachain-0.3.0 → datachain-0.3.1}/docs/index.md +0 -0
  49. {datachain-0.3.0 → datachain-0.3.1}/docs/references/datachain.md +0 -0
  50. {datachain-0.3.0 → datachain-0.3.1}/docs/references/datatype.md +0 -0
  51. {datachain-0.3.0 → datachain-0.3.1}/docs/references/file.md +0 -0
  52. {datachain-0.3.0 → datachain-0.3.1}/docs/references/index.md +0 -0
  53. {datachain-0.3.0 → datachain-0.3.1}/docs/references/sql.md +0 -0
  54. {datachain-0.3.0 → datachain-0.3.1}/docs/references/torch.md +0 -0
  55. {datachain-0.3.0 → datachain-0.3.1}/docs/references/udf.md +0 -0
  56. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
  57. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
  58. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
  59. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
  60. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
  61. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
  62. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/README.md +0 -0
  63. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
  64. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
  65. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
  66. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
  67. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
  68. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
  69. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
  70. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
  71. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  72. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  73. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  74. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  75. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  76. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  77. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  78. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  79. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  80. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  81. {datachain-0.3.0 → datachain-0.3.1}/examples/computer_vision/openimage-detect.py +0 -0
  82. {datachain-0.3.0 → datachain-0.3.1}/examples/get_started/common_sql_functions.py +0 -0
  83. {datachain-0.3.0 → datachain-0.3.1}/examples/get_started/json-csv-reader.py +0 -0
  84. {datachain-0.3.0 → datachain-0.3.1}/examples/get_started/json-metadata-tutorial.ipynb +0 -0
  85. {datachain-0.3.0 → datachain-0.3.1}/examples/get_started/torch-loader.py +0 -0
  86. {datachain-0.3.0 → datachain-0.3.1}/examples/get_started/udfs/parallel.py +0 -0
  87. {datachain-0.3.0 → datachain-0.3.1}/examples/get_started/udfs/simple.py +0 -0
  88. {datachain-0.3.0 → datachain-0.3.1}/examples/get_started/udfs/stateful.py +0 -0
  89. {datachain-0.3.0 → datachain-0.3.1}/examples/llm/llm_chatbot_evaluation.ipynb +0 -0
  90. {datachain-0.3.0 → datachain-0.3.1}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
  91. {datachain-0.3.0 → datachain-0.3.1}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
  92. {datachain-0.3.0 → datachain-0.3.1}/examples/llm_and_nlp/llm-claude.py +0 -0
  93. {datachain-0.3.0 → datachain-0.3.1}/examples/llm_and_nlp/unstructured-text.py +0 -0
  94. {datachain-0.3.0 → datachain-0.3.1}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
  95. {datachain-0.3.0 → datachain-0.3.1}/examples/multimodal/clip_inference.py +0 -0
  96. {datachain-0.3.0 → datachain-0.3.1}/examples/multimodal/hf_pipeline.py +0 -0
  97. {datachain-0.3.0 → datachain-0.3.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  98. {datachain-0.3.0 → datachain-0.3.1}/examples/multimodal/wds.py +0 -0
  99. {datachain-0.3.0 → datachain-0.3.1}/examples/multimodal/wds_filtered.py +0 -0
  100. {datachain-0.3.0 → datachain-0.3.1}/mkdocs.yml +0 -0
  101. {datachain-0.3.0 → datachain-0.3.1}/noxfile.py +0 -0
  102. {datachain-0.3.0 → datachain-0.3.1}/pyproject.toml +0 -0
  103. {datachain-0.3.0 → datachain-0.3.1}/setup.cfg +0 -0
  104. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/__init__.py +0 -0
  105. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/__main__.py +0 -0
  106. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/asyn.py +0 -0
  107. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/cache.py +0 -0
  108. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/catalog/__init__.py +0 -0
  109. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/catalog/datasource.py +0 -0
  110. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/catalog/loader.py +0 -0
  111. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/catalog/subclass.py +0 -0
  112. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/cli.py +0 -0
  113. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/cli_utils.py +0 -0
  114. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/client/__init__.py +0 -0
  115. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/client/azure.py +0 -0
  116. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/client/fileslice.py +0 -0
  117. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/client/gcs.py +0 -0
  118. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/client/s3.py +0 -0
  119. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/config.py +0 -0
  120. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/data_storage/__init__.py +0 -0
  121. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/data_storage/db_engine.py +0 -0
  122. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/data_storage/id_generator.py +0 -0
  123. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/data_storage/job.py +0 -0
  124. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/data_storage/metastore.py +0 -0
  125. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/data_storage/schema.py +0 -0
  126. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/data_storage/serializer.py +0 -0
  127. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/data_storage/sqlite.py +0 -0
  128. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/dataset.py +0 -0
  129. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/error.py +0 -0
  130. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/job.py +0 -0
  131. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/__init__.py +0 -0
  132. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/arrow.py +0 -0
  133. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/clip.py +0 -0
  134. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/convert/__init__.py +0 -0
  135. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/convert/flatten.py +0 -0
  136. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  137. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
  138. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/convert/unflatten.py +0 -0
  139. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  140. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/data_model.py +0 -0
  141. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/dataset_info.py +0 -0
  142. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/file.py +0 -0
  143. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/image.py +0 -0
  144. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/meta_formats.py +0 -0
  145. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/model_store.py +0 -0
  146. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/pytorch.py +0 -0
  147. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/settings.py +0 -0
  148. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/signal_schema.py +0 -0
  149. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/text.py +0 -0
  150. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/udf_signature.py +0 -0
  151. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/utils.py +0 -0
  152. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/vfile.py +0 -0
  153. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/webdataset.py +0 -0
  154. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/lib/webdataset_laion.py +0 -0
  155. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/listing.py +0 -0
  156. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/node.py +0 -0
  157. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/nodes_fetcher.py +0 -0
  158. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/nodes_thread_pool.py +0 -0
  159. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/progress.py +0 -0
  160. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/py.typed +0 -0
  161. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/query/__init__.py +0 -0
  162. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/query/builtins.py +0 -0
  163. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/query/metrics.py +0 -0
  164. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/query/params.py +0 -0
  165. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/query/schema.py +0 -0
  166. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/query/session.py +0 -0
  167. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/remote/__init__.py +0 -0
  168. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/remote/studio.py +0 -0
  169. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/__init__.py +0 -0
  170. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/default/__init__.py +0 -0
  171. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/default/base.py +0 -0
  172. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/functions/__init__.py +0 -0
  173. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/functions/array.py +0 -0
  174. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/functions/conditional.py +0 -0
  175. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/functions/path.py +0 -0
  176. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/functions/random.py +0 -0
  177. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/functions/string.py +0 -0
  178. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/selectable.py +0 -0
  179. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  180. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/sqlite/base.py +0 -0
  181. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/sqlite/types.py +0 -0
  182. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/sqlite/vector.py +0 -0
  183. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/types.py +0 -0
  184. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/sql/utils.py +0 -0
  185. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/storage.py +0 -0
  186. {datachain-0.3.0 → datachain-0.3.1}/src/datachain/torch/__init__.py +0 -0
  187. {datachain-0.3.0 → datachain-0.3.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  188. {datachain-0.3.0 → datachain-0.3.1}/src/datachain.egg-info/entry_points.txt +0 -0
  189. {datachain-0.3.0 → datachain-0.3.1}/src/datachain.egg-info/requires.txt +0 -0
  190. {datachain-0.3.0 → datachain-0.3.1}/src/datachain.egg-info/top_level.txt +0 -0
  191. {datachain-0.3.0 → datachain-0.3.1}/tests/__init__.py +0 -0
  192. {datachain-0.3.0 → datachain-0.3.1}/tests/benchmarks/__init__.py +0 -0
  193. {datachain-0.3.0 → datachain-0.3.1}/tests/benchmarks/conftest.py +0 -0
  194. {datachain-0.3.0 → datachain-0.3.1}/tests/benchmarks/test_ls.py +0 -0
  195. {datachain-0.3.0 → datachain-0.3.1}/tests/benchmarks/test_version.py +0 -0
  196. {datachain-0.3.0 → datachain-0.3.1}/tests/data.py +0 -0
  197. {datachain-0.3.0 → datachain-0.3.1}/tests/examples/__init__.py +0 -0
  198. {datachain-0.3.0 → datachain-0.3.1}/tests/examples/test_wds_e2e.py +0 -0
  199. {datachain-0.3.0 → datachain-0.3.1}/tests/examples/wds_data.py +0 -0
  200. {datachain-0.3.0 → datachain-0.3.1}/tests/func/__init__.py +0 -0
  201. {datachain-0.3.0 → datachain-0.3.1}/tests/func/test_client.py +0 -0
  202. {datachain-0.3.0 → datachain-0.3.1}/tests/func/test_datachain.py +0 -0
  203. {datachain-0.3.0 → datachain-0.3.1}/tests/func/test_dataset_query.py +0 -0
  204. {datachain-0.3.0 → datachain-0.3.1}/tests/func/test_datasets.py +0 -0
  205. {datachain-0.3.0 → datachain-0.3.1}/tests/func/test_feature_pickling.py +0 -0
  206. {datachain-0.3.0 → datachain-0.3.1}/tests/func/test_ls.py +0 -0
  207. {datachain-0.3.0 → datachain-0.3.1}/tests/func/test_pytorch.py +0 -0
  208. {datachain-0.3.0 → datachain-0.3.1}/tests/func/test_query.py +0 -0
  209. {datachain-0.3.0 → datachain-0.3.1}/tests/scripts/feature_class.py +0 -0
  210. {datachain-0.3.0 → datachain-0.3.1}/tests/scripts/feature_class_parallel.py +0 -0
  211. {datachain-0.3.0 → datachain-0.3.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  212. {datachain-0.3.0 → datachain-0.3.1}/tests/scripts/name_len_slow.py +0 -0
  213. {datachain-0.3.0 → datachain-0.3.1}/tests/test_cli_e2e.py +0 -0
  214. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/__init__.py +0 -0
  215. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/__init__.py +0 -0
  216. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/conftest.py +0 -0
  217. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_arrow.py +0 -0
  218. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_clip.py +0 -0
  219. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  220. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  221. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_feature.py +0 -0
  222. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_feature_utils.py +0 -0
  223. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_file.py +0 -0
  224. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_image.py +0 -0
  225. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_signal_schema.py +0 -0
  226. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_sql_to_python.py +0 -0
  227. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_text.py +0 -0
  228. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_udf_signature.py +0 -0
  229. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_utils.py +0 -0
  230. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/lib/test_webdataset.py +0 -0
  231. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/sql/__init__.py +0 -0
  232. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  233. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  234. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/sql/test_array.py +0 -0
  235. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/sql/test_conditional.py +0 -0
  236. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/sql/test_path.py +0 -0
  237. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/sql/test_random.py +0 -0
  238. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/sql/test_selectable.py +0 -0
  239. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/sql/test_string.py +0 -0
  240. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_asyn.py +0 -0
  241. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_cache.py +0 -0
  242. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_catalog.py +0 -0
  243. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_catalog_loader.py +0 -0
  244. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_cli_parsing.py +0 -0
  245. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_client_s3.py +0 -0
  246. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_data_storage.py +0 -0
  247. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_dataset.py +0 -0
  248. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_dispatch.py +0 -0
  249. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_fileslice.py +0 -0
  250. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_metastore.py +0 -0
  251. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_module_exports.py +0 -0
  252. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_query_metrics.py +0 -0
  253. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_query_params.py +0 -0
  254. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_serializer.py +0 -0
  255. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_session.py +0 -0
  256. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_utils.py +0 -0
  257. {datachain-0.3.0 → datachain-0.3.1}/tests/unit/test_warehouse.py +0 -0
@@ -8,6 +8,7 @@ on:
8
8
 
9
9
  env:
10
10
  FORCE_COLOR: "1"
11
+ BRANCH: ${{ github.head_ref || github.ref_name }}
11
12
 
12
13
  concurrency:
13
14
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -144,12 +145,30 @@ jobs:
144
145
  - 6379:6379
145
146
  steps:
146
147
 
148
+ - name: Studio branch name
149
+ env:
150
+ BRANCH: ${{ env.BRANCH }}
151
+ STUDIO_READ_ACCESS_TOKEN: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
152
+ run: |
153
+ echo "DataChain branch: $BRANCH"
154
+ if [[ "$BRANCH" == "main" ]]
155
+ then
156
+ STUDIO_BRANCH=develop
157
+ elif git ls-remote --heads https://"$STUDIO_READ_ACCESS_TOKEN"@github.com/iterative/studio.git "$BRANCH" | grep -F "$BRANCH" 2>&1>/dev/null
158
+ then
159
+ STUDIO_BRANCH="$BRANCH"
160
+ else
161
+ STUDIO_BRANCH=develop
162
+ fi
163
+ echo "STUDIO_BRANCH=$STUDIO_BRANCH" >> $GITHUB_ENV
164
+ echo "Studio branch: $STUDIO_BRANCH"
165
+
147
166
  - name: Check out Studio
148
167
  uses: actions/checkout@v4
149
168
  with:
150
169
  fetch-depth: 0
151
170
  repository: iterative/studio
152
- ref: develop
171
+ ref: ${{ env.STUDIO_BRANCH }}
153
172
  token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
154
173
 
155
174
  - name: Check out repository
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -676,7 +676,7 @@ class Catalog:
676
676
 
677
677
  def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
678
678
  config = config or self.client_config
679
- return Client.parse_url(uri, self.metastore, self.cache, **config)
679
+ return Client.parse_url(uri, self.cache, **config)
680
680
 
681
681
  def get_client(self, uri: StorageURI, **config: Any) -> Client:
682
682
  """
@@ -37,7 +37,6 @@ from datachain.storage import StorageURI
37
37
  if TYPE_CHECKING:
38
38
  from fsspec.spec import AbstractFileSystem
39
39
 
40
- from datachain.data_storage import AbstractMetastore
41
40
 
42
41
  logger = logging.getLogger("datachain")
43
42
 
@@ -116,13 +115,12 @@ class Client(ABC):
116
115
  @staticmethod
117
116
  def parse_url(
118
117
  source: str,
119
- metastore: "AbstractMetastore",
120
118
  cache: DataChainCache,
121
119
  **kwargs,
122
120
  ) -> tuple["Client", str]:
123
121
  cls = Client.get_implementation(source)
124
122
  storage_url, rel_path = cls.split_url(source)
125
- client = cls.from_name(storage_url, metastore, cache, kwargs)
123
+ client = cls.from_name(storage_url, cache, kwargs)
126
124
  return client, rel_path
127
125
 
128
126
  @classmethod
@@ -136,7 +134,6 @@ class Client(ABC):
136
134
  def from_name(
137
135
  cls,
138
136
  name: str,
139
- metastore: "AbstractMetastore",
140
137
  cache: DataChainCache,
141
138
  kwargs: dict[str, Any],
142
139
  ) -> "Client":
@@ -2,7 +2,7 @@ import os
2
2
  import posixpath
3
3
  from datetime import datetime, timezone
4
4
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Any
5
+ from typing import Any
6
6
  from urllib.parse import urlparse
7
7
 
8
8
  from fsspec.implementations.local import LocalFileSystem
@@ -12,9 +12,6 @@ from datachain.storage import StorageURI
12
12
 
13
13
  from .fsspec import Client
14
14
 
15
- if TYPE_CHECKING:
16
- from datachain.data_storage import AbstractMetastore
17
-
18
15
 
19
16
  class FileClient(Client):
20
17
  FS_CLASS = LocalFileSystem
@@ -97,9 +94,7 @@ class FileClient(Client):
97
94
  return cls.root_dir(), uri.removeprefix(cls.root_path().as_uri())
98
95
 
99
96
  @classmethod
100
- def from_name(
101
- cls, name: str, metastore: "AbstractMetastore", cache, kwargs
102
- ) -> "FileClient":
97
+ def from_name(cls, name: str, cache, kwargs) -> "FileClient":
103
98
  use_symlinks = kwargs.pop("use_symlinks", False)
104
99
  return cls(name, kwargs, cache, use_symlinks=use_symlinks)
105
100
 
@@ -17,7 +17,7 @@ from sqlalchemy.sql.expression import true
17
17
 
18
18
  from datachain.client import Client
19
19
  from datachain.data_storage.serializer import Serializable
20
- from datachain.dataset import DatasetRecord, RowDict
20
+ from datachain.dataset import DatasetRecord
21
21
  from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
22
22
  from datachain.sql.functions import path as pathfunc
23
23
  from datachain.sql.types import Int, SQLType
@@ -201,23 +201,17 @@ class AbstractWarehouse(ABC, Serializable):
201
201
  def dataset_select_paginated(
202
202
  self,
203
203
  query,
204
- limit: Optional[int] = None,
205
- order_by: tuple["ColumnElement[Any]", ...] = (),
206
204
  page_size: int = SELECT_BATCH_SIZE,
207
- ) -> Generator[RowDict, None, None]:
205
+ ) -> Generator[Sequence, None, None]:
208
206
  """
209
207
  This is equivalent to `db.execute`, but for selecting rows in batches
210
208
  """
211
- cols = query.selected_columns
212
- cols_names = [c.name for c in cols]
209
+ limit = query._limit
210
+ paginated_query = query.limit(page_size)
213
211
 
214
- if not order_by:
215
- ordering = [cols.sys__id]
216
- else:
217
- ordering = order_by # type: ignore[assignment]
218
-
219
- # reset query order by and apply new order by id
220
- paginated_query = query.order_by(None).order_by(*ordering).limit(page_size)
212
+ if not paginated_query._order_by_clauses:
213
+ # default order by is order by `sys__id`
214
+ paginated_query = paginated_query.order_by(query.selected_columns.sys__id)
221
215
 
222
216
  results = None
223
217
  offset = 0
@@ -236,7 +230,7 @@ class AbstractWarehouse(ABC, Serializable):
236
230
  processed = False
237
231
  for row in results:
238
232
  processed = True
239
- yield RowDict(zip(cols_names, row))
233
+ yield row
240
234
  num_yielded += 1
241
235
 
242
236
  if not processed:
@@ -1623,7 +1623,7 @@ class DataChain(DatasetQuery):
1623
1623
 
1624
1624
  Using glob to match patterns
1625
1625
  ```py
1626
- dc.filter(C("file.name").glob("*.jpg))
1626
+ dc.filter(C("file.name").glob("*.jpg"))
1627
1627
  ```
1628
1628
 
1629
1629
  Using `datachain.sql.functions`
@@ -1,6 +1,5 @@
1
1
  import sys
2
2
  import traceback
3
- from collections.abc import Iterable, Iterator
4
3
  from typing import TYPE_CHECKING, Callable, Optional
5
4
 
6
5
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -14,16 +13,19 @@ from datachain.lib.model_store import ModelStore
14
13
  from datachain.lib.signal_schema import SignalSchema
15
14
  from datachain.lib.udf_signature import UdfSignature
16
15
  from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
17
- from datachain.query.batch import RowBatch
16
+ from datachain.query.batch import UDFInputBatch
18
17
  from datachain.query.schema import ColumnParameter
19
18
  from datachain.query.udf import UDFBase as _UDFBase
20
- from datachain.query.udf import UDFProperties, UDFResult
19
+ from datachain.query.udf import UDFProperties
21
20
 
22
21
  if TYPE_CHECKING:
22
+ from collections.abc import Iterable, Iterator, Sequence
23
+
23
24
  from typing_extensions import Self
24
25
 
25
26
  from datachain.catalog import Catalog
26
- from datachain.query.batch import BatchingResult
27
+ from datachain.query.batch import RowsOutput, UDFInput
28
+ from datachain.query.udf import UDFResult
27
29
 
28
30
 
29
31
  class UdfError(DataChainParamsError):
@@ -42,22 +44,27 @@ class UDFAdapter(_UDFBase):
42
44
 
43
45
  def run(
44
46
  self,
45
- udf_inputs: "Iterable[BatchingResult]",
47
+ udf_fields: "Sequence[str]",
48
+ udf_inputs: "Iterable[RowsOutput]",
46
49
  catalog: "Catalog",
47
50
  is_generator: bool,
48
51
  cache: bool,
49
52
  download_cb: Callback = DEFAULT_CALLBACK,
50
53
  processed_cb: Callback = DEFAULT_CALLBACK,
51
- ) -> Iterator[Iterable["UDFResult"]]:
54
+ ) -> "Iterator[Iterable[UDFResult]]":
52
55
  self.inner._catalog = catalog
53
56
  if hasattr(self.inner, "setup") and callable(self.inner.setup):
54
57
  self.inner.setup()
55
58
 
56
- for batch in udf_inputs:
57
- n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
58
- output = self.run_once(catalog, batch, is_generator, cache, cb=download_cb)
59
- processed_cb.relative_update(n_rows)
60
- yield output
59
+ yield from super().run(
60
+ udf_fields,
61
+ udf_inputs,
62
+ catalog,
63
+ is_generator,
64
+ cache,
65
+ download_cb,
66
+ processed_cb,
67
+ )
61
68
 
62
69
  if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
63
70
  self.inner.teardown()
@@ -65,12 +72,12 @@ class UDFAdapter(_UDFBase):
65
72
  def run_once(
66
73
  self,
67
74
  catalog: "Catalog",
68
- arg: "BatchingResult",
75
+ arg: "UDFInput",
69
76
  is_generator: bool = False,
70
77
  cache: bool = False,
71
78
  cb: Callback = DEFAULT_CALLBACK,
72
- ) -> Iterable[UDFResult]:
73
- if isinstance(arg, RowBatch):
79
+ ) -> "Iterable[UDFResult]":
80
+ if isinstance(arg, UDFInputBatch):
74
81
  udf_inputs = [
75
82
  self.bind_parameters(catalog, row, cache=cache, cb=cb)
76
83
  for row in arg.rows
@@ -5,21 +5,29 @@ from collections.abc import Generator, Sequence
5
5
  from dataclasses import dataclass
6
6
  from typing import TYPE_CHECKING, Callable, Optional, Union
7
7
 
8
- import sqlalchemy as sa
9
-
10
8
  from datachain.data_storage.schema import PARTITION_COLUMN_ID
11
9
  from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
12
10
 
13
11
  if TYPE_CHECKING:
12
+ from sqlalchemy import Select
13
+
14
14
  from datachain.dataset import RowDict
15
15
 
16
16
 
17
17
  @dataclass
18
- class RowBatch:
18
+ class RowsOutputBatch:
19
+ rows: Sequence[Sequence]
20
+
21
+
22
+ RowsOutput = Union[Sequence, RowsOutputBatch]
23
+
24
+
25
+ @dataclass
26
+ class UDFInputBatch:
19
27
  rows: Sequence["RowDict"]
20
28
 
21
29
 
22
- BatchingResult = Union["RowDict", RowBatch]
30
+ UDFInput = Union["RowDict", UDFInputBatch]
23
31
 
24
32
 
25
33
  class BatchingStrategy(ABC):
@@ -28,9 +36,9 @@ class BatchingStrategy(ABC):
28
36
  @abstractmethod
29
37
  def __call__(
30
38
  self,
31
- execute: Callable,
32
- query: sa.sql.selectable.Select,
33
- ) -> Generator[BatchingResult, None, None]:
39
+ execute: Callable[..., Generator[Sequence, None, None]],
40
+ query: "Select",
41
+ ) -> Generator[RowsOutput, None, None]:
34
42
  """Apply the provided parameters to the UDF."""
35
43
 
36
44
 
@@ -42,10 +50,10 @@ class NoBatching(BatchingStrategy):
42
50
 
43
51
  def __call__(
44
52
  self,
45
- execute: Callable,
46
- query: sa.sql.selectable.Select,
47
- ) -> Generator["RowDict", None, None]:
48
- return execute(query, limit=query._limit, order_by=query._order_by_clauses)
53
+ execute: Callable[..., Generator[Sequence, None, None]],
54
+ query: "Select",
55
+ ) -> Generator[Sequence, None, None]:
56
+ return execute(query)
49
57
 
50
58
 
51
59
  class Batch(BatchingStrategy):
@@ -59,31 +67,24 @@ class Batch(BatchingStrategy):
59
67
 
60
68
  def __call__(
61
69
  self,
62
- execute: Callable,
63
- query: sa.sql.selectable.Select,
64
- ) -> Generator[RowBatch, None, None]:
70
+ execute: Callable[..., Generator[Sequence, None, None]],
71
+ query: "Select",
72
+ ) -> Generator[RowsOutputBatch, None, None]:
65
73
  # choose page size that is a multiple of the batch size
66
74
  page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count
67
75
 
68
76
  # select rows in batches
69
- results: list[RowDict] = []
70
-
71
- with contextlib.closing(
72
- execute(
73
- query,
74
- page_size=page_size,
75
- limit=query._limit,
76
- order_by=query._order_by_clauses,
77
- )
78
- ) as rows:
77
+ results: list[Sequence] = []
78
+
79
+ with contextlib.closing(execute(query, page_size=page_size)) as rows:
79
80
  for row in rows:
80
81
  results.append(row)
81
82
  if len(results) >= self.count:
82
83
  batch, results = results[: self.count], results[self.count :]
83
- yield RowBatch(batch)
84
+ yield RowsOutputBatch(batch)
84
85
 
85
86
  if len(results) > 0:
86
- yield RowBatch(results)
87
+ yield RowsOutputBatch(results)
87
88
 
88
89
 
89
90
  class Partition(BatchingStrategy):
@@ -95,27 +96,30 @@ class Partition(BatchingStrategy):
95
96
 
96
97
  def __call__(
97
98
  self,
98
- execute: Callable,
99
- query: sa.sql.selectable.Select,
100
- ) -> Generator[RowBatch, None, None]:
99
+ execute: Callable[..., Generator[Sequence, None, None]],
100
+ query: "Select",
101
+ ) -> Generator[RowsOutputBatch, None, None]:
101
102
  current_partition: Optional[int] = None
102
- batch: list[RowDict] = []
103
-
104
- with contextlib.closing(
105
- execute(
106
- query,
107
- order_by=(PARTITION_COLUMN_ID, "sys__id", *query._order_by_clauses),
108
- limit=query._limit,
109
- )
110
- ) as rows:
103
+ batch: list[Sequence] = []
104
+
105
+ query_fields = [str(c.name) for c in query.selected_columns]
106
+ partition_column_idx = query_fields.index(PARTITION_COLUMN_ID)
107
+
108
+ ordered_query = query.order_by(None).order_by(
109
+ PARTITION_COLUMN_ID,
110
+ "sys__id",
111
+ *query._order_by_clauses,
112
+ )
113
+
114
+ with contextlib.closing(execute(ordered_query)) as rows:
111
115
  for row in rows:
112
- partition = row[PARTITION_COLUMN_ID]
116
+ partition = row[partition_column_idx]
113
117
  if current_partition != partition:
114
118
  current_partition = partition
115
119
  if len(batch) > 0:
116
- yield RowBatch(batch)
120
+ yield RowsOutputBatch(batch)
117
121
  batch = []
118
122
  batch.append(row)
119
123
 
120
124
  if len(batch) > 0:
121
- yield RowBatch(batch)
125
+ yield RowsOutputBatch(batch)
@@ -461,6 +461,8 @@ class UDFStep(Step, ABC):
461
461
 
462
462
  processes = determine_processes(self.parallel)
463
463
 
464
+ udf_fields = [str(c.name) for c in query.selected_columns]
465
+
464
466
  try:
465
467
  if workers:
466
468
  from datachain.catalog.loader import get_distributed_class
@@ -473,6 +475,7 @@ class UDFStep(Step, ABC):
473
475
  query,
474
476
  workers,
475
477
  processes,
478
+ udf_fields=udf_fields,
476
479
  is_generator=self.is_generator,
477
480
  use_partitioning=use_partitioning,
478
481
  cache=self.cache,
@@ -489,6 +492,7 @@ class UDFStep(Step, ABC):
489
492
  "warehouse_clone_params": self.catalog.warehouse.clone_params(),
490
493
  "table": udf_table,
491
494
  "query": query,
495
+ "udf_fields": udf_fields,
492
496
  "batching": batching,
493
497
  "processes": processes,
494
498
  "is_generator": self.is_generator,
@@ -528,6 +532,7 @@ class UDFStep(Step, ABC):
528
532
  generated_cb = get_generated_callback(self.is_generator)
529
533
  try:
530
534
  udf_results = udf.run(
535
+ udf_fields,
531
536
  udf_inputs,
532
537
  self.catalog,
533
538
  self.is_generator,
@@ -1244,21 +1249,23 @@ class DatasetQuery:
1244
1249
  actual_params = [normalize_param(p) for p in params]
1245
1250
  try:
1246
1251
  query = self.apply_steps().select()
1252
+ query_fields = [str(c.name) for c in query.selected_columns]
1247
1253
 
1248
- def row_iter() -> Generator[RowDict, None, None]:
1254
+ def row_iter() -> Generator[Sequence, None, None]:
1249
1255
  # warehouse isn't threadsafe, we need to clone() it
1250
1256
  # in the thread that uses the results
1251
1257
  with self.catalog.warehouse.clone() as warehouse:
1252
- gen = warehouse.dataset_select_paginated(
1253
- query, limit=query._limit, order_by=query._order_by_clauses
1254
- )
1258
+ gen = warehouse.dataset_select_paginated(query)
1255
1259
  with contextlib.closing(gen) as rows:
1256
1260
  yield from rows
1257
1261
 
1258
- async def get_params(row: RowDict) -> tuple:
1262
+ async def get_params(row: Sequence) -> tuple:
1263
+ row_dict = RowDict(zip(query_fields, row))
1259
1264
  return tuple(
1260
1265
  [
1261
- await p.get_value_async(self.catalog, row, mapper, **kwargs)
1266
+ await p.get_value_async(
1267
+ self.catalog, row_dict, mapper, **kwargs
1268
+ )
1262
1269
  for p in actual_params
1263
1270
  ]
1264
1271
  )