datachain-0.2.13.tar.gz → datachain-0.2.14.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic.

Files changed (255)
  1. {datachain-0.2.13/src/datachain.egg-info → datachain-0.2.14}/PKG-INFO +2 -3
  2. {datachain-0.2.13 → datachain-0.2.14}/docs/index.md +2 -2
  3. {datachain-0.2.13 → datachain-0.2.14}/pyproject.toml +7 -3
  4. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/metastore.py +0 -4
  5. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/schema.py +7 -3
  6. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/sqlite.py +1 -4
  7. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/warehouse.py +1 -24
  8. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/flatten.py +4 -4
  9. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/values_to_tuples.py +4 -1
  10. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/dc.py +100 -5
  11. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/file.py +6 -11
  12. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/meta_formats.py +6 -5
  13. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/dataset.py +19 -21
  14. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/sqlite/base.py +3 -3
  15. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/sqlite/types.py +5 -13
  16. {datachain-0.2.13 → datachain-0.2.14/src/datachain.egg-info}/PKG-INFO +2 -3
  17. {datachain-0.2.13 → datachain-0.2.14}/src/datachain.egg-info/requires.txt +1 -2
  18. {datachain-0.2.13 → datachain-0.2.14}/tests/examples/test_wds_e2e.py +1 -1
  19. {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_datachain.py +2 -6
  20. {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_datasets.py +7 -6
  21. {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_feature_pickling.py +10 -3
  22. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_datachain.py +56 -0
  23. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_datachain_merge.py +19 -19
  24. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_feature.py +7 -7
  25. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_module_exports.py +25 -18
  26. {datachain-0.2.13 → datachain-0.2.14}/.cruft.json +0 -0
  27. {datachain-0.2.13 → datachain-0.2.14}/.gitattributes +0 -0
  28. {datachain-0.2.13 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  29. {datachain-0.2.13 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  30. {datachain-0.2.13 → datachain-0.2.14}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  31. {datachain-0.2.13 → datachain-0.2.14}/.github/codecov.yaml +0 -0
  32. {datachain-0.2.13 → datachain-0.2.14}/.github/dependabot.yml +0 -0
  33. {datachain-0.2.13 → datachain-0.2.14}/.github/workflows/benchmarks.yml +0 -0
  34. {datachain-0.2.13 → datachain-0.2.14}/.github/workflows/release.yml +0 -0
  35. {datachain-0.2.13 → datachain-0.2.14}/.github/workflows/tests.yml +0 -0
  36. {datachain-0.2.13 → datachain-0.2.14}/.github/workflows/update-template.yaml +0 -0
  37. {datachain-0.2.13 → datachain-0.2.14}/.gitignore +0 -0
  38. {datachain-0.2.13 → datachain-0.2.14}/.pre-commit-config.yaml +0 -0
  39. {datachain-0.2.13 → datachain-0.2.14}/CODE_OF_CONDUCT.rst +0 -0
  40. {datachain-0.2.13 → datachain-0.2.14}/CONTRIBUTING.rst +0 -0
  41. {datachain-0.2.13 → datachain-0.2.14}/LICENSE +0 -0
  42. {datachain-0.2.13 → datachain-0.2.14}/README.rst +0 -0
  43. {datachain-0.2.13 → datachain-0.2.14}/docs/assets/captioned_cartoons.png +0 -0
  44. {datachain-0.2.13 → datachain-0.2.14}/docs/assets/datachain.png +0 -0
  45. {datachain-0.2.13 → datachain-0.2.14}/docs/assets/flowchart.png +0 -0
  46. {datachain-0.2.13 → datachain-0.2.14}/docs/references/datachain.md +0 -0
  47. {datachain-0.2.13 → datachain-0.2.14}/docs/references/datatype.md +0 -0
  48. {datachain-0.2.13 → datachain-0.2.14}/docs/references/file.md +0 -0
  49. {datachain-0.2.13 → datachain-0.2.14}/docs/references/index.md +0 -0
  50. {datachain-0.2.13 → datachain-0.2.14}/docs/references/sql.md +0 -0
  51. {datachain-0.2.13 → datachain-0.2.14}/docs/references/torch.md +0 -0
  52. {datachain-0.2.13 → datachain-0.2.14}/docs/references/udf.md +0 -0
  53. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
  54. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
  55. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
  56. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
  57. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
  58. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
  59. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/README.md +0 -0
  60. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
  61. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
  62. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
  63. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
  64. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
  65. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
  66. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
  67. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
  68. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  69. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  70. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  71. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  72. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  73. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  74. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  75. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  76. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  77. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  78. {datachain-0.2.13 → datachain-0.2.14}/examples/computer_vision/openimage-detect.py +0 -0
  79. {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/common_sql_functions.py +0 -0
  80. {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/json-csv-reader.py +0 -0
  81. {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/torch-loader.py +0 -0
  82. {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/udfs/parallel.py +0 -0
  83. {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/udfs/simple.py +0 -0
  84. {datachain-0.2.13 → datachain-0.2.14}/examples/get_started/udfs/stateful.py +0 -0
  85. {datachain-0.2.13 → datachain-0.2.14}/examples/llm/llm_chatbot_evaluation.ipynb +0 -0
  86. {datachain-0.2.13 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
  87. {datachain-0.2.13 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
  88. {datachain-0.2.13 → datachain-0.2.14}/examples/llm_and_nlp/llm-claude.py +0 -0
  89. {datachain-0.2.13 → datachain-0.2.14}/examples/llm_and_nlp/unstructured-text.py +0 -0
  90. {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/clip.py +0 -0
  91. {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
  92. {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/hf_pipeline.py +0 -0
  93. {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/openai_image_desc_lib.py +0 -0
  94. {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/wds.py +0 -0
  95. {datachain-0.2.13 → datachain-0.2.14}/examples/multimodal/wds_filtered.py +0 -0
  96. {datachain-0.2.13 → datachain-0.2.14}/mkdocs.yml +0 -0
  97. {datachain-0.2.13 → datachain-0.2.14}/noxfile.py +0 -0
  98. {datachain-0.2.13 → datachain-0.2.14}/setup.cfg +0 -0
  99. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/__init__.py +0 -0
  100. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/__main__.py +0 -0
  101. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/asyn.py +0 -0
  102. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/cache.py +0 -0
  103. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/catalog/__init__.py +0 -0
  104. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/catalog/catalog.py +0 -0
  105. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/catalog/datasource.py +0 -0
  106. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/catalog/loader.py +0 -0
  107. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/catalog/subclass.py +0 -0
  108. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/cli.py +0 -0
  109. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/cli_utils.py +0 -0
  110. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/__init__.py +0 -0
  111. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/azure.py +0 -0
  112. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/fileslice.py +0 -0
  113. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/fsspec.py +0 -0
  114. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/gcs.py +0 -0
  115. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/local.py +0 -0
  116. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/client/s3.py +0 -0
  117. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/config.py +0 -0
  118. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/__init__.py +0 -0
  119. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/db_engine.py +0 -0
  120. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/id_generator.py +0 -0
  121. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/job.py +0 -0
  122. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/data_storage/serializer.py +0 -0
  123. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/dataset.py +0 -0
  124. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/error.py +0 -0
  125. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/job.py +0 -0
  126. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/__init__.py +0 -0
  127. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/arrow.py +0 -0
  128. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/clip.py +0 -0
  129. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/__init__.py +0 -0
  130. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/python_to_sql.py +0 -0
  131. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/sql_to_python.py +0 -0
  132. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/convert/unflatten.py +0 -0
  133. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/data_model.py +0 -0
  134. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/dataset_info.py +0 -0
  135. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/image.py +0 -0
  136. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/model_store.py +0 -0
  137. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/pytorch.py +0 -0
  138. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/settings.py +0 -0
  139. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/signal_schema.py +0 -0
  140. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/text.py +0 -0
  141. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/udf.py +0 -0
  142. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/udf_signature.py +0 -0
  143. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/utils.py +0 -0
  144. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/vfile.py +0 -0
  145. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/webdataset.py +0 -0
  146. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/lib/webdataset_laion.py +0 -0
  147. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/listing.py +0 -0
  148. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/node.py +0 -0
  149. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/nodes_fetcher.py +0 -0
  150. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/nodes_thread_pool.py +0 -0
  151. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/progress.py +0 -0
  152. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/py.typed +0 -0
  153. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/__init__.py +0 -0
  154. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/batch.py +0 -0
  155. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/builtins.py +0 -0
  156. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/dispatch.py +0 -0
  157. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/metrics.py +0 -0
  158. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/params.py +0 -0
  159. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/schema.py +0 -0
  160. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/session.py +0 -0
  161. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/query/udf.py +0 -0
  162. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/remote/__init__.py +0 -0
  163. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/remote/studio.py +0 -0
  164. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/__init__.py +0 -0
  165. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/default/__init__.py +0 -0
  166. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/default/base.py +0 -0
  167. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/__init__.py +0 -0
  168. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/array.py +0 -0
  169. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/conditional.py +0 -0
  170. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/path.py +0 -0
  171. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/random.py +0 -0
  172. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/functions/string.py +0 -0
  173. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/selectable.py +0 -0
  174. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/sqlite/__init__.py +0 -0
  175. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/sqlite/vector.py +0 -0
  176. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/types.py +0 -0
  177. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/sql/utils.py +0 -0
  178. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/storage.py +0 -0
  179. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/torch/__init__.py +0 -0
  180. {datachain-0.2.13 → datachain-0.2.14}/src/datachain/utils.py +0 -0
  181. {datachain-0.2.13 → datachain-0.2.14}/src/datachain.egg-info/SOURCES.txt +0 -0
  182. {datachain-0.2.13 → datachain-0.2.14}/src/datachain.egg-info/dependency_links.txt +0 -0
  183. {datachain-0.2.13 → datachain-0.2.14}/src/datachain.egg-info/entry_points.txt +0 -0
  184. {datachain-0.2.13 → datachain-0.2.14}/src/datachain.egg-info/top_level.txt +0 -0
  185. {datachain-0.2.13 → datachain-0.2.14}/tests/__init__.py +0 -0
  186. {datachain-0.2.13 → datachain-0.2.14}/tests/benchmarks/__init__.py +0 -0
  187. {datachain-0.2.13 → datachain-0.2.14}/tests/benchmarks/conftest.py +0 -0
  188. {datachain-0.2.13 → datachain-0.2.14}/tests/benchmarks/test_ls.py +0 -0
  189. {datachain-0.2.13 → datachain-0.2.14}/tests/benchmarks/test_version.py +0 -0
  190. {datachain-0.2.13 → datachain-0.2.14}/tests/conftest.py +0 -0
  191. {datachain-0.2.13 → datachain-0.2.14}/tests/data.py +0 -0
  192. {datachain-0.2.13 → datachain-0.2.14}/tests/examples/__init__.py +0 -0
  193. {datachain-0.2.13 → datachain-0.2.14}/tests/examples/wds_data.py +0 -0
  194. {datachain-0.2.13 → datachain-0.2.14}/tests/func/__init__.py +0 -0
  195. {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_catalog.py +0 -0
  196. {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_client.py +0 -0
  197. {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_dataset_query.py +0 -0
  198. {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_ls.py +0 -0
  199. {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_pull.py +0 -0
  200. {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_pytorch.py +0 -0
  201. {datachain-0.2.13 → datachain-0.2.14}/tests/func/test_query.py +0 -0
  202. {datachain-0.2.13 → datachain-0.2.14}/tests/scripts/feature_class.py +0 -0
  203. {datachain-0.2.13 → datachain-0.2.14}/tests/scripts/feature_class_parallel.py +0 -0
  204. {datachain-0.2.13 → datachain-0.2.14}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  205. {datachain-0.2.13 → datachain-0.2.14}/tests/scripts/name_len_normal.py +0 -0
  206. {datachain-0.2.13 → datachain-0.2.14}/tests/scripts/name_len_slow.py +0 -0
  207. {datachain-0.2.13 → datachain-0.2.14}/tests/test_cli_e2e.py +0 -0
  208. {datachain-0.2.13 → datachain-0.2.14}/tests/test_query_e2e.py +0 -0
  209. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/__init__.py +0 -0
  210. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/__init__.py +0 -0
  211. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/conftest.py +0 -0
  212. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_arrow.py +0 -0
  213. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_clip.py +0 -0
  214. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  215. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_feature_utils.py +0 -0
  216. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_file.py +0 -0
  217. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_image.py +0 -0
  218. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_signal_schema.py +0 -0
  219. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_text.py +0 -0
  220. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_udf_signature.py +0 -0
  221. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_utils.py +0 -0
  222. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/lib/test_webdataset.py +0 -0
  223. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/__init__.py +0 -0
  224. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/sqlite/__init__.py +0 -0
  225. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/sqlite/test_utils.py +0 -0
  226. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_array.py +0 -0
  227. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_conditional.py +0 -0
  228. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_path.py +0 -0
  229. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_random.py +0 -0
  230. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_selectable.py +0 -0
  231. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/sql/test_string.py +0 -0
  232. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_asyn.py +0 -0
  233. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_cache.py +0 -0
  234. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_catalog.py +0 -0
  235. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_catalog_loader.py +0 -0
  236. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_cli_parsing.py +0 -0
  237. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_client.py +0 -0
  238. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_client_s3.py +0 -0
  239. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_data_storage.py +0 -0
  240. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_database_engine.py +0 -0
  241. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_dataset.py +0 -0
  242. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_dispatch.py +0 -0
  243. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_fileslice.py +0 -0
  244. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_id_generator.py +0 -0
  245. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_listing.py +0 -0
  246. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_metastore.py +0 -0
  247. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_query_metrics.py +0 -0
  248. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_query_params.py +0 -0
  249. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_serializer.py +0 -0
  250. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_session.py +0 -0
  251. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_storage.py +0 -0
  252. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_udf.py +0 -0
  253. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_utils.py +0 -0
  254. {datachain-0.2.13 → datachain-0.2.14}/tests/unit/test_warehouse.py +0 -0
  255. {datachain-0.2.13 → datachain-0.2.14}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.13
3
+ Version: 0.2.14
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -36,7 +36,7 @@ Requires-Dist: sqlalchemy>=2
36
36
  Requires-Dist: multiprocess==0.70.16
37
37
  Requires-Dist: dill==0.3.8
38
38
  Requires-Dist: cloudpickle
39
- Requires-Dist: ujson>=5.9.0
39
+ Requires-Dist: orjson>=3.10.5
40
40
  Requires-Dist: pydantic<3,>=2
41
41
  Requires-Dist: jmespath>=1.0
42
42
  Requires-Dist: datamodel-code-generator>=0.25
@@ -81,7 +81,6 @@ Requires-Dist: types-python-dateutil; extra == "dev"
81
81
  Requires-Dist: types-pytz; extra == "dev"
82
82
  Requires-Dist: types-PyYAML; extra == "dev"
83
83
  Requires-Dist: types-requests; extra == "dev"
84
- Requires-Dist: types-ujson; extra == "dev"
85
84
 
86
85
  |PyPI| |Python Version| |Codecov| |Tests|
87
86
 
@@ -58,8 +58,8 @@ def trim_text(text):
58
58
  match = re.search(r'[A-Z][^.]*\.', text)
59
59
  return match.group(0) if match else ''
60
60
 
61
- images = chain.collect_one("file")
62
- captions = chain.collect_one("scene")
61
+ images = chain.collect("file")
62
+ captions = chain.collect("scene")
63
63
  _ , axes = plt.subplots(1, len(captions), figsize=(15, 5))
64
64
 
65
65
  for ax, img, caption in zip(axes, images, captions):
@@ -39,7 +39,7 @@ dependencies = [
39
39
  "multiprocess==0.70.16",
40
40
  "dill==0.3.8",
41
41
  "cloudpickle",
42
- "ujson>=5.9.0",
42
+ "orjson>=3.10.5",
43
43
  "pydantic>=2,<3",
44
44
  "jmespath>=1.0",
45
45
  "datamodel-code-generator>=0.25",
@@ -91,8 +91,7 @@ dev = [
91
91
  "types-python-dateutil",
92
92
  "types-pytz",
93
93
  "types-PyYAML",
94
- "types-requests",
95
- "types-ujson"
94
+ "types-requests"
96
95
  ]
97
96
 
98
97
  [project.urls]
@@ -118,6 +117,11 @@ markers = [
118
117
  ]
119
118
  asyncio_mode = "auto"
120
119
  filterwarnings = [
120
+ "error::pandas.errors.PerformanceWarning",
121
+ "error::pydantic.warnings.PydanticDeprecatedSince20",
122
+ "error::pytest_mock.PytestMockWarning",
123
+ "error::pytest.PytestCollectionWarning",
124
+ "error::sqlalchemy.exc.SADeprecationWarning",
121
125
  "ignore:Field name .* shadows an attribute in parent:UserWarning" # datachain.lib.feature
122
126
  ]
123
127
 
@@ -421,10 +421,6 @@ class AbstractMetastore(ABC, Serializable):
421
421
  ) -> None:
422
422
  """Set the status of the given job and dataset."""
423
423
 
424
- @abstractmethod
425
- def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
426
- """Returns the possibly stale jobs."""
427
-
428
424
 
429
425
  class AbstractDBMetastore(AbstractMetastore):
430
426
  """
@@ -19,8 +19,12 @@ from datachain.sql.types import Int, SQLType, UInt64
19
19
  if TYPE_CHECKING:
20
20
  from sqlalchemy import Engine
21
21
  from sqlalchemy.engine.interfaces import Dialect
22
- from sqlalchemy.sql.base import Executable, ReadOnlyColumnCollection
23
- from sqlalchemy.sql.elements import KeyedColumnElement
22
+ from sqlalchemy.sql.base import (
23
+ ColumnCollection,
24
+ Executable,
25
+ ReadOnlyColumnCollection,
26
+ )
27
+ from sqlalchemy.sql.elements import ColumnElement
24
28
 
25
29
 
26
30
  def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
@@ -43,7 +47,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
43
47
 
44
48
 
45
49
  def convert_rows_custom_column_types(
46
- columns: "ReadOnlyColumnCollection[str, KeyedColumnElement[Any]]",
50
+ columns: "ColumnCollection[str, ColumnElement[Any]]",
47
51
  rows: Iterator[tuple[Any, ...]],
48
52
  dialect: "Dialect",
49
53
  ):
@@ -496,9 +496,6 @@ class SQLiteMetastore(AbstractDBMetastore):
496
496
  def _jobs_insert(self) -> "Insert":
497
497
  return sqlite.insert(self._jobs)
498
498
 
499
- def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
500
- raise NotImplementedError("get_possibly_stale_jobs not implemented for SQLite")
501
-
502
499
 
503
500
  class SQLiteWarehouse(AbstractWarehouse):
504
501
  """
@@ -594,7 +591,7 @@ class SQLiteWarehouse(AbstractWarehouse):
594
591
  ):
595
592
  rows = self.db.execute(select_query, **kwargs)
596
593
  yield from convert_rows_custom_column_types(
597
- select_query.columns, rows, sqlite_dialect
594
+ select_query.selected_columns, rows, sqlite_dialect
598
595
  )
599
596
 
600
597
  def get_dataset_sources(
@@ -494,7 +494,7 @@ class AbstractWarehouse(ABC, Serializable):
494
494
  This gets nodes based on the provided query, and should be used sparingly,
495
495
  as it will be slow on any OLAP database systems.
496
496
  """
497
- columns = [c.name for c in query.columns]
497
+ columns = [c.name for c in query.selected_columns]
498
498
  for row in self.db.execute(query):
499
499
  d = dict(zip(columns, row))
500
500
  yield Node(**d)
@@ -912,29 +912,6 @@ class AbstractWarehouse(ABC, Serializable):
912
912
  for name in names:
913
913
  self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
914
914
 
915
- def subtract_query(
916
- self,
917
- source_query: sa.sql.selectable.Select,
918
- target_query: sa.sql.selectable.Select,
919
- ) -> sa.sql.selectable.Select:
920
- sq = source_query.alias("source_query")
921
- tq = target_query.alias("target_query")
922
-
923
- source_target_join = sa.join(
924
- sq,
925
- tq,
926
- (sq.c.source == tq.c.source)
927
- & (sq.c.parent == tq.c.parent)
928
- & (sq.c.name == tq.c.name),
929
- isouter=True,
930
- )
931
-
932
- return (
933
- select(*sq.c)
934
- .select_from(source_target_join)
935
- .where((tq.c.name == None) | (tq.c.name == "")) # noqa: E711
936
- )
937
-
938
915
  def changed_query(
939
916
  self,
940
917
  source_query: sa.sql.selectable.Select,
@@ -48,10 +48,10 @@ def _flatten_fields_values(fields, obj: BaseModel):
48
48
  value = getattr(obj, name)
49
49
 
50
50
  if isinstance(value, list):
51
- yield [
52
- val.model_dump() if ModelStore.is_pydantic(type(val)) else val
53
- for val in value
54
- ]
51
+ if value and ModelStore.is_pydantic(type(value[0])):
52
+ yield [val.model_dump() for val in value]
53
+ else:
54
+ yield value
55
55
  elif isinstance(value, dict):
56
56
  yield {
57
57
  key: val.model_dump() if ModelStore.is_pydantic(type(val)) else val
@@ -71,7 +71,10 @@ def values_to_tuples( # noqa: C901, PLR0912
71
71
  f"signal '{k}' has unsupported type '{typ.__name__}'."
72
72
  f" Please use DataModel types: {DataTypeNames}",
73
73
  )
74
- types_map[k] = typ
74
+ if typ is list:
75
+ types_map[k] = list[type(v[0][0])] # type: ignore[misc]
76
+ else:
77
+ types_map[k] = typ
75
78
 
76
79
  if length < 0:
77
80
  length = len_
@@ -342,7 +342,7 @@ class DataChain(DatasetQuery):
342
342
  spec: Optional[DataType] = None,
343
343
  schema_from: Optional[str] = "auto",
344
344
  jmespath: Optional[str] = None,
345
- object_name: str = "",
345
+ object_name: Optional[str] = "",
346
346
  model_name: Optional[str] = None,
347
347
  show_schema: Optional[bool] = False,
348
348
  meta_type: Optional[str] = "json",
@@ -364,12 +364,12 @@ class DataChain(DatasetQuery):
364
364
  nrows : optional row limit for jsonl and JSON arrays
365
365
 
366
366
  Example:
367
- infer JSON schema from data, reduce using JMESPATH, print schema
367
+ infer JSON schema from data, reduce using JMESPATH
368
368
  ```py
369
369
  chain = DataChain.from_json("gs://json", jmespath="key1.key2")
370
370
  ```
371
371
 
372
- infer JSON schema from a particular path, print data model
372
+ infer JSON schema from a particular path
373
373
  ```py
374
374
  chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
375
375
  ```
@@ -384,7 +384,7 @@ class DataChain(DatasetQuery):
384
384
  if (not object_name) and jmespath:
385
385
  object_name = jmespath_to_name(jmespath)
386
386
  if not object_name:
387
- object_name = "json"
387
+ object_name = meta_type
388
388
  chain = DataChain.from_storage(path=path, type=type, **kwargs)
389
389
  signal_dict = {
390
390
  object_name: read_meta(
@@ -397,7 +397,67 @@ class DataChain(DatasetQuery):
397
397
  nrows=nrows,
398
398
  )
399
399
  }
400
- return chain.gen(**signal_dict) # type: ignore[arg-type]
400
+ return chain.gen(**signal_dict) # type: ignore[misc, arg-type]
401
+
402
+ @classmethod
403
+ def from_jsonl(
404
+ cls,
405
+ path,
406
+ type: Literal["binary", "text", "image"] = "text",
407
+ spec: Optional[DataType] = None,
408
+ schema_from: Optional[str] = "auto",
409
+ jmespath: Optional[str] = None,
410
+ object_name: Optional[str] = "",
411
+ model_name: Optional[str] = None,
412
+ show_schema: Optional[bool] = False,
413
+ meta_type: Optional[str] = "jsonl",
414
+ nrows=None,
415
+ **kwargs,
416
+ ) -> "DataChain":
417
+ """Get data from JSON lines. It returns the chain itself.
418
+
419
+ Parameters:
420
+ path : storage URI with directory. URI must start with storage prefix such
421
+ as `s3://`, `gs://`, `az://` or "file:///"
422
+ type : read file as "binary", "text", or "image" data. Default is "binary".
423
+ spec : optional Data Model
424
+ schema_from : path to sample to infer spec (if schema not provided)
425
+ object_name : generated object column name
426
+ model_name : optional generated model name
427
+ show_schema : print auto-generated schema
428
+ jmespath : optional JMESPATH expression to reduce JSON
429
+ nrows : optional row limit for jsonl and JSON arrays
430
+
431
+ Example:
432
+ infer JSONl schema from data, limit parsing to 1 row
433
+ ```py
434
+ chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
435
+ ```
436
+ """
437
+ if schema_from == "auto":
438
+ schema_from = path
439
+
440
+ def jmespath_to_name(s: str):
441
+ name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s) # type: ignore[union-attr]
442
+ return s[:name_end]
443
+
444
+ if (not object_name) and jmespath:
445
+ object_name = jmespath_to_name(jmespath)
446
+ if not object_name:
447
+ object_name = meta_type
448
+ chain = DataChain.from_storage(path=path, type=type, **kwargs)
449
+ signal_dict = {
450
+ object_name: read_meta(
451
+ schema_from=schema_from,
452
+ meta_type=meta_type,
453
+ spec=spec,
454
+ model_name=model_name,
455
+ show_schema=show_schema,
456
+ jmespath=jmespath,
457
+ nrows=nrows,
458
+ )
459
+ }
460
+ return chain.gen(**signal_dict) # type: ignore[misc, arg-type]
401
461
 
402
462
  @classmethod
403
463
  def datasets(
@@ -951,6 +1011,41 @@ class DataChain(DatasetQuery):
951
1011
 
952
1012
  return ds
953
1013
 
1014
+ def subtract( # type: ignore[override]
1015
+ self,
1016
+ other: "DataChain",
1017
+ on: Optional[Union[str, Sequence[str]]] = None,
1018
+ ) -> "Self":
1019
+ """Remove rows that appear in another chain.
1020
+
1021
+ Parameters:
1022
+ other: chain whose rows will be removed from `self`
1023
+ on: columns to consider for determining row equality. If unspecified,
1024
+ defaults to all common columns between `self` and `other`.
1025
+ """
1026
+ if isinstance(on, str):
1027
+ on = [on]
1028
+ if on is None:
1029
+ other_columns = set(other._effective_signals_schema.db_signals())
1030
+ signals = [
1031
+ c
1032
+ for c in self._effective_signals_schema.db_signals()
1033
+ if c in other_columns
1034
+ ]
1035
+ if not signals:
1036
+ raise DataChainParamsError("subtract(): no common columns")
1037
+ elif not isinstance(on, Sequence):
1038
+ raise TypeError(
1039
+ f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
1040
+ )
1041
+ elif not on:
1042
+ raise DataChainParamsError(
1043
+ "'on' cannot be empty",
1044
+ )
1045
+ else:
1046
+ signals = self.signals_schema.resolve(*on).db_signals()
1047
+ return super()._subtract(other, signals)
1048
+
954
1049
  @classmethod
955
1050
  def from_values(
956
1051
  cls,
@@ -12,7 +12,6 @@ from urllib.parse import unquote, urlparse
12
12
  from urllib.request import url2pathname
13
13
 
14
14
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
15
- from fsspec.implementations.local import LocalFileSystem
16
15
  from PIL import Image
17
16
  from pydantic import Field, field_validator
18
17
 
@@ -283,9 +282,8 @@ class File(DataModel):
283
282
  def get_path(self) -> str:
284
283
  """Returns file path."""
285
284
  path = unquote(self.get_uri())
286
- fs = self.get_fs()
287
- if isinstance(fs, LocalFileSystem):
288
- # Drop file:// protocol
285
+ source = urlparse(self.source)
286
+ if source.scheme == "file":
289
287
  path = urlparse(path).path
290
288
  path = url2pathname(path)
291
289
  return path
@@ -300,13 +298,10 @@ class File(DataModel):
300
298
  elif placement == "etag":
301
299
  path = f"{self.etag}{self.get_file_suffix()}"
302
300
  elif placement == "fullpath":
303
- fs = self.get_fs()
304
- if isinstance(fs, LocalFileSystem):
305
- path = unquote(self.get_full_name())
306
- else:
307
- path = (
308
- Path(urlparse(self.source).netloc) / unquote(self.get_full_name())
309
- ).as_posix()
301
+ path = unquote(self.get_full_name())
302
+ source = urlparse(self.source)
303
+ if source.scheme and source.scheme != "file":
304
+ path = posixpath.join(source.netloc, path)
310
305
  elif placement == "checksum":
311
306
  raise NotImplementedError("Checksum placement not implemented yet")
312
307
  else:
@@ -11,9 +11,9 @@ from collections.abc import Iterator
11
11
  from typing import Any, Callable
12
12
 
13
13
  import jmespath as jsp
14
- from pydantic import ValidationError
14
+ from pydantic import Field, ValidationError # noqa: F401
15
15
 
16
- from datachain.lib.data_model import ModelStore # noqa: F401
16
+ from datachain.lib.data_model import DataModel # noqa: F401
17
17
  from datachain.lib.file import File
18
18
 
19
19
 
@@ -87,7 +87,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
87
87
  except subprocess.CalledProcessError as e:
88
88
  model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
89
89
  print(f"{model_output}")
90
- print("\n" + f"ModelStore.register({model_name})" + "\n")
90
+ print("\n" + "from datachain.lib.data_model import DataModel" + "\n")
91
+ print("\n" + f"DataModel.register({model_name})" + "\n")
91
92
  print("\n" + f"spec={model_name}" + "\n")
92
93
  return model_output
93
94
 
@@ -147,7 +148,7 @@ def read_meta( # noqa: C901
147
148
 
148
149
  def parse_data(
149
150
  file: File,
150
- DataModel=spec, # noqa: N803
151
+ data_model=spec,
151
152
  meta_type=meta_type,
152
153
  jmespath=jmespath,
153
154
  nrows=nrows,
@@ -155,7 +156,7 @@ def read_meta( # noqa: C901
155
156
  def validator(json_object: dict) -> spec:
156
157
  json_string = json.dumps(json_object)
157
158
  try:
158
- data_instance = DataModel.model_validate_json(json_string)
159
+ data_instance = data_model.model_validate_json(json_string)
159
160
  yield data_instance
160
161
  except ValidationError as e:
161
162
  print(f"Validation error occurred in file {file.name}:", e)
@@ -25,6 +25,7 @@ from typing import (
25
25
 
26
26
  import attrs
27
27
  import sqlalchemy
28
+ import sqlalchemy as sa
28
29
  from attrs import frozen
29
30
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback, TqdmCallback
30
31
  from sqlalchemy import Column
@@ -250,7 +251,7 @@ class DatasetDiffOperation(Step):
250
251
  self,
251
252
  source_query: Select,
252
253
  target_query: Select,
253
- ) -> Select:
254
+ ) -> sa.Selectable:
254
255
  """
255
256
  Should return select query that calculates desired diff between dataset queries
256
257
  """
@@ -268,7 +269,7 @@ class DatasetDiffOperation(Step):
268
269
 
269
270
  columns = [
270
271
  c if isinstance(c, Column) else Column(c.name, c.type)
271
- for c in source_query.columns
272
+ for c in source_query.selected_columns
272
273
  ]
273
274
  temp_table = self.catalog.warehouse.create_dataset_rows_table(
274
275
  temp_table_name,
@@ -292,23 +293,16 @@ class DatasetDiffOperation(Step):
292
293
 
293
294
  @frozen
294
295
  class Subtract(DatasetDiffOperation):
295
- """
296
- Calculates rows that are in a source query but are not in target query (diff)
297
- This can be used to do delta updates (calculate UDF only on newly added rows)
298
- Example:
299
- >>> ds = DatasetQuery(name="dogs_cats") # some older dataset with embeddings
300
- >>> ds_updated = (
301
- DatasetQuery("gs://dvcx-datalakes/dogs-and-cats")
302
- .filter(C.size > 1000) # we can also filter out source query
303
- .subtract(ds)
304
- .add_signals(calc_embeddings) # calculae embeddings only on new rows
305
- .union(ds) # union with old dataset that's missing new rows
306
- .save("dogs_cats_updated")
307
- )
308
- """
296
+ on: Sequence[str]
309
297
 
310
- def query(self, source_query: Select, target_query: Select) -> Select:
311
- return self.catalog.warehouse.subtract_query(source_query, target_query)
298
+ def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
299
+ sq = source_query.alias("source_query")
300
+ tq = target_query.alias("target_query")
301
+ where_clause = sa.and_(
302
+ getattr(sq.c, col_name).is_not_distinct_from(getattr(tq.c, col_name))
303
+ for col_name in self.on
304
+ ) # type: ignore[arg-type]
305
+ return sq.select().except_(sq.select().where(where_clause))
312
306
 
313
307
 
314
308
  @frozen
@@ -1260,7 +1254,7 @@ class DatasetQuery:
1260
1254
  def as_iterable(self, **kwargs) -> Iterator[ResultIter]:
1261
1255
  try:
1262
1256
  query = self.apply_steps().select()
1263
- selected_columns = [c.name for c in query.columns]
1257
+ selected_columns = [c.name for c in query.selected_columns]
1264
1258
  yield ResultIter(
1265
1259
  self.catalog.warehouse.dataset_rows_select(query, **kwargs),
1266
1260
  selected_columns,
@@ -1564,8 +1558,12 @@ class DatasetQuery:
1564
1558
 
1565
1559
  @detach
1566
1560
  def subtract(self, dq: "DatasetQuery") -> "Self":
1561
+ return self._subtract(dq, on=["source", "parent", "name"])
1562
+
1563
+ @detach
1564
+ def _subtract(self, dq: "DatasetQuery", on: Sequence[str]) -> "Self":
1567
1565
  query = self.clone()
1568
- query.steps.append(Subtract(dq, self.catalog))
1566
+ query.steps.append(Subtract(dq, self.catalog, on=on))
1569
1567
  return query
1570
1568
 
1571
1569
  @detach
@@ -1684,7 +1682,7 @@ class DatasetQuery:
1684
1682
  f.row_number().over(order_by=q._order_by_clauses).label("sys__id")
1685
1683
  )
1686
1684
 
1687
- cols = tuple(c.name for c in q.columns)
1685
+ cols = tuple(c.name for c in q.selected_columns)
1688
1686
  insert_q = sqlalchemy.insert(dr.get_table()).from_select(cols, q)
1689
1687
  self.catalog.warehouse.db.execute(insert_q, **kwargs)
1690
1688
  self.catalog.metastore.update_dataset_status(
@@ -5,8 +5,8 @@ from datetime import MAXYEAR, MINYEAR, datetime, timezone
5
5
  from types import MappingProxyType
6
6
  from typing import Callable, Optional
7
7
 
8
+ import orjson
8
9
  import sqlalchemy as sa
9
- import ujson
10
10
  from sqlalchemy.dialects import sqlite
11
11
  from sqlalchemy.ext.compiler import compiles
12
12
  from sqlalchemy.sql.elements import literal
@@ -149,7 +149,7 @@ def missing_vector_function(name, exc):
149
149
 
150
150
 
151
151
  def sqlite_string_split(string: str, sep: str, maxsplit: int = -1) -> str:
152
- return ujson.dumps(string.split(sep, maxsplit))
152
+ return orjson.dumps(string.split(sep, maxsplit)).decode("utf-8")
153
153
 
154
154
 
155
155
  def register_user_defined_sql_functions() -> None:
@@ -274,7 +274,7 @@ def compile_euclidean_distance(element, compiler, **kwargs):
274
274
 
275
275
 
276
276
  def py_json_array_length(arr):
277
- return len(ujson.loads(arr))
277
+ return len(orjson.loads(arr))
278
278
 
279
279
 
280
280
  def compile_array_length(element, compiler, **kwargs):
@@ -1,7 +1,6 @@
1
- import json
2
1
  import sqlite3
3
2
 
4
- import ujson
3
+ import orjson
5
4
  from sqlalchemy import types
6
5
 
7
6
  from datachain.sql.types import TypeConverter, TypeReadConverter
@@ -29,22 +28,15 @@ class Array(types.UserDefinedType):
29
28
 
30
29
 
31
30
  def adapt_array(arr):
32
- return ujson.dumps(arr)
31
+ return orjson.dumps(arr).decode("utf-8")
33
32
 
34
33
 
35
34
  def convert_array(arr):
36
- return ujson.loads(arr)
35
+ return orjson.loads(arr)
37
36
 
38
37
 
39
38
  def adapt_np_array(arr):
40
- def _json_serialize(obj):
41
- if isinstance(obj, np.ndarray):
42
- return obj.tolist()
43
- return obj
44
-
45
- if np.issubdtype(arr.dtype, np.object_):
46
- return json.dumps(arr.tolist(), default=_json_serialize)
47
- return ujson.dumps(arr.tolist())
39
+ return orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY).decode("utf-8")
48
40
 
49
41
 
50
42
  def adapt_np_generic(val):
@@ -70,5 +62,5 @@ class SQLiteTypeConverter(TypeConverter):
70
62
  class SQLiteTypeReadConverter(TypeReadConverter):
71
63
  def array(self, value, item_type, dialect):
72
64
  if isinstance(value, str):
73
- value = ujson.loads(value)
65
+ value = orjson.loads(value)
74
66
  return super().array(value, item_type, dialect)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.13
3
+ Version: 0.2.14
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -36,7 +36,7 @@ Requires-Dist: sqlalchemy>=2
36
36
  Requires-Dist: multiprocess==0.70.16
37
37
  Requires-Dist: dill==0.3.8
38
38
  Requires-Dist: cloudpickle
39
- Requires-Dist: ujson>=5.9.0
39
+ Requires-Dist: orjson>=3.10.5
40
40
  Requires-Dist: pydantic<3,>=2
41
41
  Requires-Dist: jmespath>=1.0
42
42
  Requires-Dist: datamodel-code-generator>=0.25
@@ -81,7 +81,6 @@ Requires-Dist: types-python-dateutil; extra == "dev"
81
81
  Requires-Dist: types-pytz; extra == "dev"
82
82
  Requires-Dist: types-PyYAML; extra == "dev"
83
83
  Requires-Dist: types-requests; extra == "dev"
84
- Requires-Dist: types-ujson; extra == "dev"
85
84
 
86
85
  |PyPI| |Python Version| |Codecov| |Tests|
87
86
 
@@ -17,7 +17,7 @@ sqlalchemy>=2
17
17
  multiprocess==0.70.16
18
18
  dill==0.3.8
19
19
  cloudpickle
20
- ujson>=5.9.0
20
+ orjson>=3.10.5
21
21
  pydantic<3,>=2
22
22
  jmespath>=1.0
23
23
  datamodel-code-generator>=0.25
@@ -33,7 +33,6 @@ types-python-dateutil
33
33
  types-pytz
34
34
  types-PyYAML
35
35
  types-requests
36
- types-ujson
37
36
 
38
37
  [docs]
39
38
  mkdocs>=1.5.2
@@ -90,7 +90,7 @@ def test_wds(catalog, webdataset_tars):
90
90
  assert laion_wds.file.parent
91
91
  assert laion_wds.file.name == f"{idx}.jpg"
92
92
  assert laion_wds.file.location
93
- assert laion_wds.json.dict() == Laion(**data).dict()
93
+ assert laion_wds.json.model_dump() == Laion(**data).model_dump()
94
94
 
95
95
  assert num_rows == len(WDS_TAR_SHARDS)
96
96
 
@@ -17,12 +17,8 @@ from tests.utils import images_equal
17
17
 
18
18
 
19
19
  @pytest.mark.parametrize("anon", [True, False])
20
- def test_catalog_anon(catalog, anon):
21
- chain = (
22
- DataChain.from_storage("gs://dvcx-datalakes/dogs-and-cats/", anon=anon)
23
- .limit(5)
24
- .save("test_catalog_anon")
25
- )
20
+ def test_catalog_anon(tmp_dir, catalog, anon):
21
+ chain = DataChain.from_storage(tmp_dir.as_uri(), anon=anon)
26
22
  assert chain.catalog.client_config.get("anon", False) is anon
27
23
 
28
24
 
@@ -210,15 +210,16 @@ def test_create_dataset_from_sources_failed(listed_bucket, cloud_test_catalog, m
210
210
  dataset_name = uuid.uuid4().hex
211
211
  src_uri = cloud_test_catalog.src_uri
212
212
  catalog = cloud_test_catalog.catalog
213
- with mocker.patch.object(
213
+ # Mocks are automatically undone at the end of a test.
214
+ mocker.patch.object(
214
215
  catalog.warehouse.__class__,
215
216
  "create_dataset_rows_table",
216
217
  side_effect=RuntimeError("Error"),
217
- ) as _:
218
- with pytest.raises(RuntimeError):
219
- catalog.create_dataset_from_sources(
220
- dataset_name, [f"{src_uri}/dogs/*"], recursive=True
221
- )
218
+ )
219
+ with pytest.raises(RuntimeError):
220
+ catalog.create_dataset_from_sources(
221
+ dataset_name, [f"{src_uri}/dogs/*"], recursive=True
222
+ )
222
223
 
223
224
  dataset = catalog.get_dataset(dataset_name)
224
225
  dataset_version = dataset.get_version(dataset.latest_version)