datachain 0.4.0__tar.gz → 0.5.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (249)
  1. {datachain-0.4.0/src/datachain.egg-info → datachain-0.5.0}/PKG-INFO +1 -1
  2. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/cli.py +3 -2
  3. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/metastore.py +8 -8
  4. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/warehouse.py +1 -3
  5. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/dataset.py +0 -3
  6. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/dc.py +197 -113
  7. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/listing.py +5 -3
  8. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/pytorch.py +5 -1
  9. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/dataset.py +1 -1
  10. {datachain-0.4.0 → datachain-0.5.0/src/datachain.egg-info}/PKG-INFO +1 -1
  11. {datachain-0.4.0 → datachain-0.5.0}/tests/conftest.py +0 -1
  12. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_catalog.py +5 -2
  13. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_datachain.py +4 -4
  14. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_pull.py +0 -1
  15. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_datachain.py +21 -25
  16. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_datachain_merge.py +1 -1
  17. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_warehouse.py +0 -2
  18. {datachain-0.4.0 → datachain-0.5.0}/.cruft.json +0 -0
  19. {datachain-0.4.0 → datachain-0.5.0}/.gitattributes +0 -0
  20. {datachain-0.4.0 → datachain-0.5.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  21. {datachain-0.4.0 → datachain-0.5.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  22. {datachain-0.4.0 → datachain-0.5.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  23. {datachain-0.4.0 → datachain-0.5.0}/.github/codecov.yaml +0 -0
  24. {datachain-0.4.0 → datachain-0.5.0}/.github/dependabot.yml +0 -0
  25. {datachain-0.4.0 → datachain-0.5.0}/.github/workflows/benchmarks.yml +0 -0
  26. {datachain-0.4.0 → datachain-0.5.0}/.github/workflows/release.yml +0 -0
  27. {datachain-0.4.0 → datachain-0.5.0}/.github/workflows/tests-studio.yml +0 -0
  28. {datachain-0.4.0 → datachain-0.5.0}/.github/workflows/tests.yml +0 -0
  29. {datachain-0.4.0 → datachain-0.5.0}/.github/workflows/update-template.yaml +0 -0
  30. {datachain-0.4.0 → datachain-0.5.0}/.gitignore +0 -0
  31. {datachain-0.4.0 → datachain-0.5.0}/.pre-commit-config.yaml +0 -0
  32. {datachain-0.4.0 → datachain-0.5.0}/CODE_OF_CONDUCT.rst +0 -0
  33. {datachain-0.4.0 → datachain-0.5.0}/CONTRIBUTING.rst +0 -0
  34. {datachain-0.4.0 → datachain-0.5.0}/LICENSE +0 -0
  35. {datachain-0.4.0 → datachain-0.5.0}/README.rst +0 -0
  36. {datachain-0.4.0 → datachain-0.5.0}/docs/assets/captioned_cartoons.png +0 -0
  37. {datachain-0.4.0 → datachain-0.5.0}/docs/assets/datachain-white.svg +0 -0
  38. {datachain-0.4.0 → datachain-0.5.0}/docs/assets/datachain.svg +0 -0
  39. {datachain-0.4.0 → datachain-0.5.0}/docs/assets/flowchart.png +0 -0
  40. {datachain-0.4.0 → datachain-0.5.0}/docs/index.md +0 -0
  41. {datachain-0.4.0 → datachain-0.5.0}/docs/references/datachain.md +0 -0
  42. {datachain-0.4.0 → datachain-0.5.0}/docs/references/datatype.md +0 -0
  43. {datachain-0.4.0 → datachain-0.5.0}/docs/references/file.md +0 -0
  44. {datachain-0.4.0 → datachain-0.5.0}/docs/references/index.md +0 -0
  45. {datachain-0.4.0 → datachain-0.5.0}/docs/references/sql.md +0 -0
  46. {datachain-0.4.0 → datachain-0.5.0}/docs/references/torch.md +0 -0
  47. {datachain-0.4.0 → datachain-0.5.0}/docs/references/udf.md +0 -0
  48. {datachain-0.4.0 → datachain-0.5.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  49. {datachain-0.4.0 → datachain-0.5.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  50. {datachain-0.4.0 → datachain-0.5.0}/examples/computer_vision/openimage-detect.py +0 -0
  51. {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/common_sql_functions.py +0 -0
  52. {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/json-csv-reader.py +0 -0
  53. {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/torch-loader.py +0 -0
  54. {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/udfs/parallel.py +0 -0
  55. {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/udfs/simple.py +0 -0
  56. {datachain-0.4.0 → datachain-0.5.0}/examples/get_started/udfs/stateful.py +0 -0
  57. {datachain-0.4.0 → datachain-0.5.0}/examples/llm_and_nlp/claude-query.py +0 -0
  58. {datachain-0.4.0 → datachain-0.5.0}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  59. {datachain-0.4.0 → datachain-0.5.0}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  60. {datachain-0.4.0 → datachain-0.5.0}/examples/multimodal/clip_inference.py +0 -0
  61. {datachain-0.4.0 → datachain-0.5.0}/examples/multimodal/hf_pipeline.py +0 -0
  62. {datachain-0.4.0 → datachain-0.5.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  63. {datachain-0.4.0 → datachain-0.5.0}/examples/multimodal/wds.py +0 -0
  64. {datachain-0.4.0 → datachain-0.5.0}/examples/multimodal/wds_filtered.py +0 -0
  65. {datachain-0.4.0 → datachain-0.5.0}/mkdocs.yml +0 -0
  66. {datachain-0.4.0 → datachain-0.5.0}/noxfile.py +0 -0
  67. {datachain-0.4.0 → datachain-0.5.0}/overrides/main.html +0 -0
  68. {datachain-0.4.0 → datachain-0.5.0}/pyproject.toml +0 -0
  69. {datachain-0.4.0 → datachain-0.5.0}/setup.cfg +0 -0
  70. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/__init__.py +0 -0
  71. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/__main__.py +0 -0
  72. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/asyn.py +0 -0
  73. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/cache.py +0 -0
  74. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/catalog/__init__.py +0 -0
  75. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/catalog/catalog.py +0 -0
  76. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/catalog/datasource.py +0 -0
  77. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/catalog/loader.py +0 -0
  78. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/cli_utils.py +0 -0
  79. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/__init__.py +0 -0
  80. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/azure.py +0 -0
  81. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/fileslice.py +0 -0
  82. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/fsspec.py +0 -0
  83. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/gcs.py +0 -0
  84. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/hf.py +0 -0
  85. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/local.py +0 -0
  86. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/client/s3.py +0 -0
  87. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/config.py +0 -0
  88. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/__init__.py +0 -0
  89. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/db_engine.py +0 -0
  90. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/id_generator.py +0 -0
  91. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/job.py +0 -0
  92. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/schema.py +0 -0
  93. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/serializer.py +0 -0
  94. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/data_storage/sqlite.py +0 -0
  95. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/error.py +0 -0
  96. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/job.py +0 -0
  97. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/__init__.py +0 -0
  98. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/arrow.py +0 -0
  99. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/clip.py +0 -0
  100. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/__init__.py +0 -0
  101. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/flatten.py +0 -0
  102. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  103. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  104. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/unflatten.py +0 -0
  105. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  106. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/data_model.py +0 -0
  107. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/dataset_info.py +0 -0
  108. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/file.py +0 -0
  109. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/hf.py +0 -0
  110. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/image.py +0 -0
  111. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/listing_info.py +0 -0
  112. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/meta_formats.py +0 -0
  113. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/model_store.py +0 -0
  114. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/settings.py +0 -0
  115. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/signal_schema.py +0 -0
  116. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/tar.py +0 -0
  117. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/text.py +0 -0
  118. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/udf.py +0 -0
  119. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/udf_signature.py +0 -0
  120. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/utils.py +0 -0
  121. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/vfile.py +0 -0
  122. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/webdataset.py +0 -0
  123. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/lib/webdataset_laion.py +0 -0
  124. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/listing.py +0 -0
  125. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/node.py +0 -0
  126. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/nodes_fetcher.py +0 -0
  127. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/nodes_thread_pool.py +0 -0
  128. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/progress.py +0 -0
  129. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/py.typed +0 -0
  130. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/__init__.py +0 -0
  131. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/batch.py +0 -0
  132. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/dispatch.py +0 -0
  133. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/metrics.py +0 -0
  134. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/params.py +0 -0
  135. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/queue.py +0 -0
  136. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/schema.py +0 -0
  137. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/session.py +0 -0
  138. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/query/udf.py +0 -0
  139. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/remote/__init__.py +0 -0
  140. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/remote/studio.py +0 -0
  141. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/__init__.py +0 -0
  142. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/default/__init__.py +0 -0
  143. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/default/base.py +0 -0
  144. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/__init__.py +0 -0
  145. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/array.py +0 -0
  146. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/conditional.py +0 -0
  147. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/path.py +0 -0
  148. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/random.py +0 -0
  149. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/functions/string.py +0 -0
  150. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/selectable.py +0 -0
  151. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  152. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/sqlite/base.py +0 -0
  153. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/sqlite/types.py +0 -0
  154. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/sqlite/vector.py +0 -0
  155. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/types.py +0 -0
  156. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/sql/utils.py +0 -0
  157. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/storage.py +0 -0
  158. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/telemetry.py +0 -0
  159. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/torch/__init__.py +0 -0
  160. {datachain-0.4.0 → datachain-0.5.0}/src/datachain/utils.py +0 -0
  161. {datachain-0.4.0 → datachain-0.5.0}/src/datachain.egg-info/SOURCES.txt +0 -0
  162. {datachain-0.4.0 → datachain-0.5.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  163. {datachain-0.4.0 → datachain-0.5.0}/src/datachain.egg-info/entry_points.txt +0 -0
  164. {datachain-0.4.0 → datachain-0.5.0}/src/datachain.egg-info/requires.txt +0 -0
  165. {datachain-0.4.0 → datachain-0.5.0}/src/datachain.egg-info/top_level.txt +0 -0
  166. {datachain-0.4.0 → datachain-0.5.0}/tests/__init__.py +0 -0
  167. {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/__init__.py +0 -0
  168. {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/conftest.py +0 -0
  169. {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  170. {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  171. {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/datasets/.gitignore +0 -0
  172. {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  173. {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/test_datachain.py +0 -0
  174. {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/test_ls.py +0 -0
  175. {datachain-0.4.0 → datachain-0.5.0}/tests/benchmarks/test_version.py +0 -0
  176. {datachain-0.4.0 → datachain-0.5.0}/tests/data.py +0 -0
  177. {datachain-0.4.0 → datachain-0.5.0}/tests/examples/__init__.py +0 -0
  178. {datachain-0.4.0 → datachain-0.5.0}/tests/examples/test_examples.py +0 -0
  179. {datachain-0.4.0 → datachain-0.5.0}/tests/examples/test_wds_e2e.py +0 -0
  180. {datachain-0.4.0 → datachain-0.5.0}/tests/examples/wds_data.py +0 -0
  181. {datachain-0.4.0 → datachain-0.5.0}/tests/func/__init__.py +0 -0
  182. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_client.py +0 -0
  183. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_dataset_query.py +0 -0
  184. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_datasets.py +0 -0
  185. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_feature_pickling.py +0 -0
  186. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_listing.py +0 -0
  187. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_ls.py +0 -0
  188. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_meta_formats.py +0 -0
  189. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_metrics.py +0 -0
  190. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_pytorch.py +0 -0
  191. {datachain-0.4.0 → datachain-0.5.0}/tests/func/test_query.py +0 -0
  192. {datachain-0.4.0 → datachain-0.5.0}/tests/scripts/feature_class.py +0 -0
  193. {datachain-0.4.0 → datachain-0.5.0}/tests/scripts/feature_class_parallel.py +0 -0
  194. {datachain-0.4.0 → datachain-0.5.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  195. {datachain-0.4.0 → datachain-0.5.0}/tests/scripts/name_len_slow.py +0 -0
  196. {datachain-0.4.0 → datachain-0.5.0}/tests/test_cli_e2e.py +0 -0
  197. {datachain-0.4.0 → datachain-0.5.0}/tests/test_query_e2e.py +0 -0
  198. {datachain-0.4.0 → datachain-0.5.0}/tests/test_telemetry.py +0 -0
  199. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/__init__.py +0 -0
  200. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/__init__.py +0 -0
  201. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/conftest.py +0 -0
  202. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_arrow.py +0 -0
  203. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_clip.py +0 -0
  204. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  205. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_feature.py +0 -0
  206. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_feature_utils.py +0 -0
  207. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_file.py +0 -0
  208. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_hf.py +0 -0
  209. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_image.py +0 -0
  210. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_schema.py +0 -0
  211. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_signal_schema.py +0 -0
  212. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  213. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_text.py +0 -0
  214. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_udf_signature.py +0 -0
  215. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_utils.py +0 -0
  216. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/lib/test_webdataset.py +0 -0
  217. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/__init__.py +0 -0
  218. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  219. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  220. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_array.py +0 -0
  221. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_conditional.py +0 -0
  222. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_path.py +0 -0
  223. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_random.py +0 -0
  224. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_selectable.py +0 -0
  225. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/sql/test_string.py +0 -0
  226. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_asyn.py +0 -0
  227. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_cache.py +0 -0
  228. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_catalog.py +0 -0
  229. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_catalog_loader.py +0 -0
  230. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_cli_parsing.py +0 -0
  231. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_client.py +0 -0
  232. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_client_s3.py +0 -0
  233. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_data_storage.py +0 -0
  234. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_database_engine.py +0 -0
  235. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_dataset.py +0 -0
  236. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_dispatch.py +0 -0
  237. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_fileslice.py +0 -0
  238. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_id_generator.py +0 -0
  239. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_listing.py +0 -0
  240. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_metastore.py +0 -0
  241. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_module_exports.py +0 -0
  242. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_query.py +0 -0
  243. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_query_metrics.py +0 -0
  244. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_query_params.py +0 -0
  245. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_serializer.py +0 -0
  246. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_session.py +0 -0
  247. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_storage.py +0 -0
  248. {datachain-0.4.0 → datachain-0.5.0}/tests/unit/test_utils.py +0 -0
  249. {datachain-0.4.0 → datachain-0.5.0}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING, Optional, Union
12
12
 
13
13
  import shtab
14
14
 
15
- from datachain import utils
15
+ from datachain import Session, utils
16
16
  from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
17
17
  from datachain.lib.dc import DataChain
18
18
  from datachain.telemetry import telemetry
@@ -770,7 +770,8 @@ def show(
770
770
  show_records(records, collapse_columns=not no_collapse)
771
771
  if schema and dataset_version.feature_schema:
772
772
  print("\nSchema:")
773
- dc = DataChain(name=name, version=version, catalog=catalog)
773
+ session = Session.get(catalog=catalog)
774
+ dc = DataChain.from_dataset(name=name, version=version, session=session)
774
775
  dc.print_schema()
775
776
 
776
777
 
@@ -15,7 +15,6 @@ from uuid import uuid4
15
15
  from sqlalchemy import (
16
16
  JSON,
17
17
  BigInteger,
18
- Boolean,
19
18
  Column,
20
19
  DateTime,
21
20
  ForeignKey,
@@ -228,7 +227,7 @@ class AbstractMetastore(ABC, Serializable):
228
227
  self,
229
228
  dataset: DatasetRecord,
230
229
  version: int,
231
- status: int = DatasetStatus.CREATED,
230
+ status: int,
232
231
  sources: str = "",
233
232
  feature_schema: Optional[dict] = None,
234
233
  query_script: str = "",
@@ -448,7 +447,6 @@ class AbstractDBMetastore(AbstractMetastore):
448
447
  Column("name", Text, nullable=False),
449
448
  Column("description", Text),
450
449
  Column("labels", JSON, nullable=True),
451
- Column("shadow", Boolean, nullable=False),
452
450
  Column("status", Integer, nullable=False),
453
451
  Column("feature_schema", JSON, nullable=True),
454
452
  Column("created_at", DateTime(timezone=True)),
@@ -481,8 +479,11 @@ class AbstractDBMetastore(AbstractMetastore):
481
479
  nullable=False,
482
480
  ),
483
481
  Column("version", Integer, nullable=False),
484
- # adding default for now until we fully remove shadow datasets
485
- Column("status", Integer, nullable=False, default=DatasetStatus.COMPLETE),
482
+ Column(
483
+ "status",
484
+ Integer,
485
+ nullable=False,
486
+ ),
486
487
  Column("feature_schema", JSON, nullable=True),
487
488
  Column("created_at", DateTime(timezone=True)),
488
489
  Column("finished_at", DateTime(timezone=True)),
@@ -969,7 +970,6 @@ class AbstractDBMetastore(AbstractMetastore):
969
970
  # TODO abstract this method and add registered = True based on kwargs
970
971
  query = self._datasets_insert().values(
971
972
  name=name,
972
- shadow=False,
973
973
  status=status,
974
974
  feature_schema=json.dumps(feature_schema or {}),
975
975
  created_at=datetime.now(timezone.utc),
@@ -992,7 +992,7 @@ class AbstractDBMetastore(AbstractMetastore):
992
992
  self,
993
993
  dataset: DatasetRecord,
994
994
  version: int,
995
- status: int = DatasetStatus.CREATED,
995
+ status: int,
996
996
  sources: str = "",
997
997
  feature_schema: Optional[dict] = None,
998
998
  query_script: str = "",
@@ -1018,7 +1018,7 @@ class AbstractDBMetastore(AbstractMetastore):
1018
1018
  query = self._datasets_versions_insert().values(
1019
1019
  dataset_id=dataset.id,
1020
1020
  version=version,
1021
- status=status, # for now until we remove shadow datasets
1021
+ status=status,
1022
1022
  feature_schema=json.dumps(feature_schema or {}),
1023
1023
  created_at=created_at or datetime.now(timezone.utc),
1024
1024
  finished_at=finished_at,
@@ -919,9 +919,7 @@ class AbstractWarehouse(ABC, Serializable):
919
919
  def is_temp_table_name(self, name: str) -> bool:
920
920
  """Returns if the given table name refers to a temporary
921
921
  or no longer needed table."""
922
- return name.startswith(
923
- (self.TMP_TABLE_NAME_PREFIX, self.UDF_TABLE_NAME_PREFIX, "ds_shadow_")
924
- ) or name.endswith("_shadow")
922
+ return name.startswith((self.TMP_TABLE_NAME_PREFIX, self.UDF_TABLE_NAME_PREFIX))
925
923
 
926
924
  def get_temp_table_names(self) -> list[str]:
927
925
  return [
@@ -267,7 +267,6 @@ class DatasetRecord:
267
267
  name: str
268
268
  description: Optional[str]
269
269
  labels: list[str]
270
- shadow: bool
271
270
  schema: dict[str, Union[SQLType, type[SQLType]]]
272
271
  feature_schema: dict
273
272
  versions: list[DatasetVersion]
@@ -296,7 +295,6 @@ class DatasetRecord:
296
295
  name: str,
297
296
  description: Optional[str],
298
297
  labels: str,
299
- shadow: int,
300
298
  status: int,
301
299
  feature_schema: Optional[str],
302
300
  created_at: datetime,
@@ -356,7 +354,6 @@ class DatasetRecord:
356
354
  name,
357
355
  description,
358
356
  labels_lst,
359
- bool(shadow),
360
357
  cls.parse_schema(schema_dct), # type: ignore[arg-type]
361
358
  json.loads(feature_schema) if feature_schema else {},
362
359
  [dataset_version],
@@ -54,7 +54,6 @@ from datachain.query import Session
54
54
  from datachain.query.dataset import (
55
55
  DatasetQuery,
56
56
  PartitionByType,
57
- detach,
58
57
  )
59
58
  from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
60
59
  from datachain.sql.functions import path as pathfunc
@@ -159,7 +158,7 @@ class Sys(DataModel):
159
158
  rand: int
160
159
 
161
160
 
162
- class DataChain(DatasetQuery):
161
+ class DataChain:
163
162
  """DataChain - a data structure for batch data processing and evaluation.
164
163
 
165
164
  It represents a sequence of data manipulation steps such as reading data from
@@ -238,33 +237,20 @@ class DataChain(DatasetQuery):
238
237
  "size": 0,
239
238
  }
240
239
 
241
- def __init__(self, *args, settings: Optional[dict] = None, **kwargs):
242
- """This method needs to be redefined as a part of Dataset and DataChain
243
- decoupling.
244
- """
245
- super().__init__( # type: ignore[misc]
246
- *args,
247
- **kwargs,
248
- indexing_column_types=File._datachain_column_types,
249
- )
250
-
251
- telemetry.send_event_once("class", "datachain_init", **kwargs)
252
-
253
- if settings:
254
- self._settings = Settings(**settings)
255
- else:
256
- self._settings = Settings()
257
- self._setup: dict = {}
258
-
259
- self.signals_schema = SignalSchema({"sys": Sys})
260
- if self.feature_schema:
261
- self.signals_schema |= SignalSchema.deserialize(self.feature_schema)
262
- else:
263
- self.signals_schema |= SignalSchema.from_column_types(
264
- self.column_types or {}
265
- )
266
-
267
- self._sys = False
240
+ def __init__(
241
+ self,
242
+ query: DatasetQuery,
243
+ settings: Settings,
244
+ signal_schema: SignalSchema,
245
+ setup: Optional[dict] = None,
246
+ _sys: bool = False,
247
+ ) -> None:
248
+ """Don't instantiate this directly, use one of the from_XXX constructors."""
249
+ self._query = query
250
+ self._settings = settings
251
+ self.signals_schema = signal_schema
252
+ self._setup: dict = setup or {}
253
+ self._sys = _sys
268
254
 
269
255
  @property
270
256
  def schema(self) -> dict[str, DataType]:
@@ -290,18 +276,55 @@ class DataChain(DatasetQuery):
290
276
  def c(self, column: Union[str, Column]) -> Column:
291
277
  """Returns Column instance attached to the current chain."""
292
278
  c = self.column(column) if isinstance(column, str) else self.column(column.name)
293
- c.table = self.table
279
+ c.table = self._query.table
294
280
  return c
295
281
 
282
+ @property
283
+ def session(self) -> Session:
284
+ """Session of the chain."""
285
+ return self._query.session
286
+
287
+ @property
288
+ def name(self) -> Optional[str]:
289
+ """Name of the underlying dataset, if there is one."""
290
+ return self._query.name
291
+
292
+ @property
293
+ def version(self) -> Optional[int]:
294
+ """Version of the underlying dataset, if there is one."""
295
+ return self._query.version
296
+
297
+ def __or__(self, other: "Self") -> "Self":
298
+ """Return `self.union(other)`."""
299
+ return self.union(other)
300
+
296
301
  def print_schema(self) -> None:
297
302
  """Print schema of the chain."""
298
303
  self._effective_signals_schema.print_tree()
299
304
 
300
- def clone(self, new_table: bool = True) -> "Self":
305
+ def clone(self) -> "Self":
301
306
  """Make a copy of the chain in a new table."""
302
- obj = super().clone(new_table=new_table)
303
- obj.signals_schema = copy.deepcopy(self.signals_schema)
304
- return obj
307
+ return self._evolve(query=self._query.clone(new_table=True))
308
+
309
+ def _evolve(
310
+ self,
311
+ *,
312
+ query: Optional[DatasetQuery] = None,
313
+ settings: Optional[Settings] = None,
314
+ signal_schema=None,
315
+ _sys=None,
316
+ ) -> "Self":
317
+ if query is None:
318
+ query = self._query.clone(new_table=False)
319
+ if settings is None:
320
+ settings = self._settings
321
+ if signal_schema is None:
322
+ signal_schema = copy.deepcopy(self.signals_schema)
323
+ if _sys is None:
324
+ _sys = self._sys
325
+ return type(self)(
326
+ query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys
327
+ )
305
328
 
306
329
  def settings(
307
330
  self,
@@ -332,11 +355,11 @@ class DataChain(DatasetQuery):
332
355
  )
333
356
  ```
334
357
  """
335
- chain = self.clone()
336
- if sys is not None:
337
- chain._sys = sys
338
- chain._settings.add(Settings(cache, parallel, workers, min_task_size))
339
- return chain
358
+ if sys is None:
359
+ sys = self._sys
360
+ settings = copy.copy(self._settings)
361
+ settings.add(Settings(cache, parallel, workers, min_task_size))
362
+ return self._evolve(settings=settings, _sys=sys)
340
363
 
341
364
  def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
342
365
  """Reset all settings to default values."""
@@ -434,7 +457,7 @@ class DataChain(DatasetQuery):
434
457
  version: Optional[int] = None,
435
458
  session: Optional[Session] = None,
436
459
  settings: Optional[dict] = None,
437
- ) -> "DataChain":
460
+ ) -> "Self":
438
461
  """Get data from a saved Dataset. It returns the chain itself.
439
462
 
440
463
  Parameters:
@@ -446,7 +469,24 @@ class DataChain(DatasetQuery):
446
469
  chain = DataChain.from_dataset("my_cats")
447
470
  ```
448
471
  """
449
- return DataChain(name=name, version=version, session=session, settings=settings)
472
+ query = DatasetQuery(
473
+ name=name,
474
+ version=version,
475
+ session=session,
476
+ indexing_column_types=File._datachain_column_types,
477
+ )
478
+ telemetry.send_event_once("class", "datachain_init", name=name, version=version)
479
+ if settings:
480
+ _settings = Settings(**settings)
481
+ else:
482
+ _settings = Settings()
483
+
484
+ signals_schema = SignalSchema({"sys": Sys})
485
+ if query.feature_schema:
486
+ signals_schema |= SignalSchema.deserialize(query.feature_schema)
487
+ else:
488
+ signals_schema |= SignalSchema.from_column_types(query.column_types or {})
489
+ return cls(query, _settings, signals_schema)
450
490
 
451
491
  @classmethod
452
492
  def from_json(
@@ -699,7 +739,11 @@ class DataChain(DatasetQuery):
699
739
  version : version of a dataset. Default - the last version that exist.
700
740
  """
701
741
  schema = self.signals_schema.clone_without_sys_signals().serialize()
702
- return super().save(name=name, version=version, feature_schema=schema, **kwargs)
742
+ return self._evolve(
743
+ query=self._query.save(
744
+ name=name, version=version, feature_schema=schema, **kwargs
745
+ )
746
+ )
703
747
 
704
748
  def apply(self, func, *args, **kwargs):
705
749
  """Apply any function to the chain.
@@ -765,13 +809,14 @@ class DataChain(DatasetQuery):
765
809
  """
766
810
  udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)
767
811
 
768
- chain = self.add_signals(
769
- udf_obj.to_udf_wrapper(),
770
- **self._settings.to_dict(),
812
+ return self._evolve(
813
+ query=self._query.add_signals(
814
+ udf_obj.to_udf_wrapper(),
815
+ **self._settings.to_dict(),
816
+ ),
817
+ signal_schema=self.signals_schema | udf_obj.output,
771
818
  )
772
819
 
773
- return chain.add_schema(udf_obj.output).reset_settings(self._settings)
774
-
775
820
  def gen(
776
821
  self,
777
822
  func: Optional[Callable] = None,
@@ -800,14 +845,14 @@ class DataChain(DatasetQuery):
800
845
  ```
801
846
  """
802
847
  udf_obj = self._udf_to_obj(Generator, func, params, output, signal_map)
803
- chain = DatasetQuery.generate(
804
- self,
805
- udf_obj.to_udf_wrapper(),
806
- **self._settings.to_dict(),
848
+ return self._evolve(
849
+ query=self._query.generate(
850
+ udf_obj.to_udf_wrapper(),
851
+ **self._settings.to_dict(),
852
+ ),
853
+ signal_schema=udf_obj.output,
807
854
  )
808
855
 
809
- return chain.reset_schema(udf_obj.output).reset_settings(self._settings)
810
-
811
856
  def agg(
812
857
  self,
813
858
  func: Optional[Callable] = None,
@@ -840,15 +885,15 @@ class DataChain(DatasetQuery):
840
885
  ```
841
886
  """
842
887
  udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
843
- chain = DatasetQuery.generate(
844
- self,
845
- udf_obj.to_udf_wrapper(),
846
- partition_by=partition_by,
847
- **self._settings.to_dict(),
888
+ return self._evolve(
889
+ query=self._query.generate(
890
+ udf_obj.to_udf_wrapper(),
891
+ partition_by=partition_by,
892
+ **self._settings.to_dict(),
893
+ ),
894
+ signal_schema=udf_obj.output,
848
895
  )
849
896
 
850
- return chain.reset_schema(udf_obj.output).reset_settings(self._settings)
851
-
852
897
  def batch_map(
853
898
  self,
854
899
  func: Optional[Callable] = None,
@@ -876,14 +921,14 @@ class DataChain(DatasetQuery):
876
921
  ```
877
922
  """
878
923
  udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
879
- chain = DatasetQuery.add_signals(
880
- self,
881
- udf_obj.to_udf_wrapper(batch),
882
- **self._settings.to_dict(),
924
+ return self._evolve(
925
+ query=self._query.add_signals(
926
+ udf_obj.to_udf_wrapper(batch),
927
+ **self._settings.to_dict(),
928
+ ),
929
+ signal_schema=self.signals_schema | udf_obj.output,
883
930
  )
884
931
 
885
- return chain.add_schema(udf_obj.output).reset_settings(self._settings)
886
-
887
932
  def _udf_to_obj(
888
933
  self,
889
934
  target_class: type[UDFBase],
@@ -907,17 +952,12 @@ class DataChain(DatasetQuery):
907
952
  return target_class._create(sign, params_schema)
908
953
 
909
954
  def _extend_to_data_model(self, method_name, *args, **kwargs):
910
- super_func = getattr(super(), method_name)
955
+ query_func = getattr(self._query, method_name)
911
956
 
912
957
  new_schema = self.signals_schema.resolve(*args)
913
958
  columns = [C(col) for col in new_schema.db_signals()]
914
- res = super_func(*columns, **kwargs)
915
- if isinstance(res, DataChain):
916
- res.signals_schema = new_schema
917
-
918
- return res
959
+ return query_func(*columns, **kwargs)
919
960
 
920
- @detach
921
961
  @resolve_columns
922
962
  def order_by(self, *args, descending: bool = False) -> "Self":
923
963
  """Orders by specified set of signals.
@@ -928,9 +968,8 @@ class DataChain(DatasetQuery):
928
968
  if descending:
929
969
  args = tuple(sqlalchemy.desc(a) for a in args)
930
970
 
931
- return super().order_by(*args)
971
+ return self._evolve(query=self._query.order_by(*args))
932
972
 
933
- @detach
934
973
  def distinct(self, arg: str, *args: str) -> "Self": # type: ignore[override]
935
974
  """Removes duplicate rows based on uniqueness of some input column(s)
936
975
  i.e if rows are found with the same value of input column(s), only one
@@ -942,29 +981,30 @@ class DataChain(DatasetQuery):
942
981
  )
943
982
  ```
944
983
  """
945
- return super().distinct(*self.signals_schema.resolve(arg, *args).db_signals())
984
+ return self._evolve(
985
+ query=self._query.distinct(
986
+ *self.signals_schema.resolve(arg, *args).db_signals()
987
+ )
988
+ )
946
989
 
947
- @detach
948
990
  def select(self, *args: str, _sys: bool = True) -> "Self":
949
991
  """Select only a specified set of signals."""
950
992
  new_schema = self.signals_schema.resolve(*args)
951
993
  if _sys:
952
994
  new_schema = SignalSchema({"sys": Sys}) | new_schema
953
995
  columns = new_schema.db_signals()
954
- chain = super().select(*columns)
955
- chain.signals_schema = new_schema
956
- return chain
996
+ return self._evolve(
997
+ query=self._query.select(*columns), signal_schema=new_schema
998
+ )
957
999
 
958
- @detach
959
1000
  def select_except(self, *args: str) -> "Self":
960
1001
  """Select all the signals except the specified signals."""
961
1002
  new_schema = self.signals_schema.select_except_signals(*args)
962
1003
  columns = new_schema.db_signals()
963
- chain = super().select(*columns)
964
- chain.signals_schema = new_schema
965
- return chain
1004
+ return self._evolve(
1005
+ query=self._query.select(*columns), signal_schema=new_schema
1006
+ )
966
1007
 
967
- @detach
968
1008
  def mutate(self, **kwargs) -> "Self":
969
1009
  """Create new signals based on existing signals.
970
1010
 
@@ -1029,9 +1069,9 @@ class DataChain(DatasetQuery):
1029
1069
  # adding new signal
1030
1070
  mutated[name] = value
1031
1071
 
1032
- chain = super().mutate(**mutated)
1033
- chain.signals_schema = schema.mutate(kwargs)
1034
- return chain
1072
+ return self._evolve(
1073
+ query=self._query.mutate(**mutated), signal_schema=schema.mutate(kwargs)
1074
+ )
1035
1075
 
1036
1076
  @property
1037
1077
  def _effective_signals_schema(self) -> "SignalSchema":
@@ -1058,7 +1098,7 @@ class DataChain(DatasetQuery):
1058
1098
  a tuple of row values.
1059
1099
  """
1060
1100
  db_signals = self._effective_signals_schema.db_signals()
1061
- with super().select(*db_signals).as_iterable() as rows:
1101
+ with self._query.select(*db_signals).as_iterable() as rows:
1062
1102
  if row_factory:
1063
1103
  rows = (row_factory(db_signals, r) for r in rows)
1064
1104
  yield from rows
@@ -1126,7 +1166,7 @@ class DataChain(DatasetQuery):
1126
1166
  chain = self.select(*cols) if cols else self
1127
1167
  signals_schema = chain._effective_signals_schema
1128
1168
  db_signals = signals_schema.db_signals()
1129
- with super().select(*db_signals).as_iterable() as rows:
1169
+ with self._query.select(*db_signals).as_iterable() as rows:
1130
1170
  for row in rows:
1131
1171
  ret = signals_schema.row_to_features(
1132
1172
  row, catalog=chain.session.catalog, cache=chain._settings.cache
@@ -1156,7 +1196,7 @@ class DataChain(DatasetQuery):
1156
1196
  """
1157
1197
  from datachain.torch import PytorchDataset
1158
1198
 
1159
- if self.attached:
1199
+ if self._query.attached:
1160
1200
  chain = self
1161
1201
  else:
1162
1202
  chain = self.save()
@@ -1164,7 +1204,7 @@ class DataChain(DatasetQuery):
1164
1204
  return PytorchDataset(
1165
1205
  chain.name,
1166
1206
  chain.version,
1167
- catalog=self.catalog,
1207
+ catalog=self.session.catalog,
1168
1208
  transform=transform,
1169
1209
  tokenizer=tokenizer,
1170
1210
  tokenizer_kwargs=tokenizer_kwargs,
@@ -1175,7 +1215,6 @@ class DataChain(DatasetQuery):
1175
1215
  schema = self.signals_schema.clone_without_file_signals()
1176
1216
  return self.select(*schema.values.keys())
1177
1217
 
1178
- @detach
1179
1218
  def merge(
1180
1219
  self,
1181
1220
  right_ds: "DataChain",
@@ -1240,7 +1279,7 @@ class DataChain(DatasetQuery):
1240
1279
  )
1241
1280
 
1242
1281
  if self == right_ds:
1243
- right_ds = right_ds.clone(new_table=True)
1282
+ right_ds = right_ds.clone()
1244
1283
 
1245
1284
  errors = []
1246
1285
 
@@ -1266,9 +1305,11 @@ class DataChain(DatasetQuery):
1266
1305
  on, right_on, f"Could not resolve {', '.join(errors)}"
1267
1306
  )
1268
1307
 
1269
- ds = self.join(right_ds, sqlalchemy.and_(*ops), inner, rname + "{name}")
1270
-
1271
- ds.feature_schema = None
1308
+ query = self._query.join(
1309
+ right_ds._query, sqlalchemy.and_(*ops), inner, rname + "{name}"
1310
+ )
1311
+ query.feature_schema = None
1312
+ ds = self._evolve(query=query)
1272
1313
 
1273
1314
  signals_schema = self.signals_schema.clone_without_sys_signals()
1274
1315
  right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
@@ -1278,6 +1319,14 @@ class DataChain(DatasetQuery):
1278
1319
 
1279
1320
  return ds
1280
1321
 
1322
+ def union(self, other: "Self") -> "Self":
1323
+ """Return the set union of the two datasets.
1324
+
1325
+ Parameters:
1326
+ other: chain whose rows will be added to `self`.
1327
+ """
1328
+ return self._evolve(query=self._query.union(other._query))
1329
+
1281
1330
  def subtract( # type: ignore[override]
1282
1331
  self,
1283
1332
  other: "DataChain",
@@ -1341,7 +1390,7 @@ class DataChain(DatasetQuery):
1341
1390
  other.signals_schema.resolve(*right_on).db_signals(),
1342
1391
  ) # type: ignore[arg-type]
1343
1392
  )
1344
- return super().subtract(other, signals) # type: ignore[arg-type]
1393
+ return self._evolve(query=self._query.subtract(other._query, signals)) # type: ignore[arg-type]
1345
1394
 
1346
1395
  @classmethod
1347
1396
  def from_values(
@@ -1449,7 +1498,7 @@ class DataChain(DatasetQuery):
1449
1498
  transpose : Whether to transpose rows and columns.
1450
1499
  truncate : Whether or not to truncate the contents of columns.
1451
1500
  """
1452
- dc = self.limit(limit) if limit > 0 else self
1501
+ dc = self.limit(limit) if limit > 0 else self # type: ignore[misc]
1453
1502
  df = dc.to_pandas(flatten)
1454
1503
 
1455
1504
  if df.empty:
@@ -1782,7 +1831,7 @@ class DataChain(DatasetQuery):
1782
1831
  settings: Optional[dict] = None,
1783
1832
  in_memory: bool = False,
1784
1833
  schema: Optional[dict[str, DataType]] = None,
1785
- ) -> "DataChain":
1834
+ ) -> "Self":
1786
1835
  """Create a DataChain from the provided records. This method can be used for
1787
1836
  programmatically generating a chain in contrast to reading data from storages
1788
1837
  or other sources.
@@ -1837,7 +1886,7 @@ class DataChain(DatasetQuery):
1837
1886
  insert_q = dr.get_table().insert()
1838
1887
  for record in to_insert:
1839
1888
  db.execute(insert_q.values(**record))
1840
- return DataChain(name=dsr.name, settings=settings)
1889
+ return cls.from_dataset(name=dsr.name, session=session, settings=settings)
1841
1890
 
1842
1891
  def sum(self, fr: DataType): # type: ignore[override]
1843
1892
  """Compute the sum of a column."""
@@ -1898,8 +1947,8 @@ class DataChain(DatasetQuery):
1898
1947
  ) -> None:
1899
1948
  """Method that exports all files from chain to some folder."""
1900
1949
  if placement == "filename" and (
1901
- super().distinct(pathfunc.name(C(f"{signal}__path"))).count()
1902
- != self.count()
1950
+ self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
1951
+ != self._query.count()
1903
1952
  ):
1904
1953
  raise ValueError("Files with the same name found")
1905
1954
 
@@ -1919,10 +1968,9 @@ class DataChain(DatasetQuery):
1919
1968
  NOTE: Samples are not deterministic, and streamed/paginated queries or
1920
1969
  multiple workers will draw samples with replacement.
1921
1970
  """
1922
- return super().sample(n)
1971
+ return self._evolve(query=self._query.sample(n))
1923
1972
 
1924
- @detach
1925
- def filter(self, *args) -> "Self":
1973
+ def filter(self, *args: Any) -> "Self":
1926
1974
  """Filter the chain according to conditions.
1927
1975
 
1928
1976
  Example:
@@ -1955,14 +2003,50 @@ class DataChain(DatasetQuery):
1955
2003
  )
1956
2004
  ```
1957
2005
  """
1958
- return super().filter(*args)
2006
+ return self._evolve(query=self._query.filter(*args))
1959
2007
 
1960
- @detach
1961
2008
  def limit(self, n: int) -> "Self":
1962
- """Return the first n rows of the chain."""
1963
- return super().limit(n)
2009
+ """Return the first `n` rows of the chain.
2010
+
2011
+ If the chain is unordered, which rows are returned is undefined.
2012
+ If the chain has fewer than `n` rows, the whole chain is returned.
2013
+
2014
+ Parameters:
2015
+ n (int): Number of rows to return.
2016
+ """
2017
+ return self._evolve(query=self._query.limit(n))
1964
2018
 
1965
- @detach
1966
2019
  def offset(self, offset: int) -> "Self":
1967
- """Return the results starting with the offset row."""
1968
- return super().offset(offset)
2020
+ """Return the results starting with the offset row.
2021
+
2022
+ If the chain is unordered, which rows are skipped is undefined.
2023
+ If the chain has fewer than `offset` rows, the result is an empty chain.
2024
+
2025
+ Parameters:
2026
+ offset (int): Number of rows to skip.
2027
+ """
2028
+ return self._evolve(query=self._query.offset(offset))
2029
+
2030
+ def count(self) -> int:
2031
+ """Return the number of rows in the chain."""
2032
+ return self._query.count()
2033
+
2034
+ def exec(self) -> "Self":
2035
+ """Execute the chain."""
2036
+ return self._evolve(query=self._query.exec())
2037
+
2038
+ def chunk(self, index: int, total: int) -> "Self":
2039
+ """Split a chain into smaller chunks for e.g. parallelization.
2040
+
2041
+ Example:
2042
+ ```py
2043
+ chain = DataChain.from_storage(...)
2044
+ chunk_1 = chain.chunk(0, 2)
2045
+ chunk_2 = chain.chunk(1, 2)
2046
+ ```
2047
+
2048
+ Note:
2049
+ Bear in mind that `index` is 0-indexed but `total` isn't.
2050
+ Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
2051
+ """
2052
+ return self._evolve(query=self._query.chunk(index, total))