datachain 0.6.0__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (257) hide show
  1. {datachain-0.6.0/src/datachain.egg-info → datachain-0.6.1}/PKG-INFO +2 -2
  2. {datachain-0.6.0 → datachain-0.6.1}/pyproject.toml +1 -1
  3. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/__init__.py +2 -0
  4. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/catalog/catalog.py +1 -9
  5. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/sqlite.py +8 -0
  6. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/warehouse.py +0 -4
  7. datachain-0.6.1/src/datachain/lib/convert/sql_to_python.py +14 -0
  8. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/dc.py +64 -28
  9. datachain-0.6.1/src/datachain/lib/func/__init__.py +14 -0
  10. datachain-0.6.1/src/datachain/lib/func/aggregate.py +42 -0
  11. datachain-0.6.1/src/datachain/lib/func/func.py +64 -0
  12. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/signal_schema.py +9 -3
  13. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/utils.py +5 -0
  14. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/__init__.py +1 -2
  15. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/batch.py +0 -1
  16. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/dataset.py +22 -43
  17. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/schema.py +1 -61
  18. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/session.py +33 -25
  19. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/__init__.py +1 -1
  20. datachain-0.6.1/src/datachain/sql/functions/aggregate.py +47 -0
  21. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/array.py +0 -8
  22. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/sqlite/base.py +20 -2
  23. {datachain-0.6.0 → datachain-0.6.1/src/datachain.egg-info}/PKG-INFO +2 -2
  24. {datachain-0.6.0 → datachain-0.6.1}/src/datachain.egg-info/SOURCES.txt +4 -0
  25. {datachain-0.6.0 → datachain-0.6.1}/src/datachain.egg-info/requires.txt +1 -1
  26. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_datachain.py +61 -8
  27. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_dataset_query.py +0 -34
  28. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_datasets.py +33 -0
  29. datachain-0.6.1/tests/scripts/feature_class_exception.py +34 -0
  30. {datachain-0.6.0 → datachain-0.6.1}/tests/test_atomicity.py +10 -4
  31. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_datachain.py +350 -19
  32. datachain-0.6.1/tests/unit/lib/test_sql_to_python.py +25 -0
  33. {datachain-0.6.0 → datachain-0.6.1}/tests/utils.py +20 -0
  34. datachain-0.6.0/src/datachain/lib/convert/sql_to_python.py +0 -18
  35. datachain-0.6.0/tests/scripts/feature_class_exception.py +0 -11
  36. datachain-0.6.0/tests/unit/lib/test_sql_to_python.py +0 -28
  37. {datachain-0.6.0 → datachain-0.6.1}/.cruft.json +0 -0
  38. {datachain-0.6.0 → datachain-0.6.1}/.gitattributes +0 -0
  39. {datachain-0.6.0 → datachain-0.6.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  40. {datachain-0.6.0 → datachain-0.6.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  41. {datachain-0.6.0 → datachain-0.6.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  42. {datachain-0.6.0 → datachain-0.6.1}/.github/codecov.yaml +0 -0
  43. {datachain-0.6.0 → datachain-0.6.1}/.github/dependabot.yml +0 -0
  44. {datachain-0.6.0 → datachain-0.6.1}/.github/workflows/benchmarks.yml +0 -0
  45. {datachain-0.6.0 → datachain-0.6.1}/.github/workflows/release.yml +0 -0
  46. {datachain-0.6.0 → datachain-0.6.1}/.github/workflows/tests-studio.yml +0 -0
  47. {datachain-0.6.0 → datachain-0.6.1}/.github/workflows/tests.yml +0 -0
  48. {datachain-0.6.0 → datachain-0.6.1}/.github/workflows/update-template.yaml +0 -0
  49. {datachain-0.6.0 → datachain-0.6.1}/.gitignore +0 -0
  50. {datachain-0.6.0 → datachain-0.6.1}/.pre-commit-config.yaml +0 -0
  51. {datachain-0.6.0 → datachain-0.6.1}/CODE_OF_CONDUCT.rst +0 -0
  52. {datachain-0.6.0 → datachain-0.6.1}/CONTRIBUTING.rst +0 -0
  53. {datachain-0.6.0 → datachain-0.6.1}/LICENSE +0 -0
  54. {datachain-0.6.0 → datachain-0.6.1}/README.rst +0 -0
  55. {datachain-0.6.0 → datachain-0.6.1}/docs/assets/captioned_cartoons.png +0 -0
  56. {datachain-0.6.0 → datachain-0.6.1}/docs/assets/datachain-white.svg +0 -0
  57. {datachain-0.6.0 → datachain-0.6.1}/docs/assets/datachain.svg +0 -0
  58. {datachain-0.6.0 → datachain-0.6.1}/docs/assets/flowchart.png +0 -0
  59. {datachain-0.6.0 → datachain-0.6.1}/docs/index.md +0 -0
  60. {datachain-0.6.0 → datachain-0.6.1}/docs/references/datachain.md +0 -0
  61. {datachain-0.6.0 → datachain-0.6.1}/docs/references/datatype.md +0 -0
  62. {datachain-0.6.0 → datachain-0.6.1}/docs/references/file.md +0 -0
  63. {datachain-0.6.0 → datachain-0.6.1}/docs/references/index.md +0 -0
  64. {datachain-0.6.0 → datachain-0.6.1}/docs/references/sql.md +0 -0
  65. {datachain-0.6.0 → datachain-0.6.1}/docs/references/torch.md +0 -0
  66. {datachain-0.6.0 → datachain-0.6.1}/docs/references/udf.md +0 -0
  67. {datachain-0.6.0 → datachain-0.6.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  68. {datachain-0.6.0 → datachain-0.6.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  69. {datachain-0.6.0 → datachain-0.6.1}/examples/computer_vision/openimage-detect.py +0 -0
  70. {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/common_sql_functions.py +0 -0
  71. {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/json-csv-reader.py +0 -0
  72. {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/torch-loader.py +0 -0
  73. {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/udfs/parallel.py +0 -0
  74. {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/udfs/simple.py +0 -0
  75. {datachain-0.6.0 → datachain-0.6.1}/examples/get_started/udfs/stateful.py +0 -0
  76. {datachain-0.6.0 → datachain-0.6.1}/examples/llm_and_nlp/claude-query.py +0 -0
  77. {datachain-0.6.0 → datachain-0.6.1}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  78. {datachain-0.6.0 → datachain-0.6.1}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  79. {datachain-0.6.0 → datachain-0.6.1}/examples/multimodal/clip_inference.py +0 -0
  80. {datachain-0.6.0 → datachain-0.6.1}/examples/multimodal/hf_pipeline.py +0 -0
  81. {datachain-0.6.0 → datachain-0.6.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  82. {datachain-0.6.0 → datachain-0.6.1}/examples/multimodal/wds.py +0 -0
  83. {datachain-0.6.0 → datachain-0.6.1}/examples/multimodal/wds_filtered.py +0 -0
  84. {datachain-0.6.0 → datachain-0.6.1}/mkdocs.yml +0 -0
  85. {datachain-0.6.0 → datachain-0.6.1}/noxfile.py +0 -0
  86. {datachain-0.6.0 → datachain-0.6.1}/overrides/main.html +0 -0
  87. {datachain-0.6.0 → datachain-0.6.1}/setup.cfg +0 -0
  88. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/__main__.py +0 -0
  89. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/asyn.py +0 -0
  90. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/cache.py +0 -0
  91. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/catalog/__init__.py +0 -0
  92. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/catalog/datasource.py +0 -0
  93. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/catalog/loader.py +0 -0
  94. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/cli.py +0 -0
  95. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/cli_utils.py +0 -0
  96. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/__init__.py +0 -0
  97. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/azure.py +0 -0
  98. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/fileslice.py +0 -0
  99. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/fsspec.py +0 -0
  100. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/gcs.py +0 -0
  101. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/hf.py +0 -0
  102. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/local.py +0 -0
  103. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/client/s3.py +0 -0
  104. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/config.py +0 -0
  105. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/__init__.py +0 -0
  106. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/db_engine.py +0 -0
  107. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/id_generator.py +0 -0
  108. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/job.py +0 -0
  109. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/metastore.py +0 -0
  110. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/schema.py +0 -0
  111. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/data_storage/serializer.py +0 -0
  112. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/dataset.py +0 -0
  113. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/error.py +0 -0
  114. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/job.py +0 -0
  115. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/__init__.py +0 -0
  116. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/arrow.py +0 -0
  117. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/clip.py +0 -0
  118. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/convert/__init__.py +0 -0
  119. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/convert/flatten.py +0 -0
  120. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  121. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/convert/unflatten.py +0 -0
  122. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  123. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/data_model.py +0 -0
  124. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/dataset_info.py +0 -0
  125. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/file.py +0 -0
  126. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/hf.py +0 -0
  127. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/image.py +0 -0
  128. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/listing.py +0 -0
  129. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/listing_info.py +0 -0
  130. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/meta_formats.py +0 -0
  131. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/model_store.py +0 -0
  132. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/pytorch.py +0 -0
  133. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/settings.py +0 -0
  134. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/tar.py +0 -0
  135. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/text.py +0 -0
  136. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/udf.py +0 -0
  137. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/udf_signature.py +0 -0
  138. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/vfile.py +0 -0
  139. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/webdataset.py +0 -0
  140. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/lib/webdataset_laion.py +0 -0
  141. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/listing.py +0 -0
  142. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/node.py +0 -0
  143. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/nodes_fetcher.py +0 -0
  144. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/nodes_thread_pool.py +0 -0
  145. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/progress.py +0 -0
  146. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/py.typed +0 -0
  147. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/dispatch.py +0 -0
  148. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/metrics.py +0 -0
  149. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/params.py +0 -0
  150. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/query/queue.py +0 -0
  151. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/remote/__init__.py +0 -0
  152. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/remote/studio.py +0 -0
  153. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/__init__.py +0 -0
  154. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/default/__init__.py +0 -0
  155. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/default/base.py +0 -0
  156. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/conditional.py +0 -0
  157. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/path.py +0 -0
  158. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/random.py +0 -0
  159. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/functions/string.py +0 -0
  160. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/selectable.py +0 -0
  161. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  162. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/sqlite/types.py +0 -0
  163. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/sqlite/vector.py +0 -0
  164. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/types.py +0 -0
  165. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/sql/utils.py +0 -0
  166. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/storage.py +0 -0
  167. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/telemetry.py +0 -0
  168. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/torch/__init__.py +0 -0
  169. {datachain-0.6.0 → datachain-0.6.1}/src/datachain/utils.py +0 -0
  170. {datachain-0.6.0 → datachain-0.6.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  171. {datachain-0.6.0 → datachain-0.6.1}/src/datachain.egg-info/entry_points.txt +0 -0
  172. {datachain-0.6.0 → datachain-0.6.1}/src/datachain.egg-info/top_level.txt +0 -0
  173. {datachain-0.6.0 → datachain-0.6.1}/tests/__init__.py +0 -0
  174. {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/__init__.py +0 -0
  175. {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/conftest.py +0 -0
  176. {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  177. {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  178. {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/datasets/.gitignore +0 -0
  179. {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  180. {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/test_datachain.py +0 -0
  181. {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/test_ls.py +0 -0
  182. {datachain-0.6.0 → datachain-0.6.1}/tests/benchmarks/test_version.py +0 -0
  183. {datachain-0.6.0 → datachain-0.6.1}/tests/conftest.py +0 -0
  184. {datachain-0.6.0 → datachain-0.6.1}/tests/data.py +0 -0
  185. {datachain-0.6.0 → datachain-0.6.1}/tests/examples/__init__.py +0 -0
  186. {datachain-0.6.0 → datachain-0.6.1}/tests/examples/test_examples.py +0 -0
  187. {datachain-0.6.0 → datachain-0.6.1}/tests/examples/test_wds_e2e.py +0 -0
  188. {datachain-0.6.0 → datachain-0.6.1}/tests/examples/wds_data.py +0 -0
  189. {datachain-0.6.0 → datachain-0.6.1}/tests/func/__init__.py +0 -0
  190. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_catalog.py +0 -0
  191. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_client.py +0 -0
  192. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_feature_pickling.py +0 -0
  193. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_listing.py +0 -0
  194. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_ls.py +0 -0
  195. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_meta_formats.py +0 -0
  196. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_metrics.py +0 -0
  197. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_pull.py +0 -0
  198. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_pytorch.py +0 -0
  199. {datachain-0.6.0 → datachain-0.6.1}/tests/func/test_query.py +0 -0
  200. {datachain-0.6.0 → datachain-0.6.1}/tests/scripts/feature_class.py +0 -0
  201. {datachain-0.6.0 → datachain-0.6.1}/tests/scripts/feature_class_parallel.py +0 -0
  202. {datachain-0.6.0 → datachain-0.6.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  203. {datachain-0.6.0 → datachain-0.6.1}/tests/scripts/name_len_slow.py +0 -0
  204. {datachain-0.6.0 → datachain-0.6.1}/tests/test_cli_e2e.py +0 -0
  205. {datachain-0.6.0 → datachain-0.6.1}/tests/test_query_e2e.py +0 -0
  206. {datachain-0.6.0 → datachain-0.6.1}/tests/test_telemetry.py +0 -0
  207. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/__init__.py +0 -0
  208. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/__init__.py +0 -0
  209. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/conftest.py +0 -0
  210. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_arrow.py +0 -0
  211. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_clip.py +0 -0
  212. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  213. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  214. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_feature.py +0 -0
  215. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_feature_utils.py +0 -0
  216. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_file.py +0 -0
  217. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_hf.py +0 -0
  218. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_image.py +0 -0
  219. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_schema.py +0 -0
  220. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_signal_schema.py +0 -0
  221. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_text.py +0 -0
  222. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_udf_signature.py +0 -0
  223. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_utils.py +0 -0
  224. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/lib/test_webdataset.py +0 -0
  225. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/__init__.py +0 -0
  226. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  227. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  228. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_array.py +0 -0
  229. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_conditional.py +0 -0
  230. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_path.py +0 -0
  231. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_random.py +0 -0
  232. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_selectable.py +0 -0
  233. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/sql/test_string.py +0 -0
  234. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_asyn.py +0 -0
  235. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_cache.py +0 -0
  236. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_catalog.py +0 -0
  237. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_catalog_loader.py +0 -0
  238. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_cli_parsing.py +0 -0
  239. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_client.py +0 -0
  240. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_client_s3.py +0 -0
  241. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_data_storage.py +0 -0
  242. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_database_engine.py +0 -0
  243. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_dataset.py +0 -0
  244. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_dispatch.py +0 -0
  245. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_fileslice.py +0 -0
  246. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_id_generator.py +0 -0
  247. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_listing.py +0 -0
  248. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_metastore.py +0 -0
  249. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_module_exports.py +0 -0
  250. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_query.py +0 -0
  251. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_query_metrics.py +0 -0
  252. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_query_params.py +0 -0
  253. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_serializer.py +0 -0
  254. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_session.py +0 -0
  255. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_storage.py +0 -0
  256. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_utils.py +0 -0
  257. {datachain-0.6.0 → datachain-0.6.1}/tests/unit/test_warehouse.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.0
3
+ Version: 0.6.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -81,7 +81,7 @@ Requires-Dist: requests-mock; extra == "tests"
81
81
  Requires-Dist: scipy; extra == "tests"
82
82
  Provides-Extra: dev
83
83
  Requires-Dist: datachain[docs,tests]; extra == "dev"
84
- Requires-Dist: mypy==1.11.2; extra == "dev"
84
+ Requires-Dist: mypy==1.12.0; extra == "dev"
85
85
  Requires-Dist: types-python-dateutil; extra == "dev"
86
86
  Requires-Dist: types-pytz; extra == "dev"
87
87
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -93,7 +93,7 @@ tests = [
93
93
  ]
94
94
  dev = [
95
95
  "datachain[docs,tests]",
96
- "mypy==1.11.2",
96
+ "mypy==1.12.0",
97
97
  "types-python-dateutil",
98
98
  "types-pytz",
99
99
  "types-PyYAML",
@@ -1,3 +1,4 @@
1
+ from datachain.lib import func
1
2
  from datachain.lib.data_model import DataModel, DataType, is_chain_type
2
3
  from datachain.lib.dc import C, Column, DataChain, Sys
3
4
  from datachain.lib.file import (
@@ -34,6 +35,7 @@ __all__ = [
34
35
  "Sys",
35
36
  "TarVFile",
36
37
  "TextFile",
38
+ "func",
37
39
  "is_chain_type",
38
40
  "metrics",
39
41
  "param",
@@ -989,13 +989,6 @@ class Catalog:
989
989
  c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
990
990
  }
991
991
 
992
- job_id = job_id or os.getenv("DATACHAIN_JOB_ID")
993
- if not job_id:
994
- from datachain.query.session import Session
995
-
996
- session = Session.get(catalog=self)
997
- job_id = session.job_id
998
-
999
992
  dataset = self.metastore.create_dataset_version(
1000
993
  dataset,
1001
994
  version,
@@ -1218,6 +1211,7 @@ class Catalog:
1218
1211
  preview=dataset_version.preview,
1219
1212
  job_id=dataset_version.job_id,
1220
1213
  )
1214
+
1221
1215
  # to avoid re-creating rows table, we are just renaming it for a new version
1222
1216
  # of target dataset
1223
1217
  self.warehouse.rename_dataset_table(
@@ -1325,8 +1319,6 @@ class Catalog:
1325
1319
  if offset:
1326
1320
  q = q.offset(offset)
1327
1321
 
1328
- q = q.order_by("sys__id")
1329
-
1330
1322
  return q.to_db_records()
1331
1323
 
1332
1324
  def signed_url(self, source: str, path: str, client_config=None) -> str:
@@ -763,6 +763,14 @@ class SQLiteWarehouse(AbstractWarehouse):
763
763
  query: Select,
764
764
  progress_cb: Optional[Callable[[int], None]] = None,
765
765
  ) -> None:
766
+ if len(query._group_by_clause) > 0:
767
+ select_q = query.with_only_columns(
768
+ *[c for c in query.selected_columns if c.name != "sys__id"]
769
+ )
770
+ q = table.insert().from_select(list(select_q.selected_columns), select_q)
771
+ self.db.execute(q)
772
+ return
773
+
766
774
  if "sys__id" in query.selected_columns:
767
775
  col_id = query.selected_columns.sys__id
768
776
  else:
@@ -215,10 +215,6 @@ class AbstractWarehouse(ABC, Serializable):
215
215
  limit = query._limit
216
216
  paginated_query = query.limit(page_size)
217
217
 
218
- if not paginated_query._order_by_clauses:
219
- # default order by is order by `sys__id`
220
- paginated_query = paginated_query.order_by(query.selected_columns.sys__id)
221
-
222
218
  results = None
223
219
  offset = 0
224
220
  num_yielded = 0
@@ -0,0 +1,14 @@
1
+ from decimal import Decimal
2
+ from typing import Any
3
+
4
+ from sqlalchemy import ColumnElement
5
+
6
+
7
+ def sql_to_python(sql_exp: ColumnElement) -> Any:
8
+ try:
9
+ type_ = sql_exp.type.python_type
10
+ if type_ == Decimal:
11
+ type_ = float
12
+ except NotImplementedError:
13
+ type_ = str
14
+ return type_
@@ -29,6 +29,7 @@ from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
29
29
  from datachain.lib.dataset_info import DatasetInfo
30
30
  from datachain.lib.file import ArrowRow, File, get_file_type
31
31
  from datachain.lib.file import ExportPlacement as FileExportPlacement
32
+ from datachain.lib.func import Func
32
33
  from datachain.lib.listing import (
33
34
  is_listing_dataset,
34
35
  is_listing_expired,
@@ -42,21 +43,12 @@ from datachain.lib.meta_formats import read_meta, read_schema
42
43
  from datachain.lib.model_store import ModelStore
43
44
  from datachain.lib.settings import Settings
44
45
  from datachain.lib.signal_schema import SignalSchema
45
- from datachain.lib.udf import (
46
- Aggregator,
47
- BatchMapper,
48
- Generator,
49
- Mapper,
50
- UDFBase,
51
- )
46
+ from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
52
47
  from datachain.lib.udf_signature import UdfSignature
53
- from datachain.lib.utils import DataChainParamsError
48
+ from datachain.lib.utils import DataChainColumnError, DataChainParamsError
54
49
  from datachain.query import Session
55
- from datachain.query.dataset import (
56
- DatasetQuery,
57
- PartitionByType,
58
- )
59
- from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
50
+ from datachain.query.dataset import DatasetQuery, PartitionByType
51
+ from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
60
52
  from datachain.sql.functions import path as pathfunc
61
53
  from datachain.telemetry import telemetry
62
54
  from datachain.utils import batched_it, inside_notebook
@@ -149,11 +141,6 @@ class DatasetMergeError(DataChainParamsError): # noqa: D101
149
141
  super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
150
142
 
151
143
 
152
- class DataChainColumnError(DataChainParamsError): # noqa: D101
153
- def __init__(self, col_name, msg): # noqa: D107
154
- super().__init__(f"Error for column {col_name}: {msg}")
155
-
156
-
157
144
  OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]
158
145
 
159
146
 
@@ -982,10 +969,9 @@ class DataChain:
982
969
  row is left in the result set.
983
970
 
984
971
  Example:
985
- ```py
986
- dc.distinct("file.parent", "file.name")
987
- )
988
- ```
972
+ ```py
973
+ dc.distinct("file.parent", "file.name")
974
+ ```
989
975
  """
990
976
  return self._evolve(
991
977
  query=self._query.distinct(
@@ -1011,6 +997,60 @@ class DataChain:
1011
997
  query=self._query.select(*columns), signal_schema=new_schema
1012
998
  )
1013
999
 
1000
+ def group_by(
1001
+ self,
1002
+ *,
1003
+ partition_by: Union[str, Sequence[str]],
1004
+ **kwargs: Func,
1005
+ ) -> "Self":
1006
+ """Group rows by specified set of signals and return new signals
1007
+ with aggregated values.
1008
+
1009
+ Example:
1010
+ ```py
1011
+ chain = chain.group_by(
1012
+ cnt=func.count(),
1013
+ partition_by=("file_source", "file_ext"),
1014
+ )
1015
+ ```
1016
+ """
1017
+ if isinstance(partition_by, str):
1018
+ partition_by = [partition_by]
1019
+ if not partition_by:
1020
+ raise ValueError("At least one column should be provided for partition_by")
1021
+
1022
+ if not kwargs:
1023
+ raise ValueError("At least one column should be provided for group_by")
1024
+ for col_name, func in kwargs.items():
1025
+ if not isinstance(func, Func):
1026
+ raise DataChainColumnError(
1027
+ col_name,
1028
+ f"Column {col_name} has type {type(func)} but expected Func object",
1029
+ )
1030
+
1031
+ partition_by_columns: list[Column] = []
1032
+ signal_columns: list[Column] = []
1033
+ schema_fields: dict[str, DataType] = {}
1034
+
1035
+ # validate partition_by columns and add them to the schema
1036
+ for col_name in partition_by:
1037
+ col_db_name = ColumnMeta.to_db_name(col_name)
1038
+ col_type = self.signals_schema.get_column_type(col_db_name)
1039
+ col = Column(col_db_name, python_to_sql(col_type))
1040
+ partition_by_columns.append(col)
1041
+ schema_fields[col_db_name] = col_type
1042
+
1043
+ # validate signal columns and add them to the schema
1044
+ for col_name, func in kwargs.items():
1045
+ col = func.get_column(self.signals_schema, label=col_name)
1046
+ signal_columns.append(col)
1047
+ schema_fields[col_name] = func.get_result_type(self.signals_schema)
1048
+
1049
+ return self._evolve(
1050
+ query=self._query.group_by(signal_columns, partition_by_columns),
1051
+ signal_schema=SignalSchema(schema_fields),
1052
+ )
1053
+
1014
1054
  def mutate(self, **kwargs) -> "Self":
1015
1055
  """Create new signals based on existing signals.
1016
1056
 
@@ -1477,12 +1517,6 @@ class DataChain:
1477
1517
  fr_map = {col.lower(): df[col].tolist() for col in df.columns}
1478
1518
 
1479
1519
  for column in fr_map:
1480
- if column in DatasetRow.schema:
1481
- raise DatasetPrepareError(
1482
- name,
1483
- f"import from pandas error - column '{column}' conflicts with"
1484
- " default schema",
1485
- )
1486
1520
  if not column.isidentifier():
1487
1521
  raise DatasetPrepareError(
1488
1522
  name,
@@ -1994,6 +2028,8 @@ class DataChain:
1994
2028
  ),
1995
2029
  )
1996
2030
 
2031
+ session.add_dataset_version(dsr, dsr.latest_version)
2032
+
1997
2033
  if isinstance(to_insert, dict):
1998
2034
  to_insert = [to_insert]
1999
2035
  elif not to_insert:
@@ -0,0 +1,14 @@
1
+ from .aggregate import any_value, avg, collect, concat, count, max, min, sum
2
+ from .func import Func
3
+
4
+ __all__ = [
5
+ "Func",
6
+ "any_value",
7
+ "avg",
8
+ "collect",
9
+ "concat",
10
+ "count",
11
+ "max",
12
+ "min",
13
+ "sum",
14
+ ]
@@ -0,0 +1,42 @@
1
+ from typing import Optional
2
+
3
+ from sqlalchemy import func as sa_func
4
+
5
+ from datachain.sql import functions as dc_func
6
+
7
+ from .func import Func
8
+
9
+
10
+ def count(col: Optional[str] = None) -> Func:
11
+ return Func(inner=sa_func.count, col=col, result_type=int)
12
+
13
+
14
+ def sum(col: str) -> Func:
15
+ return Func(inner=sa_func.sum, col=col)
16
+
17
+
18
+ def avg(col: str) -> Func:
19
+ return Func(inner=dc_func.aggregate.avg, col=col)
20
+
21
+
22
+ def min(col: str) -> Func:
23
+ return Func(inner=sa_func.min, col=col)
24
+
25
+
26
+ def max(col: str) -> Func:
27
+ return Func(inner=sa_func.max, col=col)
28
+
29
+
30
+ def any_value(col: str) -> Func:
31
+ return Func(inner=dc_func.aggregate.any_value, col=col)
32
+
33
+
34
+ def collect(col: str) -> Func:
35
+ return Func(inner=dc_func.aggregate.collect, col=col, is_array=True)
36
+
37
+
38
+ def concat(col: str, separator="") -> Func:
39
+ def inner(arg):
40
+ return dc_func.aggregate.group_concat(arg, separator)
41
+
42
+ return Func(inner=inner, col=col, result_type=str)
@@ -0,0 +1,64 @@
1
+ from typing import TYPE_CHECKING, Callable, Optional
2
+
3
+ from datachain.lib.convert.python_to_sql import python_to_sql
4
+ from datachain.lib.utils import DataChainColumnError
5
+ from datachain.query.schema import Column, ColumnMeta
6
+
7
+ if TYPE_CHECKING:
8
+ from datachain import DataType
9
+ from datachain.lib.signal_schema import SignalSchema
10
+
11
+
12
class Func:
    """Wrapper binding an SQL aggregate callable to an (optional) column.

    Holds the SQL-level callable (``inner``) together with the column it
    operates on, an optional explicit result type, and a flag telling
    whether the aggregation yields an array of column values.
    """

    def __init__(
        self,
        inner: Callable,
        col: Optional[str] = None,
        result_type: Optional["DataType"] = None,
        is_array: bool = False,
    ) -> None:
        self.inner = inner
        self.col = col
        self.result_type = result_type
        self.is_array = is_array

    @property
    def db_col(self) -> Optional[str]:
        """Column name converted to its flat database form, or None if unset."""
        return ColumnMeta.to_db_name(self.col) if self.col else None

    def db_col_type(self, signals_schema: "SignalSchema") -> Optional["DataType"]:
        """Type of the underlying column; wrapped in list[...] for array results."""
        if not self.db_col:
            return None
        col_type: type = signals_schema.get_column_type(self.db_col)
        return list[col_type] if self.is_array else col_type  # type: ignore[valid-type]

    def get_result_type(self, signals_schema: "SignalSchema") -> "DataType":
        """Resolve the result type: explicit ``result_type`` wins, else column type.

        Raises:
            DataChainColumnError: if neither a result type nor a column
                to infer it from is available.
        """
        col_type = self.db_col_type(signals_schema)

        if self.result_type:
            return self.result_type

        if col_type:
            return col_type

        raise DataChainColumnError(
            str(self.inner),
            "Column name is required to infer result type",
        )

    def get_column(
        self, signals_schema: "SignalSchema", label: Optional[str] = None
    ) -> "Column":
        """Build the SQL column expression for this function.

        When a column is set, ``inner`` is applied to it; otherwise ``inner``
        is invoked with no arguments (e.g. count(*)). An optional label is
        attached to the resulting expression.
        """
        if self.col:
            # fix: removed leftover debug code that printed the label to
            # stdout whenever it was "collect"
            col_type = self.get_result_type(signals_schema)
            col = Column(self.db_col, python_to_sql(col_type))
            func_col = self.inner(col)
        else:
            func_col = self.inner()

        if label:
            func_col = func_col.label(label)

        return func_col
@@ -400,6 +400,12 @@ class SignalSchema:
400
400
  if ModelStore.is_pydantic(finfo.annotation):
401
401
  SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
402
402
 
403
+ def get_column_type(self, col_name: str) -> DataType:
404
+ for path, _type, has_subtree, _ in self.get_flat_tree():
405
+ if not has_subtree and DEFAULT_DELIMITER.join(path) == col_name:
406
+ return _type
407
+ raise SignalResolvingError([col_name], "is not found")
408
+
403
409
  def db_signals(
404
410
  self, name: Optional[str] = None, as_columns=False
405
411
  ) -> Union[list[str], list[Column]]:
@@ -490,7 +496,7 @@ class SignalSchema:
490
496
  new_values[name] = args_map[name]
491
497
  else:
492
498
  # adding new signal
493
- new_values.update(sql_to_python({name: value}))
499
+ new_values[name] = sql_to_python(value)
494
500
 
495
501
  return SignalSchema(new_values)
496
502
 
@@ -534,12 +540,12 @@ class SignalSchema:
534
540
  for name, val in values.items()
535
541
  }
536
542
 
537
- def get_flat_tree(self) -> Iterator[tuple[list[str], type, bool, int]]:
543
+ def get_flat_tree(self) -> Iterator[tuple[list[str], DataType, bool, int]]:
538
544
  yield from self._get_flat_tree(self.tree, [], 0)
539
545
 
540
546
  def _get_flat_tree(
541
547
  self, tree: dict, prefix: list[str], depth: int
542
- ) -> Iterator[tuple[list[str], type, bool, int]]:
548
+ ) -> Iterator[tuple[list[str], DataType, bool, int]]:
543
549
  for name, (type_, substree) in tree.items():
544
550
  suffix = name.split(".")
545
551
  new_prefix = prefix + suffix
@@ -23,3 +23,8 @@ class DataChainError(Exception):
23
23
  class DataChainParamsError(DataChainError):
24
24
  def __init__(self, message):
25
25
  super().__init__(message)
26
+
27
+
28
class DataChainColumnError(DataChainParamsError):
    """Raised when an operation involving a specific column fails."""

    def __init__(self, col_name, msg):
        message = f"Error for column {col_name}: {msg}"
        super().__init__(message)
@@ -1,12 +1,11 @@
1
1
  from .dataset import DatasetQuery
2
2
  from .params import param
3
- from .schema import C, DatasetRow, LocalFilename, Object, Stream
3
+ from .schema import C, LocalFilename, Object, Stream
4
4
  from .session import Session
5
5
 
6
6
  __all__ = [
7
7
  "C",
8
8
  "DatasetQuery",
9
- "DatasetRow",
10
9
  "LocalFilename",
11
10
  "Object",
12
11
  "Session",
@@ -97,7 +97,6 @@ class Partition(BatchingStrategy):
97
97
 
98
98
  ordered_query = query.order_by(None).order_by(
99
99
  PARTITION_COLUMN_ID,
100
- "sys__id",
101
100
  *query._order_by_clauses,
102
101
  )
103
102
 
@@ -591,10 +591,6 @@ class UDFSignal(UDFStep):
591
591
  return query, []
592
592
  table = self.catalog.warehouse.create_pre_udf_table(query)
593
593
  q: Select = sqlalchemy.select(*table.c)
594
- if query._order_by_clauses:
595
- # we are adding ordering only if it's explicitly added by user in
596
- # query part before adding signals
597
- q = q.order_by(table.c.sys__id)
598
594
  return q, [table]
599
595
 
600
596
  def create_result_query(
@@ -630,11 +626,6 @@ class UDFSignal(UDFStep):
630
626
  else:
631
627
  res = sqlalchemy.select(*cols1).select_from(subq)
632
628
 
633
- if query._order_by_clauses:
634
- # if ordering is used in query part before adding signals, we
635
- # will have it as order by id from select from pre-created udf table
636
- res = res.order_by(subq.c.sys__id)
637
-
638
629
  if self.partition_by is not None:
639
630
  subquery = res.subquery()
640
631
  res = sqlalchemy.select(*subquery.c).select_from(subquery)
@@ -666,13 +657,6 @@ class RowGenerator(UDFStep):
666
657
  def create_result_query(
667
658
  self, udf_table, query: Select
668
659
  ) -> tuple[QueryGeneratorFunc, list["sqlalchemy.Column"]]:
669
- if not query._order_by_clauses:
670
- # if we are not selecting all rows in UDF, we need to ensure that
671
- # we get the same rows as we got as inputs of UDF since selecting
672
- # without ordering can be non deterministic in some databases
673
- c = query.selected_columns
674
- query = query.order_by(c.sys__id)
675
-
676
660
  udf_table_query = udf_table.select().subquery()
677
661
  udf_table_cols: list[sqlalchemy.Label[Any]] = [
678
662
  label(c.name, c) for c in udf_table_query.columns
@@ -957,24 +941,24 @@ class SQLJoin(Step):
957
941
 
958
942
 
959
943
  @frozen
960
- class GroupBy(Step):
961
- """Group rows by a specific column."""
962
-
963
- cols: PartitionByType
944
+ class SQLGroupBy(SQLClause):
945
+ cols: Sequence[Union[str, ColumnElement]]
946
+ group_by: Sequence[Union[str, ColumnElement]]
964
947
 
965
- def clone(self) -> "Self":
966
- return self.__class__(self.cols)
948
+ def apply_sql_clause(self, query) -> Select:
949
+ if not self.cols:
950
+ raise ValueError("No columns to select")
951
+ if not self.group_by:
952
+ raise ValueError("No columns to group by")
967
953
 
968
- def apply(
969
- self, query_generator: QueryGenerator, temp_tables: list[str]
970
- ) -> StepResult:
971
- query = query_generator.select()
972
- grouped_query = query.group_by(*self.cols)
954
+ subquery = query.subquery()
973
955
 
974
- def q(*columns):
975
- return grouped_query.with_only_columns(*columns)
956
+ cols = [
957
+ subquery.c[str(c)] if isinstance(c, (str, C)) else c
958
+ for c in [*self.group_by, *self.cols]
959
+ ]
976
960
 
977
- return step_result(q, grouped_query.selected_columns)
961
+ return sqlalchemy.select(*cols).select_from(subquery).group_by(*self.group_by)
978
962
 
979
963
 
980
964
  def _validate_columns(
@@ -1130,25 +1114,14 @@ class DatasetQuery:
1130
1114
  query.steps = query.steps[-1:] + query.steps[:-1]
1131
1115
 
1132
1116
  result = query.starting_step.apply()
1133
- group_by = None
1134
1117
  self.dependencies.update(result.dependencies)
1135
1118
 
1136
1119
  for step in query.steps:
1137
- if isinstance(step, GroupBy):
1138
- if group_by is not None:
1139
- raise TypeError("only one group_by allowed")
1140
- group_by = step
1141
- continue
1142
-
1143
1120
  result = step.apply(
1144
1121
  result.query_generator, self.temp_table_names
1145
1122
  ) # a chain of steps linked by results
1146
1123
  self.dependencies.update(result.dependencies)
1147
1124
 
1148
- if group_by:
1149
- result = group_by.apply(result.query_generator, self.temp_table_names)
1150
- self.dependencies.update(result.dependencies)
1151
-
1152
1125
  return result.query_generator
1153
1126
 
1154
1127
  @staticmethod
@@ -1410,9 +1383,13 @@ class DatasetQuery:
1410
1383
  return query.as_scalar()
1411
1384
 
1412
1385
  @detach
1413
- def group_by(self, *cols: ColumnElement) -> "Self":
1386
+ def group_by(
1387
+ self,
1388
+ cols: Sequence[ColumnElement],
1389
+ group_by: Sequence[ColumnElement],
1390
+ ) -> "Self":
1414
1391
  query = self.clone()
1415
- query.steps.append(GroupBy(cols))
1392
+ query.steps.append(SQLGroupBy(cols, group_by))
1416
1393
  return query
1417
1394
 
1418
1395
  @detach
@@ -1591,6 +1568,8 @@ class DatasetQuery:
1591
1568
  )
1592
1569
  version = version or dataset.latest_version
1593
1570
 
1571
+ self.session.add_dataset_version(dataset=dataset, version=version)
1572
+
1594
1573
  dr = self.catalog.warehouse.dataset_rows(dataset)
1595
1574
 
1596
1575
  self.catalog.warehouse.copy_table(dr.get_table(), query.select())
@@ -1,16 +1,13 @@
1
1
  import functools
2
- import json
3
2
  from abc import ABC, abstractmethod
4
- from datetime import datetime, timezone
5
3
  from fnmatch import fnmatch
6
- from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
4
+ from typing import TYPE_CHECKING, Any, Callable, Optional, Union
7
5
 
8
6
  import attrs
9
7
  import sqlalchemy as sa
10
8
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
11
9
 
12
10
  from datachain.lib.file import File
13
- from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String
14
11
 
15
12
  if TYPE_CHECKING:
16
13
  from datachain.catalog import Catalog
@@ -228,61 +225,4 @@ def normalize_param(param: UDFParamSpec) -> UDFParameter:
228
225
  raise TypeError(f"Invalid UDF parameter: {param}")
229
226
 
230
227
 
231
- class DatasetRow:
232
- schema: ClassVar[dict[str, type[SQLType]]] = {
233
- "source": String,
234
- "path": String,
235
- "size": Int64,
236
- "location": JSON,
237
- "is_latest": Boolean,
238
- "last_modified": DateTime,
239
- "version": String,
240
- "etag": String,
241
- }
242
-
243
- @staticmethod
244
- def create(
245
- path: str,
246
- source: str = "",
247
- size: int = 0,
248
- location: Optional[dict[str, Any]] = None,
249
- is_latest: bool = True,
250
- last_modified: Optional[datetime] = None,
251
- version: str = "",
252
- etag: str = "",
253
- ) -> tuple[
254
- str,
255
- str,
256
- int,
257
- Optional[str],
258
- int,
259
- bool,
260
- datetime,
261
- str,
262
- str,
263
- int,
264
- ]:
265
- if location:
266
- location = json.dumps([location]) # type: ignore [assignment]
267
-
268
- last_modified = last_modified or datetime.now(timezone.utc)
269
-
270
- return ( # type: ignore [return-value]
271
- source,
272
- path,
273
- size,
274
- location,
275
- is_latest,
276
- last_modified,
277
- version,
278
- etag,
279
- )
280
-
281
- @staticmethod
282
- def extend(**columns):
283
- cols = {**DatasetRow.schema}
284
- cols.update(columns)
285
- return cols
286
-
287
-
288
228
  C = Column