datachain 0.6.9__tar.gz → 0.6.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (265) hide show
  1. {datachain-0.6.9/src/datachain.egg-info → datachain-0.6.11}/PKG-INFO +2 -2
  2. {datachain-0.6.9 → datachain-0.6.11}/mkdocs.yml +1 -1
  3. {datachain-0.6.9 → datachain-0.6.11}/pyproject.toml +1 -1
  4. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/catalog/catalog.py +15 -3
  5. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/sqlite.py +6 -2
  6. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/dc.py +53 -0
  7. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/listing.py +24 -7
  8. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/sqlite/types.py +8 -1
  9. datachain-0.6.11/src/datachain/toolkit/__init__.py +3 -0
  10. datachain-0.6.11/src/datachain/toolkit/split.py +67 -0
  11. {datachain-0.6.9 → datachain-0.6.11/src/datachain.egg-info}/PKG-INFO +2 -2
  12. {datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/SOURCES.txt +5 -1
  13. {datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/requires.txt +1 -1
  14. {datachain-0.6.9 → datachain-0.6.11}/tests/conftest.py +41 -1
  15. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_dataset_query.py +66 -0
  16. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_pull.py +33 -6
  17. datachain-0.6.11/tests/func/test_toolkit.py +42 -0
  18. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_datachain.py +42 -0
  19. datachain-0.6.11/tests/unit/sql/sqlite/test_types.py +19 -0
  20. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_listing.py +2 -1
  21. {datachain-0.6.9 → datachain-0.6.11}/.cruft.json +0 -0
  22. {datachain-0.6.9 → datachain-0.6.11}/.gitattributes +0 -0
  23. {datachain-0.6.9 → datachain-0.6.11}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  24. {datachain-0.6.9 → datachain-0.6.11}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  25. {datachain-0.6.9 → datachain-0.6.11}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  26. {datachain-0.6.9 → datachain-0.6.11}/.github/codecov.yaml +0 -0
  27. {datachain-0.6.9 → datachain-0.6.11}/.github/dependabot.yml +0 -0
  28. {datachain-0.6.9 → datachain-0.6.11}/.github/workflows/benchmarks.yml +0 -0
  29. {datachain-0.6.9 → datachain-0.6.11}/.github/workflows/release.yml +0 -0
  30. {datachain-0.6.9 → datachain-0.6.11}/.github/workflows/tests-studio.yml +0 -0
  31. {datachain-0.6.9 → datachain-0.6.11}/.github/workflows/tests.yml +0 -0
  32. {datachain-0.6.9 → datachain-0.6.11}/.github/workflows/update-template.yaml +0 -0
  33. {datachain-0.6.9 → datachain-0.6.11}/.gitignore +0 -0
  34. {datachain-0.6.9 → datachain-0.6.11}/.pre-commit-config.yaml +0 -0
  35. {datachain-0.6.9 → datachain-0.6.11}/CODE_OF_CONDUCT.rst +0 -0
  36. {datachain-0.6.9 → datachain-0.6.11}/CONTRIBUTING.rst +0 -0
  37. {datachain-0.6.9 → datachain-0.6.11}/LICENSE +0 -0
  38. {datachain-0.6.9 → datachain-0.6.11}/README.rst +0 -0
  39. {datachain-0.6.9 → datachain-0.6.11}/docs/assets/captioned_cartoons.png +0 -0
  40. {datachain-0.6.9 → datachain-0.6.11}/docs/assets/datachain-white.svg +0 -0
  41. {datachain-0.6.9 → datachain-0.6.11}/docs/assets/datachain.svg +0 -0
  42. {datachain-0.6.9 → datachain-0.6.11}/docs/index.md +0 -0
  43. {datachain-0.6.9 → datachain-0.6.11/docs}/overrides/main.html +0 -0
  44. {datachain-0.6.9 → datachain-0.6.11}/docs/references/datachain.md +0 -0
  45. {datachain-0.6.9 → datachain-0.6.11}/docs/references/datatype.md +0 -0
  46. {datachain-0.6.9 → datachain-0.6.11}/docs/references/file.md +0 -0
  47. {datachain-0.6.9 → datachain-0.6.11}/docs/references/index.md +0 -0
  48. {datachain-0.6.9 → datachain-0.6.11}/docs/references/sql.md +0 -0
  49. {datachain-0.6.9 → datachain-0.6.11}/docs/references/torch.md +0 -0
  50. {datachain-0.6.9 → datachain-0.6.11}/docs/references/udf.md +0 -0
  51. {datachain-0.6.9 → datachain-0.6.11}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  52. {datachain-0.6.9 → datachain-0.6.11}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  53. {datachain-0.6.9 → datachain-0.6.11}/examples/computer_vision/openimage-detect.py +0 -0
  54. {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/common_sql_functions.py +0 -0
  55. {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/json-csv-reader.py +0 -0
  56. {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/torch-loader.py +0 -0
  57. {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/udfs/parallel.py +0 -0
  58. {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/udfs/simple.py +0 -0
  59. {datachain-0.6.9 → datachain-0.6.11}/examples/get_started/udfs/stateful.py +0 -0
  60. {datachain-0.6.9 → datachain-0.6.11}/examples/llm_and_nlp/claude-query.py +0 -0
  61. {datachain-0.6.9 → datachain-0.6.11}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  62. {datachain-0.6.9 → datachain-0.6.11}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  63. {datachain-0.6.9 → datachain-0.6.11}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  64. {datachain-0.6.9 → datachain-0.6.11}/examples/multimodal/clip_inference.py +0 -0
  65. {datachain-0.6.9 → datachain-0.6.11}/examples/multimodal/hf_pipeline.py +0 -0
  66. {datachain-0.6.9 → datachain-0.6.11}/examples/multimodal/openai_image_desc_lib.py +0 -0
  67. {datachain-0.6.9 → datachain-0.6.11}/examples/multimodal/wds.py +0 -0
  68. {datachain-0.6.9 → datachain-0.6.11}/examples/multimodal/wds_filtered.py +0 -0
  69. {datachain-0.6.9 → datachain-0.6.11}/noxfile.py +0 -0
  70. {datachain-0.6.9 → datachain-0.6.11}/setup.cfg +0 -0
  71. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/__init__.py +0 -0
  72. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/__main__.py +0 -0
  73. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/asyn.py +0 -0
  74. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/cache.py +0 -0
  75. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/catalog/__init__.py +0 -0
  76. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/catalog/datasource.py +0 -0
  77. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/catalog/loader.py +0 -0
  78. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/cli.py +0 -0
  79. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/cli_utils.py +0 -0
  80. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/__init__.py +0 -0
  81. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/azure.py +0 -0
  82. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/fileslice.py +0 -0
  83. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/fsspec.py +0 -0
  84. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/gcs.py +0 -0
  85. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/hf.py +0 -0
  86. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/local.py +0 -0
  87. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/client/s3.py +0 -0
  88. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/config.py +0 -0
  89. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/__init__.py +0 -0
  90. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/db_engine.py +0 -0
  91. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/id_generator.py +0 -0
  92. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/job.py +0 -0
  93. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/metastore.py +0 -0
  94. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/schema.py +0 -0
  95. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/serializer.py +0 -0
  96. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/data_storage/warehouse.py +0 -0
  97. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/dataset.py +0 -0
  98. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/error.py +0 -0
  99. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/job.py +0 -0
  100. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/__init__.py +0 -0
  101. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/arrow.py +0 -0
  102. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/clip.py +0 -0
  103. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/__init__.py +0 -0
  104. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/flatten.py +0 -0
  105. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/python_to_sql.py +0 -0
  106. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/sql_to_python.py +0 -0
  107. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/unflatten.py +0 -0
  108. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  109. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/data_model.py +0 -0
  110. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/dataset_info.py +0 -0
  111. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/file.py +0 -0
  112. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/func/__init__.py +0 -0
  113. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/func/aggregate.py +0 -0
  114. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/func/func.py +0 -0
  115. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/hf.py +0 -0
  116. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/image.py +0 -0
  117. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/listing.py +0 -0
  118. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/listing_info.py +0 -0
  119. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/meta_formats.py +0 -0
  120. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/model_store.py +0 -0
  121. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/models/__init__.py +0 -0
  122. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/models/bbox.py +0 -0
  123. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/models/pose.py +0 -0
  124. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/models/yolo.py +0 -0
  125. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/pytorch.py +0 -0
  126. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/settings.py +0 -0
  127. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/signal_schema.py +0 -0
  128. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/tar.py +0 -0
  129. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/text.py +0 -0
  130. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/udf.py +0 -0
  131. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/udf_signature.py +0 -0
  132. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/utils.py +0 -0
  133. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/vfile.py +0 -0
  134. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/webdataset.py +0 -0
  135. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/lib/webdataset_laion.py +0 -0
  136. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/node.py +0 -0
  137. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/nodes_fetcher.py +0 -0
  138. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/nodes_thread_pool.py +0 -0
  139. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/progress.py +0 -0
  140. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/py.typed +0 -0
  141. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/__init__.py +0 -0
  142. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/batch.py +0 -0
  143. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/dataset.py +0 -0
  144. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/dispatch.py +0 -0
  145. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/metrics.py +0 -0
  146. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/params.py +0 -0
  147. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/queue.py +0 -0
  148. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/schema.py +0 -0
  149. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/query/session.py +0 -0
  150. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/remote/__init__.py +0 -0
  151. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/remote/studio.py +0 -0
  152. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/__init__.py +0 -0
  153. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/default/__init__.py +0 -0
  154. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/default/base.py +0 -0
  155. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/__init__.py +0 -0
  156. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/aggregate.py +0 -0
  157. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/array.py +0 -0
  158. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/conditional.py +0 -0
  159. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/path.py +0 -0
  160. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/random.py +0 -0
  161. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/functions/string.py +0 -0
  162. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/selectable.py +0 -0
  163. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/sqlite/__init__.py +0 -0
  164. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/sqlite/base.py +0 -0
  165. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/sqlite/vector.py +0 -0
  166. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/types.py +0 -0
  167. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/sql/utils.py +0 -0
  168. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/studio.py +0 -0
  169. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/telemetry.py +0 -0
  170. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/torch/__init__.py +0 -0
  171. {datachain-0.6.9 → datachain-0.6.11}/src/datachain/utils.py +0 -0
  172. {datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/dependency_links.txt +0 -0
  173. {datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/entry_points.txt +0 -0
  174. {datachain-0.6.9 → datachain-0.6.11}/src/datachain.egg-info/top_level.txt +0 -0
  175. {datachain-0.6.9 → datachain-0.6.11}/tests/__init__.py +0 -0
  176. {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/__init__.py +0 -0
  177. {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/conftest.py +0 -0
  178. {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  179. {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/datasets/.dvc/config +0 -0
  180. {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/datasets/.gitignore +0 -0
  181. {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  182. {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/test_datachain.py +0 -0
  183. {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/test_ls.py +0 -0
  184. {datachain-0.6.9 → datachain-0.6.11}/tests/benchmarks/test_version.py +0 -0
  185. {datachain-0.6.9 → datachain-0.6.11}/tests/data.py +0 -0
  186. {datachain-0.6.9 → datachain-0.6.11}/tests/examples/__init__.py +0 -0
  187. {datachain-0.6.9 → datachain-0.6.11}/tests/examples/test_examples.py +0 -0
  188. {datachain-0.6.9 → datachain-0.6.11}/tests/examples/test_wds_e2e.py +0 -0
  189. {datachain-0.6.9 → datachain-0.6.11}/tests/examples/wds_data.py +0 -0
  190. {datachain-0.6.9 → datachain-0.6.11}/tests/func/__init__.py +0 -0
  191. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_catalog.py +0 -0
  192. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_client.py +0 -0
  193. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_datachain.py +0 -0
  194. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_datasets.py +0 -0
  195. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_feature_pickling.py +0 -0
  196. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_listing.py +0 -0
  197. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_ls.py +0 -0
  198. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_meta_formats.py +0 -0
  199. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_metrics.py +0 -0
  200. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_pytorch.py +0 -0
  201. {datachain-0.6.9 → datachain-0.6.11}/tests/func/test_query.py +0 -0
  202. {datachain-0.6.9 → datachain-0.6.11}/tests/scripts/feature_class.py +0 -0
  203. {datachain-0.6.9 → datachain-0.6.11}/tests/scripts/feature_class_exception.py +0 -0
  204. {datachain-0.6.9 → datachain-0.6.11}/tests/scripts/feature_class_parallel.py +0 -0
  205. {datachain-0.6.9 → datachain-0.6.11}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  206. {datachain-0.6.9 → datachain-0.6.11}/tests/scripts/name_len_slow.py +0 -0
  207. {datachain-0.6.9 → datachain-0.6.11}/tests/test_atomicity.py +0 -0
  208. {datachain-0.6.9 → datachain-0.6.11}/tests/test_cli_e2e.py +0 -0
  209. {datachain-0.6.9 → datachain-0.6.11}/tests/test_cli_studio.py +0 -0
  210. {datachain-0.6.9 → datachain-0.6.11}/tests/test_query_e2e.py +0 -0
  211. {datachain-0.6.9 → datachain-0.6.11}/tests/test_telemetry.py +0 -0
  212. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/__init__.py +0 -0
  213. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/__init__.py +0 -0
  214. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/conftest.py +0 -0
  215. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_arrow.py +0 -0
  216. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_clip.py +0 -0
  217. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  218. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_datachain_merge.py +0 -0
  219. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_feature.py +0 -0
  220. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_feature_utils.py +0 -0
  221. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_file.py +0 -0
  222. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_hf.py +0 -0
  223. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_image.py +0 -0
  224. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_listing_info.py +0 -0
  225. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_models.py +0 -0
  226. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_schema.py +0 -0
  227. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_signal_schema.py +0 -0
  228. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_sql_to_python.py +0 -0
  229. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_text.py +0 -0
  230. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_udf_signature.py +0 -0
  231. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_utils.py +0 -0
  232. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/lib/test_webdataset.py +0 -0
  233. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/__init__.py +0 -0
  234. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/sqlite/__init__.py +0 -0
  235. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/sqlite/test_utils.py +0 -0
  236. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_array.py +0 -0
  237. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_conditional.py +0 -0
  238. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_path.py +0 -0
  239. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_random.py +0 -0
  240. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_selectable.py +0 -0
  241. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/sql/test_string.py +0 -0
  242. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_asyn.py +0 -0
  243. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_cache.py +0 -0
  244. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_catalog.py +0 -0
  245. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_catalog_loader.py +0 -0
  246. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_cli_parsing.py +0 -0
  247. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_client.py +0 -0
  248. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_client_s3.py +0 -0
  249. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_config.py +0 -0
  250. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_data_storage.py +0 -0
  251. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_database_engine.py +0 -0
  252. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_dataset.py +0 -0
  253. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_dispatch.py +0 -0
  254. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_fileslice.py +0 -0
  255. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_id_generator.py +0 -0
  256. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_metastore.py +0 -0
  257. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_module_exports.py +0 -0
  258. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_query.py +0 -0
  259. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_query_metrics.py +0 -0
  260. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_query_params.py +0 -0
  261. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_serializer.py +0 -0
  262. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_session.py +0 -0
  263. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_utils.py +0 -0
  264. {datachain-0.6.9 → datachain-0.6.11}/tests/unit/test_warehouse.py +0 -0
  265. {datachain-0.6.9 → datachain-0.6.11}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.9
3
+ Version: 0.6.11
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -71,7 +71,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
71
71
  Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
72
72
  Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
73
73
  Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
74
- Requires-Dist: pytest-servers[all]>=0.5.7; extra == "tests"
74
+ Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
75
75
  Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
76
76
  Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
77
77
  Requires-Dist: virtualenv; extra == "tests"
@@ -15,7 +15,7 @@ validation:
15
15
 
16
16
  theme:
17
17
  name: material
18
- custom_dir: overrides
18
+ custom_dir: docs/overrides
19
19
  logo: assets/datachain-white.svg
20
20
  favicon: assets/datachain.svg
21
21
  icon:
@@ -82,7 +82,7 @@ tests = [
82
82
  "pytest-sugar>=0.9.6",
83
83
  "pytest-cov>=4.1.0",
84
84
  "pytest-mock>=3.12.0",
85
- "pytest-servers[all]>=0.5.7",
85
+ "pytest-servers[all]>=0.5.8",
86
86
  "pytest-benchmark[histogram]",
87
87
  "pytest-xdist>=3.3.1",
88
88
  "virtualenv",
@@ -603,9 +603,10 @@ class Catalog:
603
603
  )
604
604
 
605
605
  lst = Listing(
606
+ self.metastore.clone(),
606
607
  self.warehouse.clone(),
607
608
  Client.get_client(list_uri, self.cache, **self.client_config),
608
- self.get_dataset(list_ds_name),
609
+ dataset_name=list_ds_name,
609
610
  object_name=object_name,
610
611
  )
611
612
 
@@ -698,9 +699,13 @@ class Catalog:
698
699
 
699
700
  client = self.get_client(source, **client_config)
700
701
  uri = client.uri
701
- st = self.warehouse.clone()
702
702
  dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
703
- listing = Listing(st, client, self.get_dataset(dataset_name))
703
+ listing = Listing(
704
+ self.metastore.clone(),
705
+ self.warehouse.clone(),
706
+ client,
707
+ dataset_name=dataset_name,
708
+ )
704
709
  rows = DatasetQuery(
705
710
  name=dataset.name, version=ds_version, catalog=self
706
711
  ).to_db_records()
@@ -1354,6 +1359,13 @@ class Catalog:
1354
1359
  # we will create new one if it doesn't exist
1355
1360
  pass
1356
1361
 
1362
+ if dataset and version and dataset.has_version(version):
1363
+ """No need to communicate with Studio at all"""
1364
+ dataset_uri = create_dataset_uri(remote_dataset_name, version)
1365
+ print(f"Local copy of dataset {dataset_uri} already present")
1366
+ _instantiate_dataset()
1367
+ return
1368
+
1357
1369
  remote_dataset = self.get_remote_dataset(remote_dataset_name)
1358
1370
  # if version is not specified in uri, take the latest one
1359
1371
  if not version:
@@ -747,8 +747,12 @@ class SQLiteWarehouse(AbstractWarehouse):
747
747
 
748
748
  ids = self.db.execute(select_ids).fetchall()
749
749
 
750
- select_q = query.with_only_columns(
751
- *[c for c in query.selected_columns if c.name != "sys__id"]
750
+ select_q = (
751
+ query.with_only_columns(
752
+ *[c for c in query.selected_columns if c.name != "sys__id"]
753
+ )
754
+ .offset(None)
755
+ .limit(None)
752
756
  )
753
757
 
754
758
  for batch in batched_it(ids, 10_000):
@@ -642,6 +642,59 @@ class DataChain:
642
642
  }
643
643
  return chain.gen(**signal_dict) # type: ignore[misc, arg-type]
644
644
 
645
+ def explode(
646
+ self,
647
+ col: str,
648
+ model_name: Optional[str] = None,
649
+ object_name: Optional[str] = None,
650
+ ) -> "DataChain":
651
+ """Explodes a column containing JSON objects (dict or str DataChain type) into
652
+ individual columns based on the schema of the JSON. Schema is inferred from
653
+ the first row of the column.
654
+
655
+ Args:
656
+ col: the name of the column containing JSON to be exploded.
657
+ model_name: optional generated model name. By default generates the name
658
+ automatically.
659
+ object_name: optional generated object column name. By default generates the
660
+ name automatically.
661
+
662
+ Returns:
663
+ DataChain: A new DataChain instance with the new set of columns.
664
+ """
665
+ import json
666
+
667
+ import pyarrow as pa
668
+
669
+ from datachain.lib.arrow import schema_to_output
670
+
671
+ json_value = next(self.limit(1).collect(col))
672
+ json_dict = (
673
+ json.loads(json_value) if isinstance(json_value, str) else json_value
674
+ )
675
+
676
+ if not isinstance(json_dict, dict):
677
+ raise TypeError(f"Column {col} should be a string or dict type with JSON")
678
+
679
+ schema = pa.Table.from_pylist([json_dict]).schema
680
+ output = schema_to_output(schema, None)
681
+
682
+ if not model_name:
683
+ model_name = f"{col.title()}ExplodedModel"
684
+
685
+ model = dict_to_data_model(model_name, output)
686
+
687
+ def json_to_model(json_value: Union[str, dict]):
688
+ json_dict = (
689
+ json.loads(json_value) if isinstance(json_value, str) else json_value
690
+ )
691
+ return model.model_validate(json_dict)
692
+
693
+ if not object_name:
694
+ object_name = f"{col}_expl"
695
+
696
+ return self.map(json_to_model, params=col, output={object_name: model})
697
+
645
698
  @classmethod
646
699
  def datasets(
647
700
  cls,
@@ -1,6 +1,7 @@
1
1
  import glob
2
2
  import os
3
3
  from collections.abc import Iterable, Iterator
4
+ from functools import cached_property
4
5
  from itertools import zip_longest
5
6
  from typing import TYPE_CHECKING, Optional
6
7
 
@@ -15,28 +16,34 @@ from datachain.utils import suffix_to_number
15
16
  if TYPE_CHECKING:
16
17
  from datachain.catalog.datasource import DataSource
17
18
  from datachain.client import Client
18
- from datachain.data_storage import AbstractWarehouse
19
+ from datachain.data_storage import AbstractMetastore, AbstractWarehouse
19
20
  from datachain.dataset import DatasetRecord
20
21
 
21
22
 
22
23
  class Listing:
23
24
  def __init__(
24
25
  self,
26
+ metastore: "AbstractMetastore",
25
27
  warehouse: "AbstractWarehouse",
26
28
  client: "Client",
27
- dataset: Optional["DatasetRecord"],
29
+ dataset_name: Optional["str"] = None,
30
+ dataset_version: Optional[int] = None,
28
31
  object_name: str = "file",
29
32
  ):
33
+ self.metastore = metastore
30
34
  self.warehouse = warehouse
31
35
  self.client = client
32
- self.dataset = dataset # dataset representing bucket listing
36
+ self.dataset_name = dataset_name # dataset representing bucket listing
37
+ self.dataset_version = dataset_version # dataset representing bucket listing
33
38
  self.object_name = object_name
34
39
 
35
40
  def clone(self) -> "Listing":
36
41
  return self.__class__(
42
+ self.metastore.clone(),
37
43
  self.warehouse.clone(),
38
44
  self.client,
39
- self.dataset,
45
+ self.dataset_name,
46
+ self.dataset_version,
40
47
  self.object_name,
41
48
  )
42
49
 
@@ -53,12 +60,22 @@ class Listing:
53
60
  def uri(self):
54
61
  from datachain.lib.listing import listing_uri_from_name
55
62
 
56
- return listing_uri_from_name(self.dataset.name)
63
+ assert self.dataset_name
57
64
 
58
- @property
65
+ return listing_uri_from_name(self.dataset_name)
66
+
67
+ @cached_property
68
+ def dataset(self) -> "DatasetRecord":
69
+ assert self.dataset_name
70
+ return self.metastore.get_dataset(self.dataset_name)
71
+
72
+ @cached_property
59
73
  def dataset_rows(self):
74
+ dataset = self.dataset
60
75
  return self.warehouse.dataset_rows(
61
- self.dataset, self.dataset.latest_version, object_name=self.object_name
76
+ dataset,
77
+ self.dataset_version or dataset.latest_version,
78
+ object_name=self.object_name,
62
79
  )
63
80
 
64
81
  def expand_path(self, path, use_glob=True) -> list[Node]:
@@ -36,7 +36,14 @@ def convert_array(arr):
36
36
 
37
37
 
38
38
  def adapt_np_array(arr):
39
- return orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY).decode("utf-8")
39
+ def _json_serialize(obj):
40
+ if isinstance(obj, np.ndarray):
41
+ return obj.tolist()
42
+ return obj
43
+
44
+ return orjson.dumps(
45
+ arr, option=orjson.OPT_SERIALIZE_NUMPY, default=_json_serialize
46
+ ).decode("utf-8")
40
47
 
41
48
 
42
49
  def adapt_np_generic(val):
@@ -0,0 +1,3 @@
1
+ from .split import train_test_split
2
+
3
+ __all__ = ["train_test_split"]
@@ -0,0 +1,67 @@
1
+ from datachain import C, DataChain
2
+
3
+
4
+ def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
5
+ """
6
+ Splits a DataChain into multiple subsets based on the provided weights.
7
+
8
+ This function partitions the rows or items of a DataChain into disjoint subsets,
9
+ ensuring that the relative sizes of the subsets correspond to the given weights.
10
+ It is particularly useful for creating training, validation, and test datasets.
11
+
12
+ Args:
13
+ dc (DataChain):
14
+ The DataChain instance to split.
15
+ weights (list[float]):
16
+ A list of weights indicating the relative proportions of the splits.
17
+ The weights do not need to sum to 1; they will be normalized internally.
18
+ For example:
19
+ - `[0.7, 0.3]` corresponds to a 70/30 split;
20
+ - `[2, 1, 1]` corresponds to a 50/25/25 split.
21
+
22
+ Returns:
23
+ list[DataChain]:
24
+ A list of DataChain instances, one for each weight in the weights list.
25
+
26
+ Examples:
27
+ Train-test split:
28
+ ```python
29
+ from datachain import DataChain
30
+ from datachain.toolkit import train_test_split
31
+
32
+ # Load a DataChain from a storage source (e.g., S3 bucket)
33
+ dc = DataChain.from_storage("s3://bucket/dir/")
34
+
35
+ # Perform a 70/30 train-test split
36
+ train, test = train_test_split(dc, [0.7, 0.3])
37
+
38
+ # Save the resulting splits
39
+ train.save("dataset_train")
40
+ test.save("dataset_test")
41
+ ```
42
+
43
+ Train-test-validation split:
44
+ ```python
45
+ train, test, val = train_test_split(dc, [0.7, 0.2, 0.1])
46
+ train.save("dataset_train")
47
+ test.save("dataset_test")
48
+ val.save("dataset_val")
49
+ ```
50
+
51
+ Note:
52
+ The splits are random but deterministic, based on Dataset `sys__rand` field.
53
+ """
54
+ if len(weights) < 2:
55
+ raise ValueError("Weights should have at least two elements")
56
+ if any(weight < 0 for weight in weights):
57
+ raise ValueError("Weights should be non-negative")
58
+
59
+ weights_normalized = [weight / sum(weights) for weight in weights]
60
+
61
+ return [
62
+ dc.filter(
63
+ C("sys__rand") % 1000 >= round(sum(weights_normalized[:index]) * 1000),
64
+ C("sys__rand") % 1000 < round(sum(weights_normalized[: index + 1]) * 1000),
65
+ )
66
+ for index, _ in enumerate(weights_normalized)
67
+ ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.9
3
+ Version: 0.6.11
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -71,7 +71,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
71
71
  Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
72
72
  Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
73
73
  Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
74
- Requires-Dist: pytest-servers[all]>=0.5.7; extra == "tests"
74
+ Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
75
75
  Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
76
76
  Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
77
77
  Requires-Dist: virtualenv; extra == "tests"
@@ -23,6 +23,7 @@ docs/index.md
23
23
  docs/assets/captioned_cartoons.png
24
24
  docs/assets/datachain-white.svg
25
25
  docs/assets/datachain.svg
26
+ docs/overrides/main.html
26
27
  docs/references/datachain.md
27
28
  docs/references/datatype.md
28
29
  docs/references/file.md
@@ -48,7 +49,6 @@ examples/multimodal/hf_pipeline.py
48
49
  examples/multimodal/openai_image_desc_lib.py
49
50
  examples/multimodal/wds.py
50
51
  examples/multimodal/wds_filtered.py
51
- overrides/main.html
52
52
  src/datachain/__init__.py
53
53
  src/datachain/__main__.py
54
54
  src/datachain/asyn.py
@@ -160,6 +160,8 @@ src/datachain/sql/sqlite/__init__.py
160
160
  src/datachain/sql/sqlite/base.py
161
161
  src/datachain/sql/sqlite/types.py
162
162
  src/datachain/sql/sqlite/vector.py
163
+ src/datachain/toolkit/__init__.py
164
+ src/datachain/toolkit/split.py
163
165
  src/datachain/torch/__init__.py
164
166
  tests/__init__.py
165
167
  tests/conftest.py
@@ -197,6 +199,7 @@ tests/func/test_metrics.py
197
199
  tests/func/test_pull.py
198
200
  tests/func/test_pytorch.py
199
201
  tests/func/test_query.py
202
+ tests/func/test_toolkit.py
200
203
  tests/scripts/feature_class.py
201
204
  tests/scripts/feature_class_exception.py
202
205
  tests/scripts/feature_class_parallel.py
@@ -256,4 +259,5 @@ tests/unit/sql/test_random.py
256
259
  tests/unit/sql/test_selectable.py
257
260
  tests/unit/sql/test_string.py
258
261
  tests/unit/sql/sqlite/__init__.py
262
+ tests/unit/sql/sqlite/test_types.py
259
263
  tests/unit/sql/sqlite/test_utils.py
@@ -70,7 +70,7 @@ pytest<9,>=8
70
70
  pytest-sugar>=0.9.6
71
71
  pytest-cov>=4.1.0
72
72
  pytest-mock>=3.12.0
73
- pytest-servers[all]>=0.5.7
73
+ pytest-servers[all]>=0.5.8
74
74
  pytest-benchmark[histogram]
75
75
  pytest-xdist>=3.3.1
76
76
  virtualenv
@@ -22,7 +22,7 @@ from datachain.data_storage.sqlite import (
22
22
  SQLiteWarehouse,
23
23
  )
24
24
  from datachain.dataset import DatasetRecord
25
- from datachain.lib.dc import DataChain
25
+ from datachain.lib.dc import DataChain, Sys
26
26
  from datachain.query.session import Session
27
27
  from datachain.utils import (
28
28
  ENV_DATACHAIN_GLOBAL_CONFIG_DIR,
@@ -701,3 +701,43 @@ def studio_datasets(requests_mock):
701
701
  ]
702
702
 
703
703
  requests_mock.post(f"{STUDIO_URL}/api/datachain/ls-datasets", json=datasets)
704
+
705
+
706
@pytest.fixture
def not_random_ds(test_session):
    """Build a ten-row chain whose ``sys__rand`` values are evenly spaced.

    Row ``n`` (its ``sys__id``, counted from 1) receives
    ``sys__rand = 100 * n - 50`` -- that is 50, 150, ..., 950 -- and the
    n-th entry of the Fibonacci sequence below in its ``fib`` column.

    Args:
        test_session: Session the records are created in.

    Returns:
        The chain produced by ``DataChain.from_records`` for those rows.
    """
    fib_numbers = (0, 1, 1, 2, 3, 5, 8, 13, 21, 34)
    records = [
        {"sys__id": row_id, "sys__rand": 100 * row_id - 50, "fib": fib}
        for row_id, fib in enumerate(fib_numbers, start=1)
    ]
    return DataChain.from_records(
        records,
        session=test_session,
        schema={"sys": Sys, "fib": int},
    )
724
+
725
+
726
@pytest.fixture
def pseudo_random_ds(test_session):
    """Build a ten-row chain with fixed, irregular ``sys__rand`` values.

    The ``sys__rand`` numbers are hard-coded literals, so every run of the
    fixture yields exactly the same rows.

    Args:
        test_session: Session the records are created in.

    Returns:
        The chain produced by ``DataChain.from_records`` for those rows.
    """
    # One (sys__rand, fib) pair per row; sys__id is the 1-based position.
    rand_and_fib = (
        (1344339883, 0),
        (3901153096, 1),
        (4255991360, 1),
        (2526403609, 2),
        (1871733386, 3),
        (9380910850, 5),
        (2770679740, 8),
        (2538886575, 13),
        (3969542617, 21),
        (7541790992, 34),
    )
    records = [
        {"sys__id": row_id, "sys__rand": rand, "fib": fib}
        for row_id, (rand, fib) in enumerate(rand_and_fib, start=1)
    ]
    return DataChain.from_records(
        records,
        session=test_session,
        schema={"sys": Sys, "fib": int},
    )
@@ -459,6 +459,72 @@ def test_order_by_limit(cloud_test_catalog, save, animal_dataset):
459
459
  ]
460
460
 
461
461
 
462
@pytest.mark.parametrize("save", [True, False])
def test_limit(cloud_test_catalog, save, animal_dataset):
    """Check that ``limit(2)`` on a path-ordered query returns two rows.

    Runs in two modes: reading the rows straight from the query, or saving
    the query as a dataset first and reading that dataset back.
    """
    catalog = cloud_test_catalog.catalog
    ordered = DatasetQuery(animal_dataset.name, catalog=catalog).order_by(
        C("file.path")
    )
    limited = ordered.limit(2)

    if not save:
        rows = limited.db_results()
    else:
        saved_name = "animals_cats"
        limited.save(saved_name)
        rows = DatasetQuery(name=saved_name, catalog=catalog).db_results()
        # The record of the freshly saved dataset must report COMPLETE.
        assert catalog.get_dataset(saved_name).status == DatasetStatus.COMPLETE

    assert len(rows) == 2
    # Column 3 of each row is treated as the file path (presumably
    # ``file.path``; confirm against the dataset's column layout).
    base_names = [posixpath.basename(row[3]) for row in rows]
    assert base_names == ["cat1", "cat2"]
481
+
482
+
483
@pytest.mark.parametrize("save", [True, False])
def test_offset_limit(cloud_test_catalog, save, animal_dataset):
    """Check ``offset(3).limit(2)`` on a path-ordered query.

    Skipping three rows and taking two must yield ``dog1`` and ``dog2``,
    whether the rows are read from the query itself or from a dataset saved
    from it.
    """
    catalog = cloud_test_catalog.catalog
    source = DatasetQuery(animal_dataset.name, catalog=catalog)
    window = source.order_by(C("file.path")).offset(3).limit(2)

    if not save:
        rows = window.db_results()
    else:
        saved_name = "animals_cats"
        window.save(saved_name)
        rows = DatasetQuery(name=saved_name, catalog=catalog).db_results()
        saved_record = catalog.get_dataset(saved_name)
        # The record of the freshly saved dataset must report COMPLETE.
        assert saved_record.status == DatasetStatus.COMPLETE

    assert len(rows) == 2
    # Column 3 of each row is treated as the file path (presumably
    # ``file.path``; confirm against the dataset's column layout).
    base_names = [posixpath.basename(row[3]) for row in rows]
    assert base_names == ["dog1", "dog2"]
503
+
504
+
505
@pytest.mark.parametrize("save", [True, False])
def test_mutate_offset_limit(cloud_test_catalog, save, animal_dataset):
    """Check ``offset``/``limit`` applied after a ``mutate`` step.

    The query orders by path, adds a ``size10x`` column computed as
    ``file.size * 10``, then skips three rows and takes two. The result must
    be ``dog1`` and ``dog2`` both when read directly and when read back from
    a dataset saved from the query.
    """
    catalog = cloud_test_catalog.catalog
    ordered = DatasetQuery(animal_dataset.name, catalog=catalog).order_by(
        C("file.path")
    )
    with_extra_column = ordered.mutate(size10x=C("file.size") * 10)
    window = with_extra_column.offset(3).limit(2)

    if not save:
        rows = window.db_results()
    else:
        saved_name = "animals_cats"
        window.save(saved_name)
        rows = DatasetQuery(name=saved_name, catalog=catalog).db_results()
        # The record of the freshly saved dataset must report COMPLETE.
        assert catalog.get_dataset(saved_name).status == DatasetStatus.COMPLETE

    assert len(rows) == 2
    # Column 3 of each row is treated as the file path (presumably
    # ``file.path``; confirm against the dataset's column layout).
    base_names = [posixpath.basename(row[3]) for row in rows]
    assert base_names == ["dog1", "dog2"]
526
+
527
+
462
528
  @pytest.mark.parametrize(
463
529
  "cloud_type,version_aware",
464
530
  [("s3", True)],
@@ -6,12 +6,13 @@ import lz4.frame
6
6
  import pandas as pd
7
7
  import pytest
8
8
 
9
+ from datachain.client.fsspec import Client
9
10
  from datachain.config import Config, ConfigLevel
10
11
  from datachain.dataset import DatasetStatus
11
12
  from datachain.error import DataChainError
12
13
  from datachain.utils import STUDIO_URL, JSONSerialize
13
14
  from tests.data import ENTRIES
14
- from tests.utils import assert_row_names, skip_if_not_sqlite
15
+ from tests.utils import assert_row_names, skip_if_not_sqlite, tree_from_path
15
16
 
16
17
  DATASET_UUID = "20f5a2f1-fc9a-4e36-8b91-5a530f289451"
17
18
 
@@ -40,10 +41,11 @@ def dog_entries():
40
41
 
41
42
 
42
43
  @pytest.fixture
43
- def dog_entries_parquet_lz4(dog_entries) -> bytes:
44
+ def dog_entries_parquet_lz4(dog_entries, cloud_test_catalog) -> bytes:
44
45
  """
45
46
  Returns dogs entries in lz4 compressed parquet format
46
47
  """
48
+ src_uri = cloud_test_catalog.src_uri
47
49
 
48
50
  def _adapt_row(row):
49
51
  """
@@ -61,7 +63,7 @@ def dog_entries_parquet_lz4(dog_entries) -> bytes:
61
63
  adapted["sys__id"] = 1
62
64
  adapted["sys__rand"] = 1
63
65
  adapted["file__location"] = ""
64
- adapted["file__source"] = "s3://dogs"
66
+ adapted["file__source"] = src_uri
65
67
  return adapted
66
68
 
67
69
  dog_entries = [_adapt_row(e) for e in dog_entries]
@@ -141,6 +143,7 @@ def remote_dataset(remote_dataset_version, schema):
141
143
 
142
144
  @pytest.mark.parametrize("cloud_type, version_aware", [("s3", False)], indirect=True)
143
145
  @pytest.mark.parametrize("dataset_uri", ["ds://dogs@v1", "ds://dogs"])
146
+ @pytest.mark.parametrize("instantiate", [True, False])
144
147
  @skip_if_not_sqlite
145
148
  def test_pull_dataset_success(
146
149
  requests_mock,
@@ -148,7 +151,10 @@ def test_pull_dataset_success(
148
151
  remote_dataset,
149
152
  dog_entries_parquet_lz4,
150
153
  dataset_uri,
154
+ instantiate,
151
155
  ):
156
+ src_uri = cloud_test_catalog.src_uri
157
+ working_dir = cloud_test_catalog.working_dir
152
158
  data_url = (
153
159
  "https://studio-blobvault.s3.amazonaws.com/datachain_ds_export_1_0.parquet.lz4"
154
160
  )
@@ -165,9 +171,16 @@ def test_pull_dataset_success(
165
171
  requests_mock.get(data_url, content=dog_entries_parquet_lz4)
166
172
  catalog = cloud_test_catalog.catalog
167
173
 
168
- catalog.pull_dataset(dataset_uri, no_cp=True)
169
- # trying to pull multiple times as it should work
170
- catalog.pull_dataset(dataset_uri, no_cp=True)
174
+ dest = None
175
+
176
+ if instantiate:
177
+ dest = working_dir / "data"
178
+ dest.mkdir()
179
+ catalog.pull_dataset(dataset_uri, output=str(dest), no_cp=False)
180
+ else:
181
+ # trying to pull multiple times since that should work as well
182
+ catalog.pull_dataset(dataset_uri, no_cp=True)
183
+ catalog.pull_dataset(dataset_uri, no_cp=True)
171
184
 
172
185
  dataset = catalog.get_dataset("dogs")
173
186
  assert dataset.versions_values == [1]
@@ -196,6 +209,20 @@ def test_pull_dataset_success(
196
209
  },
197
210
  )
198
211
 
212
+ client = Client.get_client(src_uri, None)
213
+
214
+ if instantiate:
215
+ assert tree_from_path(dest) == {
216
+ f"{client.name}": {
217
+ "dogs": {
218
+ "dog1": "woof",
219
+ "dog2": "arf",
220
+ "dog3": "bark",
221
+ "others": {"dog4": "ruff"},
222
+ }
223
+ }
224
+ }
225
+
199
226
 
200
227
  @pytest.mark.parametrize("cloud_type, version_aware", [("s3", False)], indirect=True)
201
228
  @skip_if_not_sqlite
@@ -0,0 +1,42 @@
1
+ import pytest
2
+
3
+ from datachain.toolkit import train_test_split
4
+
5
+
6
@pytest.mark.parametrize(
    "weights,expected",
    [
        ([1, 1], [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]),
        ([4, 1], [[1, 2, 3, 4, 5, 6, 7, 8], [9, 10]]),
        ([0.7, 0.2, 0.1], [[1, 2, 3, 4, 5, 6, 7], [8, 9], [10]]),
    ],
)
def test_train_test_split_not_random(not_random_ds, weights, expected):
    """Split the ``not_random_ds`` fixture and compare ``sys.id`` per split.

    ``expected`` holds, for each weight, the list of ``sys.id`` values the
    corresponding split must contain, in the order they are collected.
    """
    splits = train_test_split(not_random_ds, weights)
    assert len(splits) == len(expected)

    for split, expected_ids in zip(splits, expected):
        assert list(split.collect("sys.id")) == expected_ids
20
+
21
+
22
@pytest.mark.parametrize(
    "weights,expected",
    [
        ([1, 1], [[2, 3, 5], [1, 4, 6, 7, 8, 9, 10]]),
        ([4, 1], [[2, 3, 4, 5, 7, 8, 9], [1, 6, 10]]),
        ([0.7, 0.2, 0.1], [[2, 3, 4, 5, 8, 9], [1, 6, 7], [10]]),
    ],
)
def test_train_test_split_random(pseudo_random_ds, weights, expected):
    """Split the ``pseudo_random_ds`` fixture and compare ``sys.id`` per split.

    The fixture's ``sys__rand`` values are fixed literals, so the membership
    of every split is known in advance and listed in ``expected``.
    """
    splits = train_test_split(pseudo_random_ds, weights)
    assert len(splits) == len(expected)

    for split, expected_ids in zip(splits, expected):
        assert list(split.collect("sys.id")) == expected_ids
36
+
37
+
38
def test_train_test_split_errors(not_random_ds):
    """Verify that invalid weight lists raise ``ValueError`` with a clear message.

    Covers a weight list with fewer than two entries and one containing a
    negative weight.
    """
    invalid_cases = (
        ([0.5], "Weights should have at least two elements"),
        ([-1, 1], "Weights should be non-negative"),
    )
    for bad_weights, message in invalid_cases:
        with pytest.raises(ValueError, match=message):
            train_test_split(not_random_ds, bad_weights)