datachain 0.7.11__tar.gz → 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (291)
  1. {datachain-0.7.11 → datachain-0.8.1}/.github/workflows/benchmarks.yml +1 -1
  2. {datachain-0.7.11 → datachain-0.8.1}/.github/workflows/release.yml +1 -1
  3. {datachain-0.7.11 → datachain-0.8.1}/.github/workflows/tests-studio.yml +1 -1
  4. {datachain-0.7.11 → datachain-0.8.1}/.github/workflows/tests.yml +3 -3
  5. {datachain-0.7.11 → datachain-0.8.1}/.pre-commit-config.yaml +1 -1
  6. {datachain-0.7.11/src/datachain.egg-info → datachain-0.8.1}/PKG-INFO +4 -3
  7. {datachain-0.7.11 → datachain-0.8.1}/docs/quick-start.md +4 -2
  8. datachain-0.8.1/examples/get_started/json-csv-reader.py +82 -0
  9. {datachain-0.7.11 → datachain-0.8.1}/pyproject.toml +4 -3
  10. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/catalog/catalog.py +56 -45
  11. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/cli.py +25 -3
  12. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/gcs.py +9 -0
  13. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/sqlite.py +20 -6
  14. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/warehouse.py +0 -1
  15. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/arrow.py +82 -58
  16. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/dc.py +167 -166
  17. datachain-0.8.1/src/datachain/lib/diff.py +197 -0
  18. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/file.py +3 -1
  19. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/listing.py +44 -0
  20. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/meta_formats.py +38 -42
  21. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/udf.py +0 -1
  22. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/batch.py +32 -6
  23. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/dataset.py +18 -17
  24. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/dispatch.py +125 -125
  25. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/session.py +8 -5
  26. datachain-0.8.1/src/datachain/query/udf.py +20 -0
  27. datachain-0.8.1/src/datachain/query/utils.py +42 -0
  28. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/remote/studio.py +53 -1
  29. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/studio.py +47 -2
  30. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/utils.py +1 -1
  31. {datachain-0.7.11 → datachain-0.8.1/src/datachain.egg-info}/PKG-INFO +4 -3
  32. {datachain-0.7.11 → datachain-0.8.1}/src/datachain.egg-info/SOURCES.txt +6 -0
  33. {datachain-0.7.11 → datachain-0.8.1}/src/datachain.egg-info/requires.txt +3 -2
  34. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_catalog.py +6 -2
  35. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_datachain.py +1 -1
  36. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_meta_formats.py +4 -4
  37. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_pull.py +18 -12
  38. datachain-0.8.1/tests/func/test_session.py +25 -0
  39. {datachain-0.7.11 → datachain-0.8.1}/tests/test_cli_studio.py +52 -1
  40. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_arrow.py +26 -0
  41. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_datachain.py +3 -3
  42. datachain-0.8.1/tests/unit/lib/test_diff.py +498 -0
  43. datachain-0.8.1/tests/unit/test_client_gcs.py +17 -0
  44. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_listing.py +29 -2
  45. datachain-0.7.11/examples/get_started/json-csv-reader.py +0 -101
  46. {datachain-0.7.11 → datachain-0.8.1}/.cruft.json +0 -0
  47. {datachain-0.7.11 → datachain-0.8.1}/.gitattributes +0 -0
  48. {datachain-0.7.11 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  49. {datachain-0.7.11 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  50. {datachain-0.7.11 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  51. {datachain-0.7.11 → datachain-0.8.1}/.github/codecov.yaml +0 -0
  52. {datachain-0.7.11 → datachain-0.8.1}/.github/dependabot.yml +0 -0
  53. {datachain-0.7.11 → datachain-0.8.1}/.github/workflows/update-template.yaml +0 -0
  54. {datachain-0.7.11 → datachain-0.8.1}/.gitignore +0 -0
  55. {datachain-0.7.11 → datachain-0.8.1}/CODE_OF_CONDUCT.rst +0 -0
  56. {datachain-0.7.11 → datachain-0.8.1}/LICENSE +0 -0
  57. {datachain-0.7.11 → datachain-0.8.1}/README.rst +0 -0
  58. {datachain-0.7.11 → datachain-0.8.1}/docs/assets/captioned_cartoons.png +0 -0
  59. {datachain-0.7.11 → datachain-0.8.1}/docs/assets/datachain-white.svg +0 -0
  60. {datachain-0.7.11 → datachain-0.8.1}/docs/assets/datachain.svg +0 -0
  61. {datachain-0.7.11 → datachain-0.8.1}/docs/contributing.md +0 -0
  62. {datachain-0.7.11 → datachain-0.8.1}/docs/css/github-permalink-style.css +0 -0
  63. {datachain-0.7.11 → datachain-0.8.1}/docs/examples.md +0 -0
  64. {datachain-0.7.11 → datachain-0.8.1}/docs/index.md +0 -0
  65. {datachain-0.7.11 → datachain-0.8.1}/docs/overrides/main.html +0 -0
  66. {datachain-0.7.11 → datachain-0.8.1}/docs/references/datachain.md +0 -0
  67. {datachain-0.7.11 → datachain-0.8.1}/docs/references/datatype.md +0 -0
  68. {datachain-0.7.11 → datachain-0.8.1}/docs/references/file.md +0 -0
  69. {datachain-0.7.11 → datachain-0.8.1}/docs/references/index.md +0 -0
  70. {datachain-0.7.11 → datachain-0.8.1}/docs/references/sql.md +0 -0
  71. {datachain-0.7.11 → datachain-0.8.1}/docs/references/torch.md +0 -0
  72. {datachain-0.7.11 → datachain-0.8.1}/docs/references/udf.md +0 -0
  73. {datachain-0.7.11 → datachain-0.8.1}/docs/tutorials.md +0 -0
  74. {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  75. {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  76. {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/openimage-detect.py +0 -0
  77. {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
  78. {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/ultralytics-pose.py +0 -0
  79. {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/ultralytics-segment.py +0 -0
  80. {datachain-0.7.11 → datachain-0.8.1}/examples/get_started/common_sql_functions.py +0 -0
  81. {datachain-0.7.11 → datachain-0.8.1}/examples/get_started/torch-loader.py +0 -0
  82. {datachain-0.7.11 → datachain-0.8.1}/examples/get_started/udfs/parallel.py +0 -0
  83. {datachain-0.7.11 → datachain-0.8.1}/examples/get_started/udfs/simple.py +0 -0
  84. {datachain-0.7.11 → datachain-0.8.1}/examples/get_started/udfs/stateful.py +0 -0
  85. {datachain-0.7.11 → datachain-0.8.1}/examples/llm_and_nlp/claude-query.py +0 -0
  86. {datachain-0.7.11 → datachain-0.8.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  87. {datachain-0.7.11 → datachain-0.8.1}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  88. {datachain-0.7.11 → datachain-0.8.1}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  89. {datachain-0.7.11 → datachain-0.8.1}/examples/multimodal/clip_inference.py +0 -0
  90. {datachain-0.7.11 → datachain-0.8.1}/examples/multimodal/hf_pipeline.py +0 -0
  91. {datachain-0.7.11 → datachain-0.8.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  92. {datachain-0.7.11 → datachain-0.8.1}/examples/multimodal/wds.py +0 -0
  93. {datachain-0.7.11 → datachain-0.8.1}/examples/multimodal/wds_filtered.py +0 -0
  94. {datachain-0.7.11 → datachain-0.8.1}/mkdocs.yml +0 -0
  95. {datachain-0.7.11 → datachain-0.8.1}/noxfile.py +0 -0
  96. {datachain-0.7.11 → datachain-0.8.1}/setup.cfg +0 -0
  97. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/__init__.py +0 -0
  98. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/__main__.py +0 -0
  99. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/asyn.py +0 -0
  100. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/cache.py +0 -0
  101. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/catalog/__init__.py +0 -0
  102. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/catalog/datasource.py +0 -0
  103. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/catalog/loader.py +0 -0
  104. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/cli_utils.py +0 -0
  105. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/__init__.py +0 -0
  106. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/azure.py +0 -0
  107. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/fileslice.py +0 -0
  108. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/fsspec.py +0 -0
  109. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/hf.py +0 -0
  110. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/local.py +0 -0
  111. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/s3.py +0 -0
  112. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/config.py +0 -0
  113. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/__init__.py +0 -0
  114. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/db_engine.py +0 -0
  115. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/job.py +0 -0
  116. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/metastore.py +0 -0
  117. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/schema.py +0 -0
  118. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/serializer.py +0 -0
  119. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/dataset.py +0 -0
  120. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/error.py +0 -0
  121. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/__init__.py +0 -0
  122. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/aggregate.py +0 -0
  123. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/array.py +0 -0
  124. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/base.py +0 -0
  125. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/conditional.py +0 -0
  126. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/func.py +0 -0
  127. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/numeric.py +0 -0
  128. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/path.py +0 -0
  129. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/random.py +0 -0
  130. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/string.py +0 -0
  131. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/window.py +0 -0
  132. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/job.py +0 -0
  133. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/__init__.py +0 -0
  134. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/clip.py +0 -0
  135. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/__init__.py +0 -0
  136. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/flatten.py +0 -0
  137. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  138. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
  139. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/unflatten.py +0 -0
  140. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  141. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/data_model.py +0 -0
  142. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/dataset_info.py +0 -0
  143. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/hf.py +0 -0
  144. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/image.py +0 -0
  145. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/listing_info.py +0 -0
  146. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/model_store.py +0 -0
  147. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/pytorch.py +0 -0
  148. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/settings.py +0 -0
  149. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/signal_schema.py +0 -0
  150. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/tar.py +0 -0
  151. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/text.py +0 -0
  152. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/udf_signature.py +0 -0
  153. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/utils.py +0 -0
  154. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/vfile.py +0 -0
  155. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/webdataset.py +0 -0
  156. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/webdataset_laion.py +0 -0
  157. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/listing.py +0 -0
  158. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/__init__.py +0 -0
  159. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/bbox.py +0 -0
  160. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/pose.py +0 -0
  161. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/segment.py +0 -0
  162. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/ultralytics/__init__.py +0 -0
  163. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/ultralytics/bbox.py +0 -0
  164. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/ultralytics/pose.py +0 -0
  165. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/ultralytics/segment.py +0 -0
  166. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/node.py +0 -0
  167. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/nodes_fetcher.py +0 -0
  168. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/nodes_thread_pool.py +0 -0
  169. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/progress.py +0 -0
  170. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/py.typed +0 -0
  171. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/__init__.py +0 -0
  172. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/metrics.py +0 -0
  173. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/params.py +0 -0
  174. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/queue.py +0 -0
  175. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/schema.py +0 -0
  176. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/remote/__init__.py +0 -0
  177. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/__init__.py +0 -0
  178. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/default/__init__.py +0 -0
  179. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/default/base.py +0 -0
  180. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/__init__.py +0 -0
  181. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/aggregate.py +0 -0
  182. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/array.py +0 -0
  183. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/conditional.py +0 -0
  184. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/numeric.py +0 -0
  185. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/path.py +0 -0
  186. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/random.py +0 -0
  187. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/string.py +0 -0
  188. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/selectable.py +0 -0
  189. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  190. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/sqlite/base.py +0 -0
  191. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/sqlite/types.py +0 -0
  192. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/sqlite/vector.py +0 -0
  193. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/types.py +0 -0
  194. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/utils.py +0 -0
  195. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/telemetry.py +0 -0
  196. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/toolkit/__init__.py +0 -0
  197. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/toolkit/split.py +0 -0
  198. {datachain-0.7.11 → datachain-0.8.1}/src/datachain/torch/__init__.py +0 -0
  199. {datachain-0.7.11 → datachain-0.8.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  200. {datachain-0.7.11 → datachain-0.8.1}/src/datachain.egg-info/entry_points.txt +0 -0
  201. {datachain-0.7.11 → datachain-0.8.1}/src/datachain.egg-info/top_level.txt +0 -0
  202. {datachain-0.7.11 → datachain-0.8.1}/tests/__init__.py +0 -0
  203. {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/__init__.py +0 -0
  204. {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/conftest.py +0 -0
  205. {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  206. {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  207. {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/datasets/.gitignore +0 -0
  208. {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  209. {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/test_datachain.py +0 -0
  210. {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/test_ls.py +0 -0
  211. {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/test_version.py +0 -0
  212. {datachain-0.7.11 → datachain-0.8.1}/tests/conftest.py +0 -0
  213. {datachain-0.7.11 → datachain-0.8.1}/tests/data.py +0 -0
  214. {datachain-0.7.11 → datachain-0.8.1}/tests/examples/__init__.py +0 -0
  215. {datachain-0.7.11 → datachain-0.8.1}/tests/examples/test_examples.py +0 -0
  216. {datachain-0.7.11 → datachain-0.8.1}/tests/examples/test_wds_e2e.py +0 -0
  217. {datachain-0.7.11 → datachain-0.8.1}/tests/examples/wds_data.py +0 -0
  218. {datachain-0.7.11 → datachain-0.8.1}/tests/func/__init__.py +0 -0
  219. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_client.py +0 -0
  220. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_dataset_query.py +0 -0
  221. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_datasets.py +0 -0
  222. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_feature_pickling.py +0 -0
  223. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_listing.py +0 -0
  224. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_ls.py +0 -0
  225. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_metrics.py +0 -0
  226. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_pytorch.py +0 -0
  227. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_query.py +0 -0
  228. {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_toolkit.py +0 -0
  229. {datachain-0.7.11 → datachain-0.8.1}/tests/scripts/feature_class.py +0 -0
  230. {datachain-0.7.11 → datachain-0.8.1}/tests/scripts/feature_class_exception.py +0 -0
  231. {datachain-0.7.11 → datachain-0.8.1}/tests/scripts/feature_class_parallel.py +0 -0
  232. {datachain-0.7.11 → datachain-0.8.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  233. {datachain-0.7.11 → datachain-0.8.1}/tests/scripts/name_len_slow.py +0 -0
  234. {datachain-0.7.11 → datachain-0.8.1}/tests/test_atomicity.py +0 -0
  235. {datachain-0.7.11 → datachain-0.8.1}/tests/test_cli_e2e.py +0 -0
  236. {datachain-0.7.11 → datachain-0.8.1}/tests/test_query_e2e.py +0 -0
  237. {datachain-0.7.11 → datachain-0.8.1}/tests/test_telemetry.py +0 -0
  238. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/__init__.py +0 -0
  239. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/__init__.py +0 -0
  240. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/conftest.py +0 -0
  241. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_clip.py +0 -0
  242. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  243. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  244. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_feature.py +0 -0
  245. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_feature_utils.py +0 -0
  246. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_file.py +0 -0
  247. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_hf.py +0 -0
  248. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_image.py +0 -0
  249. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_listing_info.py +0 -0
  250. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_models.py +0 -0
  251. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_schema.py +0 -0
  252. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_signal_schema.py +0 -0
  253. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_sql_to_python.py +0 -0
  254. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_text.py +0 -0
  255. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_udf_signature.py +0 -0
  256. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_utils.py +0 -0
  257. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_webdataset.py +0 -0
  258. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/__init__.py +0 -0
  259. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  260. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/sqlite/test_types.py +0 -0
  261. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  262. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_array.py +0 -0
  263. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_conditional.py +0 -0
  264. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_path.py +0 -0
  265. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_random.py +0 -0
  266. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_selectable.py +0 -0
  267. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_string.py +0 -0
  268. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_asyn.py +0 -0
  269. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_cache.py +0 -0
  270. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_catalog.py +0 -0
  271. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_catalog_loader.py +0 -0
  272. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_cli_parsing.py +0 -0
  273. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_client.py +0 -0
  274. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_client_s3.py +0 -0
  275. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_config.py +0 -0
  276. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_data_storage.py +0 -0
  277. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_database_engine.py +0 -0
  278. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_dataset.py +0 -0
  279. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_dispatch.py +0 -0
  280. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_fileslice.py +0 -0
  281. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_func.py +0 -0
  282. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_metastore.py +0 -0
  283. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_module_exports.py +0 -0
  284. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_query.py +0 -0
  285. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_query_metrics.py +0 -0
  286. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_query_params.py +0 -0
  287. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_serializer.py +0 -0
  288. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_session.py +0 -0
  289. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_utils.py +0 -0
  290. {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_warehouse.py +0 -0
  291. {datachain-0.7.11 → datachain-0.8.1}/tests/utils.py +0 -0
@@ -25,7 +25,7 @@ jobs:
25
25
  python-version: '3.12'
26
26
 
27
27
  - name: Setup uv
28
- uses: astral-sh/setup-uv@v4
28
+ uses: astral-sh/setup-uv@v5
29
29
  with:
30
30
  enable-cache: true
31
31
  cache-suffix: benchmarks
@@ -27,7 +27,7 @@ jobs:
27
27
  python-version: '3.12'
28
28
 
29
29
  - name: Setup uv
30
- uses: astral-sh/setup-uv@v4
30
+ uses: astral-sh/setup-uv@v5
31
31
 
32
32
  - name: Install nox
33
33
  run: uv pip install nox --system
@@ -81,7 +81,7 @@ jobs:
81
81
  python-version: ${{ matrix.pyv }}
82
82
 
83
83
  - name: Setup uv
84
- uses: astral-sh/setup-uv@v4
84
+ uses: astral-sh/setup-uv@v5
85
85
  with:
86
86
  enable-cache: true
87
87
  cache-suffix: studio
@@ -37,7 +37,7 @@ jobs:
37
37
  python-version: '3.9'
38
38
 
39
39
  - name: Setup uv
40
- uses: astral-sh/setup-uv@v4
40
+ uses: astral-sh/setup-uv@v5
41
41
  with:
42
42
  enable-cache: true
43
43
  cache-suffix: lint
@@ -94,7 +94,7 @@ jobs:
94
94
  python-version: ${{ matrix.pyv }}
95
95
 
96
96
  - name: Setup uv
97
- uses: astral-sh/setup-uv@v4
97
+ uses: astral-sh/setup-uv@v5
98
98
  with:
99
99
  enable-cache: true
100
100
  cache-suffix: tests-${{ matrix.pyv }}
@@ -157,7 +157,7 @@ jobs:
157
157
  python-version: ${{ matrix.pyv }}
158
158
 
159
159
  - name: Setup uv
160
- uses: astral-sh/setup-uv@v4
160
+ uses: astral-sh/setup-uv@v5
161
161
  with:
162
162
  enable-cache: true
163
163
  cache-suffix: examples-${{ matrix.pyv }}
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.8.2'
27
+ rev: 'v0.8.4'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.7.11
3
+ Version: 0.8.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -46,6 +46,7 @@ Requires-Dist: iterative-telemetry>=0.0.9
46
46
  Requires-Dist: platformdirs
47
47
  Requires-Dist: dvc-studio-client<1,>=0.21
48
48
  Requires-Dist: tabulate
49
+ Requires-Dist: websockets
49
50
  Provides-Extra: docs
50
51
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
51
52
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -83,7 +84,7 @@ Requires-Dist: requests-mock; extra == "tests"
83
84
  Requires-Dist: scipy; extra == "tests"
84
85
  Provides-Extra: dev
85
86
  Requires-Dist: datachain[docs,tests]; extra == "dev"
86
- Requires-Dist: mypy==1.13.0; extra == "dev"
87
+ Requires-Dist: mypy==1.14.0; extra == "dev"
87
88
  Requires-Dist: types-python-dateutil; extra == "dev"
88
89
  Requires-Dist: types-pytz; extra == "dev"
89
90
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -98,7 +99,7 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
98
99
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
99
100
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
100
101
  Requires-Dist: onnx==1.16.1; extra == "examples"
101
- Requires-Dist: ultralytics==8.3.48; extra == "examples"
102
+ Requires-Dist: ultralytics==8.3.53; extra == "examples"
102
103
 
103
104
  ================
104
105
  |logo| DataChain
@@ -59,6 +59,8 @@ Batch inference with a simple sentiment model using the
59
59
  pip install transformers
60
60
  ```
61
61
 
62
+ Note, `transformers` works only if `torch`, `tensorflow` >= 2.0, or `flax` are installed.
63
+
62
64
  The code below downloads files from the cloud, and applies a
63
65
  user-defined function to each one of them. All files with a positive
64
66
  sentiment detected are then copied to the local directory.
@@ -114,13 +116,14 @@ DataChain can parallelize API calls; the free Mistral tier supports up
114
116
  to 4 requests at the same time.
115
117
 
116
118
  ``` py
119
+ import os
117
120
  from mistralai import Mistral
118
121
  from datachain import File, DataChain, Column
119
122
 
120
123
  PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
121
124
 
122
125
  def eval_dialogue(file: File) -> bool:
123
- client = Mistral()
126
+ client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
124
127
  response = client.chat.complete(
125
128
  model="open-mixtral-8x22b",
126
129
  messages=[{"role": "system", "content": PROMPT},
@@ -130,7 +133,6 @@ def eval_dialogue(file: File) -> bool:
130
133
 
131
134
  chain = (
132
135
  DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
133
- .settings(parallel=4, cache=True)
134
136
  .map(is_success=eval_dialogue)
135
137
  .save("mistral_files")
136
138
  )
@@ -0,0 +1,82 @@
1
+ from typing import Optional
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from datachain import C, DataChain
6
+ from datachain.lib.data_model import ModelStore
7
+ from datachain.lib.meta_formats import gen_datamodel_code
8
+
9
+
10
+ # Sample model for static JSON model
11
+ class LicenseModel(BaseModel):
12
+ url: str
13
+ id: int
14
+ name: str
15
+
16
+
17
+ LicenseFeature = ModelStore.register(LicenseModel)
18
+
19
+
20
+ # Sample model for static CSV model
21
+ class ChatDialog(BaseModel):
22
+ id: Optional[int] = None
23
+ count: Optional[int] = None
24
+ sender: Optional[str] = None
25
+ text: Optional[str] = None
26
+
27
+
28
+ ChatFeature = ModelStore.register(ChatDialog)
29
+
30
+
31
+ def main():
32
+ # Dynamic JSONl schema from 2 objects
33
+ uri = "gs://datachain-demo/jsonl/object.jsonl"
34
+ jsonl_ds = DataChain.from_json(uri, format="jsonl", anon="True")
35
+ jsonl_ds.show()
36
+
37
+ # Dynamic JSON schema from 200 OpenImage json-pairs with validation errors
38
+ uri = "gs://datachain-demo/openimages-v6-test-jsonpairs/*json"
39
+ schema_uri = (
40
+ "gs://datachain-demo/openimages-v6-test-jsonpairs/08392c290ecc9d2a.json"
41
+ )
42
+ json_pairs_ds = DataChain.from_json(
43
+ uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage", anon="True"
44
+ )
45
+ json_pairs_ds.show()
46
+
47
+ uri = "gs://datachain-demo/coco2017/annotations_captions/"
48
+
49
+ # Print JSON schema in Pydantic format from main COCO annotation
50
+ chain = DataChain.from_storage(uri, anon="True").filter(
51
+ C("file.path").glob("*.json")
52
+ )
53
+ file = next(chain.limit(1).collect("file"))
54
+ print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))
55
+
56
+ # Static JSON schema test parsing 3/7 objects
57
+ static_json_ds = DataChain.from_json(
58
+ uri, jmespath="licenses", spec=LicenseFeature, nrows=3, anon="True"
59
+ )
60
+ static_json_ds.show()
61
+
62
+ # Dynamic JSON schema test parsing 5K objects
63
+ dynamic_json_ds = DataChain.from_json(uri, jmespath="images", anon="True")
64
+ print(dynamic_json_ds.to_pandas())
65
+
66
+ # Static CSV with header schema test parsing 3.5K objects
67
+ uri = "gs://datachain-demo/chatbot-csv/"
68
+ static_csv_ds = DataChain.from_csv(
69
+ uri, output=ChatDialog, object_name="chat", anon="True"
70
+ )
71
+ static_csv_ds.print_schema()
72
+ static_csv_ds.show()
73
+
74
+ # Dynamic CSV with header schema test parsing 3/3M objects
75
+ uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
76
+ dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3, anon="True")
77
+ dynamic_csv_ds.print_schema()
78
+ dynamic_csv_ds.show()
79
+
80
+
81
+ if __name__ == "__main__":
82
+ main()
@@ -48,7 +48,8 @@ dependencies = [
48
48
  "iterative-telemetry>=0.0.9",
49
49
  "platformdirs",
50
50
  "dvc-studio-client>=0.21,<1",
51
- "tabulate"
51
+ "tabulate",
52
+ "websockets"
52
53
  ]
53
54
 
54
55
  [project.optional-dependencies]
@@ -95,7 +96,7 @@ tests = [
95
96
  ]
96
97
  dev = [
97
98
  "datachain[docs,tests]",
98
- "mypy==1.13.0",
99
+ "mypy==1.14.0",
99
100
  "types-python-dateutil",
100
101
  "types-pytz",
101
102
  "types-PyYAML",
@@ -111,7 +112,7 @@ examples = [
111
112
  "pdfplumber==0.11.4",
112
113
  "huggingface_hub[hf_transfer]",
113
114
  "onnx==1.16.1",
114
- "ultralytics==8.3.48"
115
+ "ultralytics==8.3.53"
115
116
  ]
116
117
 
117
118
  [project.urls]
@@ -1,7 +1,6 @@
1
1
  import io
2
2
  import json
3
3
  import logging
4
- import math
5
4
  import os
6
5
  import os.path
7
6
  import posixpath
@@ -13,7 +12,6 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence
13
12
  from copy import copy
14
13
  from dataclasses import dataclass
15
14
  from functools import cached_property, reduce
16
- from random import shuffle
17
15
  from threading import Thread
18
16
  from typing import (
19
17
  IO,
@@ -54,15 +52,12 @@ from datachain.error import (
54
52
  QueryScriptCancelError,
55
53
  QueryScriptRunError,
56
54
  )
55
+ from datachain.lib.listing import get_listing
57
56
  from datachain.node import DirType, Node, NodeWithPath
58
57
  from datachain.nodes_thread_pool import NodesThreadPool
59
58
  from datachain.remote.studio import StudioClient
60
59
  from datachain.sql.types import DateTime, SQLType
61
- from datachain.utils import (
62
- DataChainDir,
63
- batched,
64
- datachain_paths_join,
65
- )
60
+ from datachain.utils import DataChainDir, datachain_paths_join
66
61
 
67
62
  from .datasource import DataSource
68
63
 
@@ -90,7 +85,7 @@ QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
90
85
  QUERY_SCRIPT_CANCELED_EXIT_CODE = 11
91
86
 
92
87
  # dataset pull
93
- PULL_DATASET_MAX_THREADS = 10
88
+ PULL_DATASET_MAX_THREADS = 5
94
89
  PULL_DATASET_CHUNK_TIMEOUT = 3600
95
90
  PULL_DATASET_SLEEP_INTERVAL = 0.1 # sleep time while waiting for chunk to be available
96
91
  PULL_DATASET_CHECK_STATUS_INTERVAL = 20 # interval to check export status in Studio
@@ -130,6 +125,7 @@ class DatasetRowsFetcher(NodesThreadPool):
130
125
  local_ds_version: int,
131
126
  schema: dict[str, Union[SQLType, type[SQLType]]],
132
127
  max_threads: int = PULL_DATASET_MAX_THREADS,
128
+ progress_bar=None,
133
129
  ):
134
130
  super().__init__(max_threads)
135
131
  self._check_dependencies()
@@ -142,6 +138,7 @@ class DatasetRowsFetcher(NodesThreadPool):
142
138
  self.schema = schema
143
139
  self.last_status_check: Optional[float] = None
144
140
  self.studio_client = StudioClient()
141
+ self.progress_bar = progress_bar
145
142
 
146
143
  def done_task(self, done):
147
144
  for task in done:
@@ -198,6 +195,20 @@ class DatasetRowsFetcher(NodesThreadPool):
198
195
  for c in [c for c, t in self.schema.items() if t == DateTime]:
199
196
  df[c] = pd.to_datetime(df[c], unit="s")
200
197
 
198
+ # id will be autogenerated in DB
199
+ return df.drop("sys__id", axis=1)
200
+
201
+ def get_parquet_content(self, url: str):
202
+ while True:
203
+ if self.should_check_for_status():
204
+ self.check_for_status()
205
+ r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
206
+ if r.status_code == 404:
207
+ time.sleep(PULL_DATASET_SLEEP_INTERVAL)
208
+ continue
209
+ r.raise_for_status()
210
+ return r.content
211
+
201
212
  def do_task(self, urls):
202
213
  import lz4.frame
203
214
  import pandas as pd
@@ -207,31 +218,22 @@ class DatasetRowsFetcher(NodesThreadPool):
207
218
  local_ds = metastore.get_dataset(self.local_ds_name)
208
219
 
209
220
  urls = list(urls)
210
- while urls:
211
- for url in urls:
212
- if self.should_check_for_status():
213
- self.check_for_status()
214
221
 
215
- r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
216
- if r.status_code == 404:
217
- time.sleep(PULL_DATASET_SLEEP_INTERVAL)
218
- # moving to the next url
219
- continue
222
+ for url in urls:
223
+ if self.should_check_for_status():
224
+ self.check_for_status()
220
225
 
221
- r.raise_for_status()
222
-
223
- df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
224
-
225
- self.fix_columns(df)
226
-
227
- # id will be autogenerated in DB
228
- df = df.drop("sys__id", axis=1)
226
+ df = pd.read_parquet(
227
+ io.BytesIO(lz4.frame.decompress(self.get_parquet_content(url)))
228
+ )
229
+ df = self.fix_columns(df)
229
230
 
230
- inserted = warehouse.insert_dataset_rows(
231
- df, local_ds, self.local_ds_version
232
- )
233
- self.increase_counter(inserted) # type: ignore [arg-type]
234
- urls.remove(url)
231
+ inserted = warehouse.insert_dataset_rows(
232
+ df, local_ds, self.local_ds_version
233
+ )
234
+ self.increase_counter(inserted) # type: ignore [arg-type]
235
+ # sometimes the progress bar doesn't get updated, so update it manually
236
+ self.update_progress_bar(self.progress_bar)
235
237
 
236
238
 
237
239
  @dataclass
@@ -598,7 +600,7 @@ class Catalog:
598
600
  source, session=self.session, update=update, object_name=object_name
599
601
  )
600
602
 
601
- list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(
603
+ list_ds_name, list_uri, list_path, _ = get_listing(
602
604
  source, self.session, update=update
603
605
  )
604
606
 
@@ -696,11 +698,9 @@ class Catalog:
696
698
  )
697
699
  indexed_sources = []
698
700
  for source in dataset_sources:
699
- from datachain.lib.dc import DataChain
700
-
701
701
  client = self.get_client(source, **client_config)
702
702
  uri = client.uri
703
- dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
703
+ dataset_name, _, _, _ = get_listing(uri, self.session)
704
704
  listing = Listing(
705
705
  self.metastore.clone(),
706
706
  self.warehouse.clone(),
@@ -1291,13 +1291,13 @@ class Catalog:
1291
1291
  for source in data_sources: # type: ignore [union-attr]
1292
1292
  yield source, source.ls(fields)
1293
1293
 
1294
- def pull_dataset( # noqa: PLR0915
1294
+ def pull_dataset( # noqa: C901, PLR0915
1295
1295
  self,
1296
1296
  remote_ds_uri: str,
1297
1297
  output: Optional[str] = None,
1298
1298
  local_ds_name: Optional[str] = None,
1299
1299
  local_ds_version: Optional[int] = None,
1300
- no_cp: bool = False,
1300
+ cp: bool = False,
1301
1301
  force: bool = False,
1302
1302
  edatachain: bool = False,
1303
1303
  edatachain_file: Optional[str] = None,
@@ -1305,7 +1305,7 @@ class Catalog:
1305
1305
  client_config=None,
1306
1306
  ) -> None:
1307
1307
  def _instantiate(ds_uri: str) -> None:
1308
- if no_cp:
1308
+ if not cp:
1309
1309
  return
1310
1310
  assert output
1311
1311
  self.cp(
@@ -1318,7 +1318,7 @@ class Catalog:
1318
1318
  )
1319
1319
  print(f"Dataset {ds_uri} instantiated locally to {output}")
1320
1320
 
1321
- if not output and not no_cp:
1321
+ if cp and not output:
1322
1322
  raise ValueError("Please provide output directory for instantiation")
1323
1323
 
1324
1324
  studio_client = StudioClient()
@@ -1417,12 +1417,26 @@ class Catalog:
1417
1417
  signed_urls = export_response.data
1418
1418
 
1419
1419
  if signed_urls:
1420
- shuffle(signed_urls)
1421
-
1422
1420
  with (
1423
1421
  self.metastore.clone() as metastore,
1424
1422
  self.warehouse.clone() as warehouse,
1425
1423
  ):
1424
+
1425
+ def batch(urls):
1426
+ """
1427
+ Batching urls in a way that fetching is most efficient as
1428
+ urls with lower id will be created first. Because of that, we
1429
+ are making sure all threads are pulling most recent urls
1430
+ from the beginning
1431
+ """
1432
+ res = [[] for i in range(PULL_DATASET_MAX_THREADS)]
1433
+ current_worker = 0
1434
+ for url in signed_urls:
1435
+ res[current_worker].append(url)
1436
+ current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
1437
+
1438
+ return res
1439
+
1426
1440
  rows_fetcher = DatasetRowsFetcher(
1427
1441
  metastore,
1428
1442
  warehouse,
@@ -1431,14 +1445,11 @@ class Catalog:
1431
1445
  local_ds_name,
1432
1446
  local_ds_version,
1433
1447
  schema,
1448
+ progress_bar=dataset_save_progress_bar,
1434
1449
  )
1435
1450
  try:
1436
1451
  rows_fetcher.run(
1437
- batched(
1438
- signed_urls,
1439
- math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
1440
- ),
1441
- dataset_save_progress_bar,
1452
+ iter(batch(signed_urls)), dataset_save_progress_bar
1442
1453
  )
1443
1454
  except:
1444
1455
  self.remove_dataset(local_ds_name, local_ds_version)
@@ -294,6 +294,28 @@ def add_studio_parser(subparsers, parent_parser) -> None:
294
294
  help="Python package requirement. Can be specified multiple times.",
295
295
  )
296
296
 
297
+ studio_cancel_help = "Cancel a job in Studio"
298
+ studio_cancel_description = "This command cancels a job in Studio."
299
+
300
+ studio_cancel_parser = studio_subparser.add_parser(
301
+ "cancel",
302
+ parents=[parent_parser],
303
+ description=studio_cancel_description,
304
+ help=studio_cancel_help,
305
+ )
306
+
307
+ studio_cancel_parser.add_argument(
308
+ "job_id",
309
+ action="store",
310
+ help="The job ID to cancel.",
311
+ )
312
+ studio_cancel_parser.add_argument(
313
+ "--team",
314
+ action="store",
315
+ default=None,
316
+ help="The team to cancel a job for. By default, it will use team from config.",
317
+ )
318
+
297
319
 
298
320
  def get_parser() -> ArgumentParser: # noqa: PLR0915
299
321
  try:
@@ -457,10 +479,10 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
457
479
  help="Copy directories recursively",
458
480
  )
459
481
  parse_pull.add_argument(
460
- "--no-cp",
482
+ "--cp",
461
483
  default=False,
462
484
  action="store_true",
463
- help="Do not copy files, just pull a remote dataset into local DB",
485
+ help="Copy actual files after pulling remote dataset into local DB",
464
486
  )
465
487
  parse_pull.add_argument(
466
488
  "--edatachain",
@@ -1300,7 +1322,7 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
1300
1322
  args.output,
1301
1323
  local_ds_name=args.local_name,
1302
1324
  local_ds_version=args.local_version,
1303
- no_cp=args.no_cp,
1325
+ cp=args.cp,
1304
1326
  force=bool(args.force),
1305
1327
  edatachain=args.edatachain,
1306
1328
  edatachain_file=args.edatachain_file,
@@ -32,6 +32,15 @@ class GCSClient(Client):
32
32
 
33
33
  return cast(GCSFileSystem, super().create_fs(**kwargs))
34
34
 
35
+ def url(self, path: str, expires: int = 3600, **kwargs) -> str:
36
+ try:
37
+ return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
38
+ except AttributeError as exc:
39
+ is_anon = self.fs.storage_options.get("token") == "anon"
40
+ if is_anon and "you need a private key to sign credentials" in str(exc):
41
+ return f"https://storage.googleapis.com/{self.name}/{path}"
42
+ raise
43
+
35
44
  @staticmethod
36
45
  def parse_timestamp(timestamp: str) -> datetime:
37
46
  """
@@ -209,10 +209,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
209
209
 
210
210
  @retry_sqlite_locks
211
211
  def executemany(
212
- self, query, params, cursor: Optional[sqlite3.Cursor] = None
212
+ self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
213
213
  ) -> sqlite3.Cursor:
214
214
  if cursor:
215
215
  return cursor.executemany(self.compile(query).string, params)
216
+ if conn:
217
+ return conn.executemany(self.compile(query).string, params)
216
218
  return self.db.executemany(self.compile(query).string, params)
217
219
 
218
220
  @retry_sqlite_locks
@@ -222,7 +224,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
222
224
  return self.db.execute(sql, parameters)
223
225
 
224
226
  def insert_dataframe(self, table_name: str, df) -> int:
225
- return df.to_sql(table_name, self.db, if_exists="append", index=False)
227
+ return df.to_sql(
228
+ table_name,
229
+ self.db,
230
+ if_exists="append",
231
+ index=False,
232
+ method="multi",
233
+ chunksize=1000,
234
+ )
226
235
 
227
236
  def cursor(self, factory=None):
228
237
  if factory is None:
@@ -545,10 +554,15 @@ class SQLiteWarehouse(AbstractWarehouse):
545
554
  rows = list(rows)
546
555
  if not rows:
547
556
  return
548
- self.db.executemany(
549
- table.insert().values({f: bindparam(f) for f in rows[0]}),
550
- rows,
551
- )
557
+
558
+ with self.db.transaction() as conn:
559
+ # using a transaction speeds up inserts significantly as there is no separate
560
+ # transaction created for each insert row
561
+ self.db.executemany(
562
+ table.insert().values({f: bindparam(f) for f in rows[0]}),
563
+ rows,
564
+ conn=conn,
565
+ )
552
566
 
553
567
  def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
554
568
  dr = self.dataset_rows(dataset, version)
@@ -216,7 +216,6 @@ class AbstractWarehouse(ABC, Serializable):
216
216
  limit = query._limit
217
217
  paginated_query = query.limit(page_size)
218
218
 
219
- results = None
220
219
  offset = 0
221
220
  num_yielded = 0
222
221