datachain 0.7.11__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (287)
  1. {datachain-0.7.11 → datachain-0.8.0}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.7.11/src/datachain.egg-info → datachain-0.8.0}/PKG-INFO +3 -2
  3. datachain-0.8.0/examples/get_started/json-csv-reader.py +82 -0
  4. {datachain-0.7.11 → datachain-0.8.0}/pyproject.toml +3 -2
  5. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/catalog/catalog.py +53 -41
  6. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/cli.py +25 -3
  7. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/data_storage/sqlite.py +20 -6
  8. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/dc.py +155 -109
  9. datachain-0.8.0/src/datachain/lib/diff.py +197 -0
  10. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/meta_formats.py +38 -42
  11. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/query/dataset.py +1 -0
  12. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/remote/studio.py +53 -1
  13. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/studio.py +47 -2
  14. {datachain-0.7.11 → datachain-0.8.0/src/datachain.egg-info}/PKG-INFO +3 -2
  15. {datachain-0.7.11 → datachain-0.8.0}/src/datachain.egg-info/SOURCES.txt +2 -0
  16. {datachain-0.7.11 → datachain-0.8.0}/src/datachain.egg-info/requires.txt +2 -1
  17. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_datachain.py +1 -1
  18. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_meta_formats.py +4 -4
  19. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_pull.py +18 -12
  20. {datachain-0.7.11 → datachain-0.8.0}/tests/test_cli_studio.py +52 -1
  21. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_datachain.py +3 -3
  22. datachain-0.8.0/tests/unit/lib/test_diff.py +498 -0
  23. datachain-0.7.11/examples/get_started/json-csv-reader.py +0 -101
  24. {datachain-0.7.11 → datachain-0.8.0}/.cruft.json +0 -0
  25. {datachain-0.7.11 → datachain-0.8.0}/.gitattributes +0 -0
  26. {datachain-0.7.11 → datachain-0.8.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  27. {datachain-0.7.11 → datachain-0.8.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  28. {datachain-0.7.11 → datachain-0.8.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  29. {datachain-0.7.11 → datachain-0.8.0}/.github/codecov.yaml +0 -0
  30. {datachain-0.7.11 → datachain-0.8.0}/.github/dependabot.yml +0 -0
  31. {datachain-0.7.11 → datachain-0.8.0}/.github/workflows/benchmarks.yml +0 -0
  32. {datachain-0.7.11 → datachain-0.8.0}/.github/workflows/release.yml +0 -0
  33. {datachain-0.7.11 → datachain-0.8.0}/.github/workflows/tests-studio.yml +0 -0
  34. {datachain-0.7.11 → datachain-0.8.0}/.github/workflows/tests.yml +0 -0
  35. {datachain-0.7.11 → datachain-0.8.0}/.github/workflows/update-template.yaml +0 -0
  36. {datachain-0.7.11 → datachain-0.8.0}/.gitignore +0 -0
  37. {datachain-0.7.11 → datachain-0.8.0}/CODE_OF_CONDUCT.rst +0 -0
  38. {datachain-0.7.11 → datachain-0.8.0}/LICENSE +0 -0
  39. {datachain-0.7.11 → datachain-0.8.0}/README.rst +0 -0
  40. {datachain-0.7.11 → datachain-0.8.0}/docs/assets/captioned_cartoons.png +0 -0
  41. {datachain-0.7.11 → datachain-0.8.0}/docs/assets/datachain-white.svg +0 -0
  42. {datachain-0.7.11 → datachain-0.8.0}/docs/assets/datachain.svg +0 -0
  43. {datachain-0.7.11 → datachain-0.8.0}/docs/contributing.md +0 -0
  44. {datachain-0.7.11 → datachain-0.8.0}/docs/css/github-permalink-style.css +0 -0
  45. {datachain-0.7.11 → datachain-0.8.0}/docs/examples.md +0 -0
  46. {datachain-0.7.11 → datachain-0.8.0}/docs/index.md +0 -0
  47. {datachain-0.7.11 → datachain-0.8.0}/docs/overrides/main.html +0 -0
  48. {datachain-0.7.11 → datachain-0.8.0}/docs/quick-start.md +0 -0
  49. {datachain-0.7.11 → datachain-0.8.0}/docs/references/datachain.md +0 -0
  50. {datachain-0.7.11 → datachain-0.8.0}/docs/references/datatype.md +0 -0
  51. {datachain-0.7.11 → datachain-0.8.0}/docs/references/file.md +0 -0
  52. {datachain-0.7.11 → datachain-0.8.0}/docs/references/index.md +0 -0
  53. {datachain-0.7.11 → datachain-0.8.0}/docs/references/sql.md +0 -0
  54. {datachain-0.7.11 → datachain-0.8.0}/docs/references/torch.md +0 -0
  55. {datachain-0.7.11 → datachain-0.8.0}/docs/references/udf.md +0 -0
  56. {datachain-0.7.11 → datachain-0.8.0}/docs/tutorials.md +0 -0
  57. {datachain-0.7.11 → datachain-0.8.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  58. {datachain-0.7.11 → datachain-0.8.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  59. {datachain-0.7.11 → datachain-0.8.0}/examples/computer_vision/openimage-detect.py +0 -0
  60. {datachain-0.7.11 → datachain-0.8.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  61. {datachain-0.7.11 → datachain-0.8.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  62. {datachain-0.7.11 → datachain-0.8.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  63. {datachain-0.7.11 → datachain-0.8.0}/examples/get_started/common_sql_functions.py +0 -0
  64. {datachain-0.7.11 → datachain-0.8.0}/examples/get_started/torch-loader.py +0 -0
  65. {datachain-0.7.11 → datachain-0.8.0}/examples/get_started/udfs/parallel.py +0 -0
  66. {datachain-0.7.11 → datachain-0.8.0}/examples/get_started/udfs/simple.py +0 -0
  67. {datachain-0.7.11 → datachain-0.8.0}/examples/get_started/udfs/stateful.py +0 -0
  68. {datachain-0.7.11 → datachain-0.8.0}/examples/llm_and_nlp/claude-query.py +0 -0
  69. {datachain-0.7.11 → datachain-0.8.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  70. {datachain-0.7.11 → datachain-0.8.0}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  71. {datachain-0.7.11 → datachain-0.8.0}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  72. {datachain-0.7.11 → datachain-0.8.0}/examples/multimodal/clip_inference.py +0 -0
  73. {datachain-0.7.11 → datachain-0.8.0}/examples/multimodal/hf_pipeline.py +0 -0
  74. {datachain-0.7.11 → datachain-0.8.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  75. {datachain-0.7.11 → datachain-0.8.0}/examples/multimodal/wds.py +0 -0
  76. {datachain-0.7.11 → datachain-0.8.0}/examples/multimodal/wds_filtered.py +0 -0
  77. {datachain-0.7.11 → datachain-0.8.0}/mkdocs.yml +0 -0
  78. {datachain-0.7.11 → datachain-0.8.0}/noxfile.py +0 -0
  79. {datachain-0.7.11 → datachain-0.8.0}/setup.cfg +0 -0
  80. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/__init__.py +0 -0
  81. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/__main__.py +0 -0
  82. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/asyn.py +0 -0
  83. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/cache.py +0 -0
  84. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/catalog/__init__.py +0 -0
  85. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/catalog/datasource.py +0 -0
  86. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/catalog/loader.py +0 -0
  87. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/cli_utils.py +0 -0
  88. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/client/__init__.py +0 -0
  89. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/client/azure.py +0 -0
  90. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/client/fileslice.py +0 -0
  91. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/client/fsspec.py +0 -0
  92. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/client/gcs.py +0 -0
  93. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/client/hf.py +0 -0
  94. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/client/local.py +0 -0
  95. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/client/s3.py +0 -0
  96. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/config.py +0 -0
  97. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/data_storage/__init__.py +0 -0
  98. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/data_storage/db_engine.py +0 -0
  99. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/data_storage/job.py +0 -0
  100. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/data_storage/metastore.py +0 -0
  101. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/data_storage/schema.py +0 -0
  102. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/data_storage/serializer.py +0 -0
  103. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/data_storage/warehouse.py +0 -0
  104. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/dataset.py +0 -0
  105. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/error.py +0 -0
  106. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/func/__init__.py +0 -0
  107. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/func/aggregate.py +0 -0
  108. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/func/array.py +0 -0
  109. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/func/base.py +0 -0
  110. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/func/conditional.py +0 -0
  111. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/func/func.py +0 -0
  112. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/func/numeric.py +0 -0
  113. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/func/path.py +0 -0
  114. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/func/random.py +0 -0
  115. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/func/string.py +0 -0
  116. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/func/window.py +0 -0
  117. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/job.py +0 -0
  118. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/__init__.py +0 -0
  119. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/arrow.py +0 -0
  120. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/clip.py +0 -0
  121. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/convert/__init__.py +0 -0
  122. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/convert/flatten.py +0 -0
  123. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  124. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  125. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/convert/unflatten.py +0 -0
  126. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  127. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/data_model.py +0 -0
  128. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/dataset_info.py +0 -0
  129. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/file.py +0 -0
  130. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/hf.py +0 -0
  131. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/image.py +0 -0
  132. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/listing.py +0 -0
  133. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/listing_info.py +0 -0
  134. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/model_store.py +0 -0
  135. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/pytorch.py +0 -0
  136. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/settings.py +0 -0
  137. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/signal_schema.py +0 -0
  138. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/tar.py +0 -0
  139. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/text.py +0 -0
  140. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/udf.py +0 -0
  141. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/udf_signature.py +0 -0
  142. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/utils.py +0 -0
  143. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/vfile.py +0 -0
  144. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/webdataset.py +0 -0
  145. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/lib/webdataset_laion.py +0 -0
  146. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/listing.py +0 -0
  147. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/model/__init__.py +0 -0
  148. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/model/bbox.py +0 -0
  149. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/model/pose.py +0 -0
  150. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/model/segment.py +0 -0
  151. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  152. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  153. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/model/ultralytics/pose.py +0 -0
  154. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/model/ultralytics/segment.py +0 -0
  155. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/node.py +0 -0
  156. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/nodes_fetcher.py +0 -0
  157. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/nodes_thread_pool.py +0 -0
  158. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/progress.py +0 -0
  159. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/py.typed +0 -0
  160. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/query/__init__.py +0 -0
  161. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/query/batch.py +0 -0
  162. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/query/dispatch.py +0 -0
  163. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/query/metrics.py +0 -0
  164. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/query/params.py +0 -0
  165. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/query/queue.py +0 -0
  166. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/query/schema.py +0 -0
  167. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/query/session.py +0 -0
  168. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/remote/__init__.py +0 -0
  169. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/__init__.py +0 -0
  170. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/default/__init__.py +0 -0
  171. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/default/base.py +0 -0
  172. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/functions/__init__.py +0 -0
  173. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/functions/aggregate.py +0 -0
  174. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/functions/array.py +0 -0
  175. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/functions/conditional.py +0 -0
  176. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/functions/numeric.py +0 -0
  177. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/functions/path.py +0 -0
  178. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/functions/random.py +0 -0
  179. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/functions/string.py +0 -0
  180. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/selectable.py +0 -0
  181. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  182. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/sqlite/base.py +0 -0
  183. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/sqlite/types.py +0 -0
  184. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/sqlite/vector.py +0 -0
  185. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/types.py +0 -0
  186. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/sql/utils.py +0 -0
  187. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/telemetry.py +0 -0
  188. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/toolkit/__init__.py +0 -0
  189. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/toolkit/split.py +0 -0
  190. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/torch/__init__.py +0 -0
  191. {datachain-0.7.11 → datachain-0.8.0}/src/datachain/utils.py +0 -0
  192. {datachain-0.7.11 → datachain-0.8.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  193. {datachain-0.7.11 → datachain-0.8.0}/src/datachain.egg-info/entry_points.txt +0 -0
  194. {datachain-0.7.11 → datachain-0.8.0}/src/datachain.egg-info/top_level.txt +0 -0
  195. {datachain-0.7.11 → datachain-0.8.0}/tests/__init__.py +0 -0
  196. {datachain-0.7.11 → datachain-0.8.0}/tests/benchmarks/__init__.py +0 -0
  197. {datachain-0.7.11 → datachain-0.8.0}/tests/benchmarks/conftest.py +0 -0
  198. {datachain-0.7.11 → datachain-0.8.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  199. {datachain-0.7.11 → datachain-0.8.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  200. {datachain-0.7.11 → datachain-0.8.0}/tests/benchmarks/datasets/.gitignore +0 -0
  201. {datachain-0.7.11 → datachain-0.8.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  202. {datachain-0.7.11 → datachain-0.8.0}/tests/benchmarks/test_datachain.py +0 -0
  203. {datachain-0.7.11 → datachain-0.8.0}/tests/benchmarks/test_ls.py +0 -0
  204. {datachain-0.7.11 → datachain-0.8.0}/tests/benchmarks/test_version.py +0 -0
  205. {datachain-0.7.11 → datachain-0.8.0}/tests/conftest.py +0 -0
  206. {datachain-0.7.11 → datachain-0.8.0}/tests/data.py +0 -0
  207. {datachain-0.7.11 → datachain-0.8.0}/tests/examples/__init__.py +0 -0
  208. {datachain-0.7.11 → datachain-0.8.0}/tests/examples/test_examples.py +0 -0
  209. {datachain-0.7.11 → datachain-0.8.0}/tests/examples/test_wds_e2e.py +0 -0
  210. {datachain-0.7.11 → datachain-0.8.0}/tests/examples/wds_data.py +0 -0
  211. {datachain-0.7.11 → datachain-0.8.0}/tests/func/__init__.py +0 -0
  212. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_catalog.py +0 -0
  213. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_client.py +0 -0
  214. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_dataset_query.py +0 -0
  215. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_datasets.py +0 -0
  216. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_feature_pickling.py +0 -0
  217. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_listing.py +0 -0
  218. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_ls.py +0 -0
  219. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_metrics.py +0 -0
  220. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_pytorch.py +0 -0
  221. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_query.py +0 -0
  222. {datachain-0.7.11 → datachain-0.8.0}/tests/func/test_toolkit.py +0 -0
  223. {datachain-0.7.11 → datachain-0.8.0}/tests/scripts/feature_class.py +0 -0
  224. {datachain-0.7.11 → datachain-0.8.0}/tests/scripts/feature_class_exception.py +0 -0
  225. {datachain-0.7.11 → datachain-0.8.0}/tests/scripts/feature_class_parallel.py +0 -0
  226. {datachain-0.7.11 → datachain-0.8.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  227. {datachain-0.7.11 → datachain-0.8.0}/tests/scripts/name_len_slow.py +0 -0
  228. {datachain-0.7.11 → datachain-0.8.0}/tests/test_atomicity.py +0 -0
  229. {datachain-0.7.11 → datachain-0.8.0}/tests/test_cli_e2e.py +0 -0
  230. {datachain-0.7.11 → datachain-0.8.0}/tests/test_query_e2e.py +0 -0
  231. {datachain-0.7.11 → datachain-0.8.0}/tests/test_telemetry.py +0 -0
  232. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/__init__.py +0 -0
  233. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/__init__.py +0 -0
  234. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/conftest.py +0 -0
  235. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_arrow.py +0 -0
  236. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_clip.py +0 -0
  237. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  238. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_datachain_merge.py +0 -0
  239. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_feature.py +0 -0
  240. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_feature_utils.py +0 -0
  241. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_file.py +0 -0
  242. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_hf.py +0 -0
  243. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_image.py +0 -0
  244. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_listing_info.py +0 -0
  245. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_models.py +0 -0
  246. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_schema.py +0 -0
  247. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_signal_schema.py +0 -0
  248. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  249. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_text.py +0 -0
  250. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_udf_signature.py +0 -0
  251. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_utils.py +0 -0
  252. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/lib/test_webdataset.py +0 -0
  253. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/sql/__init__.py +0 -0
  254. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  255. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  256. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  257. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/sql/test_array.py +0 -0
  258. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/sql/test_conditional.py +0 -0
  259. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/sql/test_path.py +0 -0
  260. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/sql/test_random.py +0 -0
  261. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/sql/test_selectable.py +0 -0
  262. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/sql/test_string.py +0 -0
  263. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_asyn.py +0 -0
  264. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_cache.py +0 -0
  265. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_catalog.py +0 -0
  266. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_catalog_loader.py +0 -0
  267. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_cli_parsing.py +0 -0
  268. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_client.py +0 -0
  269. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_client_s3.py +0 -0
  270. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_config.py +0 -0
  271. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_data_storage.py +0 -0
  272. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_database_engine.py +0 -0
  273. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_dataset.py +0 -0
  274. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_dispatch.py +0 -0
  275. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_fileslice.py +0 -0
  276. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_func.py +0 -0
  277. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_listing.py +0 -0
  278. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_metastore.py +0 -0
  279. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_module_exports.py +0 -0
  280. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_query.py +0 -0
  281. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_query_metrics.py +0 -0
  282. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_query_params.py +0 -0
  283. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_serializer.py +0 -0
  284. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_session.py +0 -0
  285. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_utils.py +0 -0
  286. {datachain-0.7.11 → datachain-0.8.0}/tests/unit/test_warehouse.py +0 -0
  287. {datachain-0.7.11 → datachain-0.8.0}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.8.2'
27
+ rev: 'v0.8.3'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.7.11
3
+ Version: 0.8.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -46,6 +46,7 @@ Requires-Dist: iterative-telemetry>=0.0.9
46
46
  Requires-Dist: platformdirs
47
47
  Requires-Dist: dvc-studio-client<1,>=0.21
48
48
  Requires-Dist: tabulate
49
+ Requires-Dist: websockets
49
50
  Provides-Extra: docs
50
51
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
51
52
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -98,7 +99,7 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
98
99
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
99
100
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
100
101
  Requires-Dist: onnx==1.16.1; extra == "examples"
101
- Requires-Dist: ultralytics==8.3.48; extra == "examples"
102
+ Requires-Dist: ultralytics==8.3.50; extra == "examples"
102
103
 
103
104
  ================
104
105
  |logo| DataChain
@@ -0,0 +1,82 @@
1
+ from typing import Optional
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from datachain import C, DataChain
6
+ from datachain.lib.data_model import ModelStore
7
+ from datachain.lib.meta_formats import gen_datamodel_code
8
+
9
+
10
+ # Sample model for static JSON model
11
+ class LicenseModel(BaseModel):
12
+ url: str
13
+ id: int
14
+ name: str
15
+
16
+
17
+ LicenseFeature = ModelStore.register(LicenseModel)
18
+
19
+
20
+ # Sample model for static CSV model
21
+ class ChatDialog(BaseModel):
22
+ id: Optional[int] = None
23
+ count: Optional[int] = None
24
+ sender: Optional[str] = None
25
+ text: Optional[str] = None
26
+
27
+
28
+ ChatFeature = ModelStore.register(ChatDialog)
29
+
30
+
31
+ def main():
32
+ # Dynamic JSONl schema from 2 objects
33
+ uri = "gs://datachain-demo/jsonl/object.jsonl"
34
+ jsonl_ds = DataChain.from_json(uri, format="jsonl", anon="True")
35
+ jsonl_ds.show()
36
+
37
+ # Dynamic JSON schema from 200 OpenImage json-pairs with validation errors
38
+ uri = "gs://datachain-demo/openimages-v6-test-jsonpairs/*json"
39
+ schema_uri = (
40
+ "gs://datachain-demo/openimages-v6-test-jsonpairs/08392c290ecc9d2a.json"
41
+ )
42
+ json_pairs_ds = DataChain.from_json(
43
+ uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage", anon="True"
44
+ )
45
+ json_pairs_ds.show()
46
+
47
+ uri = "gs://datachain-demo/coco2017/annotations_captions/"
48
+
49
+ # Print JSON schema in Pydantic format from main COCO annotation
50
+ chain = DataChain.from_storage(uri, anon="True").filter(
51
+ C("file.path").glob("*.json")
52
+ )
53
+ file = next(chain.limit(1).collect("file"))
54
+ print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))
55
+
56
+ # Static JSON schema test parsing 3/7 objects
57
+ static_json_ds = DataChain.from_json(
58
+ uri, jmespath="licenses", spec=LicenseFeature, nrows=3, anon="True"
59
+ )
60
+ static_json_ds.show()
61
+
62
+ # Dynamic JSON schema test parsing 5K objects
63
+ dynamic_json_ds = DataChain.from_json(uri, jmespath="images", anon="True")
64
+ print(dynamic_json_ds.to_pandas())
65
+
66
+ # Static CSV with header schema test parsing 3.5K objects
67
+ uri = "gs://datachain-demo/chatbot-csv/"
68
+ static_csv_ds = DataChain.from_csv(
69
+ uri, output=ChatDialog, object_name="chat", anon="True"
70
+ )
71
+ static_csv_ds.print_schema()
72
+ static_csv_ds.show()
73
+
74
+ # Dynamic CSV with header schema test parsing 3/3M objects
75
+ uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
76
+ dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3, anon="True")
77
+ dynamic_csv_ds.print_schema()
78
+ dynamic_csv_ds.show()
79
+
80
+
81
+ if __name__ == "__main__":
82
+ main()
@@ -48,7 +48,8 @@ dependencies = [
48
48
  "iterative-telemetry>=0.0.9",
49
49
  "platformdirs",
50
50
  "dvc-studio-client>=0.21,<1",
51
- "tabulate"
51
+ "tabulate",
52
+ "websockets"
52
53
  ]
53
54
 
54
55
  [project.optional-dependencies]
@@ -111,7 +112,7 @@ examples = [
111
112
  "pdfplumber==0.11.4",
112
113
  "huggingface_hub[hf_transfer]",
113
114
  "onnx==1.16.1",
114
- "ultralytics==8.3.48"
115
+ "ultralytics==8.3.50"
115
116
  ]
116
117
 
117
118
  [project.urls]
@@ -1,7 +1,6 @@
1
1
  import io
2
2
  import json
3
3
  import logging
4
- import math
5
4
  import os
6
5
  import os.path
7
6
  import posixpath
@@ -13,7 +12,6 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence
13
12
  from copy import copy
14
13
  from dataclasses import dataclass
15
14
  from functools import cached_property, reduce
16
- from random import shuffle
17
15
  from threading import Thread
18
16
  from typing import (
19
17
  IO,
@@ -58,11 +56,7 @@ from datachain.node import DirType, Node, NodeWithPath
58
56
  from datachain.nodes_thread_pool import NodesThreadPool
59
57
  from datachain.remote.studio import StudioClient
60
58
  from datachain.sql.types import DateTime, SQLType
61
- from datachain.utils import (
62
- DataChainDir,
63
- batched,
64
- datachain_paths_join,
65
- )
59
+ from datachain.utils import DataChainDir, datachain_paths_join
66
60
 
67
61
  from .datasource import DataSource
68
62
 
@@ -90,7 +84,7 @@ QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
90
84
  QUERY_SCRIPT_CANCELED_EXIT_CODE = 11
91
85
 
92
86
  # dataset pull
93
- PULL_DATASET_MAX_THREADS = 10
87
+ PULL_DATASET_MAX_THREADS = 5
94
88
  PULL_DATASET_CHUNK_TIMEOUT = 3600
95
89
  PULL_DATASET_SLEEP_INTERVAL = 0.1 # sleep time while waiting for chunk to be available
96
90
  PULL_DATASET_CHECK_STATUS_INTERVAL = 20 # interval to check export status in Studio
@@ -130,6 +124,7 @@ class DatasetRowsFetcher(NodesThreadPool):
130
124
  local_ds_version: int,
131
125
  schema: dict[str, Union[SQLType, type[SQLType]]],
132
126
  max_threads: int = PULL_DATASET_MAX_THREADS,
127
+ progress_bar=None,
133
128
  ):
134
129
  super().__init__(max_threads)
135
130
  self._check_dependencies()
@@ -142,6 +137,7 @@ class DatasetRowsFetcher(NodesThreadPool):
142
137
  self.schema = schema
143
138
  self.last_status_check: Optional[float] = None
144
139
  self.studio_client = StudioClient()
140
+ self.progress_bar = progress_bar
145
141
 
146
142
  def done_task(self, done):
147
143
  for task in done:
@@ -198,6 +194,20 @@ class DatasetRowsFetcher(NodesThreadPool):
198
194
  for c in [c for c, t in self.schema.items() if t == DateTime]:
199
195
  df[c] = pd.to_datetime(df[c], unit="s")
200
196
 
197
+ # id will be autogenerated in DB
198
+ return df.drop("sys__id", axis=1)
199
+
200
+ def get_parquet_content(self, url: str):
201
+ while True:
202
+ if self.should_check_for_status():
203
+ self.check_for_status()
204
+ r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
205
+ if r.status_code == 404:
206
+ time.sleep(PULL_DATASET_SLEEP_INTERVAL)
207
+ continue
208
+ r.raise_for_status()
209
+ return r.content
210
+
201
211
  def do_task(self, urls):
202
212
  import lz4.frame
203
213
  import pandas as pd
@@ -207,31 +217,22 @@ class DatasetRowsFetcher(NodesThreadPool):
207
217
  local_ds = metastore.get_dataset(self.local_ds_name)
208
218
 
209
219
  urls = list(urls)
210
- while urls:
211
- for url in urls:
212
- if self.should_check_for_status():
213
- self.check_for_status()
214
-
215
- r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
216
- if r.status_code == 404:
217
- time.sleep(PULL_DATASET_SLEEP_INTERVAL)
218
- # moving to the next url
219
- continue
220
-
221
- r.raise_for_status()
222
-
223
- df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
224
220
 
225
- self.fix_columns(df)
221
+ for url in urls:
222
+ if self.should_check_for_status():
223
+ self.check_for_status()
226
224
 
227
- # id will be autogenerated in DB
228
- df = df.drop("sys__id", axis=1)
225
+ df = pd.read_parquet(
226
+ io.BytesIO(lz4.frame.decompress(self.get_parquet_content(url)))
227
+ )
228
+ df = self.fix_columns(df)
229
229
 
230
- inserted = warehouse.insert_dataset_rows(
231
- df, local_ds, self.local_ds_version
232
- )
233
- self.increase_counter(inserted) # type: ignore [arg-type]
234
- urls.remove(url)
230
+ inserted = warehouse.insert_dataset_rows(
231
+ df, local_ds, self.local_ds_version
232
+ )
233
+ self.increase_counter(inserted) # type: ignore [arg-type]
234
+ # sometimes the progress bar doesn't get updated, so update it manually
235
+ self.update_progress_bar(self.progress_bar)
235
236
 
236
237
 
237
238
  @dataclass
@@ -1291,13 +1292,13 @@ class Catalog:
1291
1292
  for source in data_sources: # type: ignore [union-attr]
1292
1293
  yield source, source.ls(fields)
1293
1294
 
1294
- def pull_dataset( # noqa: PLR0915
1295
+ def pull_dataset( # noqa: C901, PLR0915
1295
1296
  self,
1296
1297
  remote_ds_uri: str,
1297
1298
  output: Optional[str] = None,
1298
1299
  local_ds_name: Optional[str] = None,
1299
1300
  local_ds_version: Optional[int] = None,
1300
- no_cp: bool = False,
1301
+ cp: bool = False,
1301
1302
  force: bool = False,
1302
1303
  edatachain: bool = False,
1303
1304
  edatachain_file: Optional[str] = None,
@@ -1305,7 +1306,7 @@ class Catalog:
1305
1306
  client_config=None,
1306
1307
  ) -> None:
1307
1308
  def _instantiate(ds_uri: str) -> None:
1308
- if no_cp:
1309
+ if not cp:
1309
1310
  return
1310
1311
  assert output
1311
1312
  self.cp(
@@ -1318,7 +1319,7 @@ class Catalog:
1318
1319
  )
1319
1320
  print(f"Dataset {ds_uri} instantiated locally to {output}")
1320
1321
 
1321
- if not output and not no_cp:
1322
+ if cp and not output:
1322
1323
  raise ValueError("Please provide output directory for instantiation")
1323
1324
 
1324
1325
  studio_client = StudioClient()
@@ -1417,12 +1418,26 @@ class Catalog:
1417
1418
  signed_urls = export_response.data
1418
1419
 
1419
1420
  if signed_urls:
1420
- shuffle(signed_urls)
1421
-
1422
1421
  with (
1423
1422
  self.metastore.clone() as metastore,
1424
1423
  self.warehouse.clone() as warehouse,
1425
1424
  ):
1425
+
1426
+ def batch(urls):
1427
+ """
1428
+ Batching urls in a way that fetching is most efficient as
1429
+ urls with lower id will be created first. Because of that, we
1430
+ are making sure all threads are pulling most recent urls
1431
+ from the beginning
1432
+ """
1433
+ res = [[] for i in range(PULL_DATASET_MAX_THREADS)]
1434
+ current_worker = 0
1435
+ for url in signed_urls:
1436
+ res[current_worker].append(url)
1437
+ current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
1438
+
1439
+ return res
1440
+
1426
1441
  rows_fetcher = DatasetRowsFetcher(
1427
1442
  metastore,
1428
1443
  warehouse,
@@ -1431,14 +1446,11 @@ class Catalog:
1431
1446
  local_ds_name,
1432
1447
  local_ds_version,
1433
1448
  schema,
1449
+ progress_bar=dataset_save_progress_bar,
1434
1450
  )
1435
1451
  try:
1436
1452
  rows_fetcher.run(
1437
- batched(
1438
- signed_urls,
1439
- math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
1440
- ),
1441
- dataset_save_progress_bar,
1453
+ iter(batch(signed_urls)), dataset_save_progress_bar
1442
1454
  )
1443
1455
  except:
1444
1456
  self.remove_dataset(local_ds_name, local_ds_version)
@@ -294,6 +294,28 @@ def add_studio_parser(subparsers, parent_parser) -> None:
294
294
  help="Python package requirement. Can be specified multiple times.",
295
295
  )
296
296
 
297
+ studio_cancel_help = "Cancel a job in Studio"
298
+ studio_cancel_description = "This command cancels a job in Studio."
299
+
300
+ studio_cancel_parser = studio_subparser.add_parser(
301
+ "cancel",
302
+ parents=[parent_parser],
303
+ description=studio_cancel_description,
304
+ help=studio_cancel_help,
305
+ )
306
+
307
+ studio_cancel_parser.add_argument(
308
+ "job_id",
309
+ action="store",
310
+ help="The job ID to cancel.",
311
+ )
312
+ studio_cancel_parser.add_argument(
313
+ "--team",
314
+ action="store",
315
+ default=None,
316
+ help="The team to cancel a job for. By default, it will use team from config.",
317
+ )
318
+
297
319
 
298
320
  def get_parser() -> ArgumentParser: # noqa: PLR0915
299
321
  try:
@@ -457,10 +479,10 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
457
479
  help="Copy directories recursively",
458
480
  )
459
481
  parse_pull.add_argument(
460
- "--no-cp",
482
+ "--cp",
461
483
  default=False,
462
484
  action="store_true",
463
- help="Do not copy files, just pull a remote dataset into local DB",
485
+ help="Copy actual files after pulling remote dataset into local DB",
464
486
  )
465
487
  parse_pull.add_argument(
466
488
  "--edatachain",
@@ -1300,7 +1322,7 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
1300
1322
  args.output,
1301
1323
  local_ds_name=args.local_name,
1302
1324
  local_ds_version=args.local_version,
1303
- no_cp=args.no_cp,
1325
+ cp=args.cp,
1304
1326
  force=bool(args.force),
1305
1327
  edatachain=args.edatachain,
1306
1328
  edatachain_file=args.edatachain_file,
@@ -209,10 +209,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
209
209
 
210
210
  @retry_sqlite_locks
211
211
  def executemany(
212
- self, query, params, cursor: Optional[sqlite3.Cursor] = None
212
+ self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
213
213
  ) -> sqlite3.Cursor:
214
214
  if cursor:
215
215
  return cursor.executemany(self.compile(query).string, params)
216
+ if conn:
217
+ return conn.executemany(self.compile(query).string, params)
216
218
  return self.db.executemany(self.compile(query).string, params)
217
219
 
218
220
  @retry_sqlite_locks
@@ -222,7 +224,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
222
224
  return self.db.execute(sql, parameters)
223
225
 
224
226
  def insert_dataframe(self, table_name: str, df) -> int:
225
- return df.to_sql(table_name, self.db, if_exists="append", index=False)
227
+ return df.to_sql(
228
+ table_name,
229
+ self.db,
230
+ if_exists="append",
231
+ index=False,
232
+ method="multi",
233
+ chunksize=1000,
234
+ )
226
235
 
227
236
  def cursor(self, factory=None):
228
237
  if factory is None:
@@ -545,10 +554,15 @@ class SQLiteWarehouse(AbstractWarehouse):
545
554
  rows = list(rows)
546
555
  if not rows:
547
556
  return
548
- self.db.executemany(
549
- table.insert().values({f: bindparam(f) for f in rows[0]}),
550
- rows,
551
- )
557
+
558
+ with self.db.transaction() as conn:
559
+ # transactions speed up inserts significantly as there is no separate
560
+ # transaction created for each insert row
561
+ self.db.executemany(
562
+ table.insert().values({f: bindparam(f) for f in rows[0]}),
563
+ rows,
564
+ conn=conn,
565
+ )
552
566
 
553
567
  def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
554
568
  dr = self.dataset_rows(dataset, version)