datachain 0.8.9__tar.gz → 0.8.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (314) hide show
  1. {datachain-0.8.9 → datachain-0.8.11}/.github/workflows/tests.yml +6 -14
  2. {datachain-0.8.9 → datachain-0.8.11}/.pre-commit-config.yaml +2 -2
  3. {datachain-0.8.9 → datachain-0.8.11}/PKG-INFO +3 -7
  4. {datachain-0.8.9 → datachain-0.8.11}/docs/overrides/main.html +10 -0
  5. {datachain-0.8.9 → datachain-0.8.11}/examples/llm_and_nlp/hf-dataset-llm-eval.py +15 -9
  6. {datachain-0.8.9 → datachain-0.8.11}/pyproject.toml +2 -6
  7. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cache.py +4 -4
  8. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/catalog/__init__.py +0 -2
  9. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/catalog/catalog.py +102 -138
  10. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/__init__.py +9 -9
  11. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/parser/__init__.py +36 -20
  12. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/parser/job.py +1 -1
  13. datachain-0.8.11/src/datachain/cli/parser/studio.py +103 -0
  14. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/parser/utils.py +19 -1
  15. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/utils.py +1 -1
  16. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/client/fsspec.py +11 -8
  17. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/client/local.py +4 -4
  18. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/data_storage/schema.py +1 -1
  19. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/data_storage/sqlite.py +38 -7
  20. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/data_storage/warehouse.py +2 -2
  21. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/dataset.py +1 -1
  22. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/error.py +12 -0
  23. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/func/__init__.py +2 -1
  24. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/func/conditional.py +67 -23
  25. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/func/func.py +17 -5
  26. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/convert/python_to_sql.py +15 -3
  27. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/dc.py +27 -5
  28. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/file.py +16 -0
  29. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/listing.py +30 -12
  30. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/pytorch.py +1 -1
  31. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/udf.py +1 -1
  32. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/listing.py +1 -13
  33. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/node.py +0 -15
  34. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/nodes_fetcher.py +2 -2
  35. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/query/dataset.py +8 -4
  36. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/remote/studio.py +3 -3
  37. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/sqlite/base.py +35 -14
  38. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/studio.py +8 -8
  39. {datachain-0.8.9 → datachain-0.8.11}/src/datachain.egg-info/PKG-INFO +3 -7
  40. {datachain-0.8.9 → datachain-0.8.11}/src/datachain.egg-info/SOURCES.txt +6 -3
  41. {datachain-0.8.9 → datachain-0.8.11}/src/datachain.egg-info/requires.txt +2 -6
  42. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_catalog.py +36 -369
  43. {datachain-0.8.9/tests/unit → datachain-0.8.11/tests/func}/test_client.py +87 -24
  44. {datachain-0.8.9/tests/unit → datachain-0.8.11/tests/func}/test_data_storage.py +1 -52
  45. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_datachain.py +92 -3
  46. datachain-0.8.11/tests/func/test_datachain_merge.py +101 -0
  47. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_datasets.py +1 -1
  48. datachain-0.8.11/tests/func/test_file.py +65 -0
  49. datachain-0.8.11/tests/func/test_hf.py +50 -0
  50. datachain-0.8.11/tests/func/test_listing.py +64 -0
  51. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_pytorch.py +14 -11
  52. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_query.py +79 -0
  53. datachain-0.8.11/tests/func/test_warehouse.py +6 -0
  54. {datachain-0.8.9 → datachain-0.8.11}/tests/scripts/name_len_slow.py +1 -1
  55. {datachain-0.8.9 → datachain-0.8.11}/tests/test_cli_studio.py +9 -9
  56. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_datachain.py +28 -58
  57. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_datachain_bootstrap.py +0 -30
  58. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_datachain_merge.py +48 -98
  59. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_diff.py +76 -89
  60. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_file.py +0 -42
  61. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_hf.py +1 -44
  62. datachain-0.8.11/tests/unit/lib/test_python_to_sql.py +45 -0
  63. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_utils.py +3 -19
  64. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/sql/test_conditional.py +31 -1
  65. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_cache.py +4 -4
  66. datachain-0.8.11/tests/unit/test_client.py +33 -0
  67. datachain-0.8.11/tests/unit/test_data_storage.py +77 -0
  68. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_func.py +89 -0
  69. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_listing.py +0 -36
  70. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_pytorch.py +3 -3
  71. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_query.py +16 -1
  72. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_warehouse.py +0 -8
  73. datachain-0.8.9/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -78
  74. datachain-0.8.9/examples/llm_and_nlp/unstructured-summary-map.py +0 -67
  75. datachain-0.8.9/src/datachain/cli/parser/studio.py +0 -102
  76. datachain-0.8.9/tests/func/test_client.py +0 -93
  77. datachain-0.8.9/tests/func/test_listing.py +0 -27
  78. datachain-0.8.9/tests/unit/test_diff.py +0 -70
  79. {datachain-0.8.9 → datachain-0.8.11}/.cruft.json +0 -0
  80. {datachain-0.8.9 → datachain-0.8.11}/.gitattributes +0 -0
  81. {datachain-0.8.9 → datachain-0.8.11}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  82. {datachain-0.8.9 → datachain-0.8.11}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  83. {datachain-0.8.9 → datachain-0.8.11}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  84. {datachain-0.8.9 → datachain-0.8.11}/.github/codecov.yaml +0 -0
  85. {datachain-0.8.9 → datachain-0.8.11}/.github/dependabot.yml +0 -0
  86. {datachain-0.8.9 → datachain-0.8.11}/.github/workflows/benchmarks.yml +0 -0
  87. {datachain-0.8.9 → datachain-0.8.11}/.github/workflows/release.yml +0 -0
  88. {datachain-0.8.9 → datachain-0.8.11}/.github/workflows/tests-studio.yml +0 -0
  89. {datachain-0.8.9 → datachain-0.8.11}/.github/workflows/update-template.yaml +0 -0
  90. {datachain-0.8.9 → datachain-0.8.11}/.gitignore +0 -0
  91. {datachain-0.8.9 → datachain-0.8.11}/CODE_OF_CONDUCT.rst +0 -0
  92. {datachain-0.8.9 → datachain-0.8.11}/LICENSE +0 -0
  93. {datachain-0.8.9 → datachain-0.8.11}/README.rst +0 -0
  94. {datachain-0.8.9 → datachain-0.8.11}/docs/assets/captioned_cartoons.png +0 -0
  95. {datachain-0.8.9 → datachain-0.8.11}/docs/assets/datachain-white.svg +0 -0
  96. {datachain-0.8.9 → datachain-0.8.11}/docs/assets/datachain.svg +0 -0
  97. {datachain-0.8.9 → datachain-0.8.11}/docs/contributing.md +0 -0
  98. {datachain-0.8.9 → datachain-0.8.11}/docs/css/github-permalink-style.css +0 -0
  99. {datachain-0.8.9 → datachain-0.8.11}/docs/examples.md +0 -0
  100. {datachain-0.8.9 → datachain-0.8.11}/docs/index.md +0 -0
  101. {datachain-0.8.9 → datachain-0.8.11}/docs/quick-start.md +0 -0
  102. {datachain-0.8.9 → datachain-0.8.11}/docs/references/datachain.md +0 -0
  103. {datachain-0.8.9 → datachain-0.8.11}/docs/references/datatype.md +0 -0
  104. {datachain-0.8.9 → datachain-0.8.11}/docs/references/file.md +0 -0
  105. {datachain-0.8.9 → datachain-0.8.11}/docs/references/index.md +0 -0
  106. {datachain-0.8.9 → datachain-0.8.11}/docs/references/sql.md +0 -0
  107. {datachain-0.8.9 → datachain-0.8.11}/docs/references/torch.md +0 -0
  108. {datachain-0.8.9 → datachain-0.8.11}/docs/references/udf.md +0 -0
  109. {datachain-0.8.9 → datachain-0.8.11}/docs/tutorials.md +0 -0
  110. {datachain-0.8.9 → datachain-0.8.11}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  111. {datachain-0.8.9 → datachain-0.8.11}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  112. {datachain-0.8.9 → datachain-0.8.11}/examples/computer_vision/openimage-detect.py +0 -0
  113. {datachain-0.8.9 → datachain-0.8.11}/examples/computer_vision/ultralytics-bbox.py +0 -0
  114. {datachain-0.8.9 → datachain-0.8.11}/examples/computer_vision/ultralytics-pose.py +0 -0
  115. {datachain-0.8.9 → datachain-0.8.11}/examples/computer_vision/ultralytics-segment.py +0 -0
  116. {datachain-0.8.9 → datachain-0.8.11}/examples/get_started/common_sql_functions.py +0 -0
  117. {datachain-0.8.9 → datachain-0.8.11}/examples/get_started/json-csv-reader.py +0 -0
  118. {datachain-0.8.9 → datachain-0.8.11}/examples/get_started/torch-loader.py +0 -0
  119. {datachain-0.8.9 → datachain-0.8.11}/examples/get_started/udfs/parallel.py +0 -0
  120. {datachain-0.8.9 → datachain-0.8.11}/examples/get_started/udfs/simple.py +0 -0
  121. {datachain-0.8.9 → datachain-0.8.11}/examples/get_started/udfs/stateful.py +0 -0
  122. {datachain-0.8.9 → datachain-0.8.11}/examples/llm_and_nlp/claude-query.py +0 -0
  123. {datachain-0.8.9 → datachain-0.8.11}/examples/multimodal/clip_inference.py +0 -0
  124. {datachain-0.8.9 → datachain-0.8.11}/examples/multimodal/hf_pipeline.py +0 -0
  125. {datachain-0.8.9 → datachain-0.8.11}/examples/multimodal/openai_image_desc_lib.py +0 -0
  126. {datachain-0.8.9 → datachain-0.8.11}/examples/multimodal/wds.py +0 -0
  127. {datachain-0.8.9 → datachain-0.8.11}/examples/multimodal/wds_filtered.py +0 -0
  128. {datachain-0.8.9 → datachain-0.8.11}/mkdocs.yml +0 -0
  129. {datachain-0.8.9 → datachain-0.8.11}/noxfile.py +0 -0
  130. {datachain-0.8.9 → datachain-0.8.11}/setup.cfg +0 -0
  131. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/__init__.py +0 -0
  132. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/__main__.py +0 -0
  133. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/asyn.py +0 -0
  134. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/catalog/datasource.py +0 -0
  135. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/catalog/loader.py +0 -0
  136. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/commands/__init__.py +0 -0
  137. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/commands/datasets.py +0 -0
  138. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/commands/du.py +0 -0
  139. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/commands/index.py +0 -0
  140. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/commands/ls.py +0 -0
  141. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/commands/misc.py +0 -0
  142. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/commands/query.py +0 -0
  143. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/cli/commands/show.py +0 -0
  144. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/client/__init__.py +0 -0
  145. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/client/azure.py +0 -0
  146. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/client/fileslice.py +0 -0
  147. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/client/gcs.py +0 -0
  148. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/client/hf.py +0 -0
  149. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/client/s3.py +0 -0
  150. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/config.py +0 -0
  151. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/data_storage/__init__.py +0 -0
  152. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/data_storage/db_engine.py +0 -0
  153. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/data_storage/job.py +0 -0
  154. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/data_storage/metastore.py +0 -0
  155. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/data_storage/serializer.py +0 -0
  156. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/diff/__init__.py +0 -0
  157. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/func/aggregate.py +0 -0
  158. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/func/array.py +0 -0
  159. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/func/base.py +0 -0
  160. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/func/numeric.py +0 -0
  161. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/func/path.py +0 -0
  162. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/func/random.py +0 -0
  163. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/func/string.py +0 -0
  164. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/func/window.py +0 -0
  165. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/job.py +0 -0
  166. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/__init__.py +0 -0
  167. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/arrow.py +0 -0
  168. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/clip.py +0 -0
  169. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/convert/__init__.py +0 -0
  170. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/convert/flatten.py +0 -0
  171. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/convert/sql_to_python.py +0 -0
  172. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/convert/unflatten.py +0 -0
  173. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  174. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/data_model.py +0 -0
  175. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/dataset_info.py +0 -0
  176. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/hf.py +0 -0
  177. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/image.py +0 -0
  178. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/listing_info.py +0 -0
  179. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/meta_formats.py +0 -0
  180. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/model_store.py +0 -0
  181. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/settings.py +0 -0
  182. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/signal_schema.py +0 -0
  183. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/tar.py +0 -0
  184. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/text.py +0 -0
  185. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/udf_signature.py +0 -0
  186. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/utils.py +0 -0
  187. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/vfile.py +0 -0
  188. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/webdataset.py +0 -0
  189. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/lib/webdataset_laion.py +0 -0
  190. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/model/__init__.py +0 -0
  191. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/model/bbox.py +0 -0
  192. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/model/pose.py +0 -0
  193. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/model/segment.py +0 -0
  194. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/model/ultralytics/__init__.py +0 -0
  195. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/model/ultralytics/bbox.py +0 -0
  196. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/model/ultralytics/pose.py +0 -0
  197. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/model/ultralytics/segment.py +0 -0
  198. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/nodes_thread_pool.py +0 -0
  199. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/progress.py +0 -0
  200. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/py.typed +0 -0
  201. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/query/__init__.py +0 -0
  202. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/query/batch.py +0 -0
  203. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/query/dispatch.py +0 -0
  204. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/query/metrics.py +0 -0
  205. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/query/params.py +0 -0
  206. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/query/queue.py +0 -0
  207. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/query/schema.py +0 -0
  208. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/query/session.py +0 -0
  209. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/query/udf.py +0 -0
  210. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/query/utils.py +0 -0
  211. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/remote/__init__.py +0 -0
  212. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/__init__.py +0 -0
  213. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/default/__init__.py +0 -0
  214. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/default/base.py +0 -0
  215. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/functions/__init__.py +0 -0
  216. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/functions/aggregate.py +0 -0
  217. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/functions/array.py +0 -0
  218. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/functions/conditional.py +0 -0
  219. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/functions/numeric.py +0 -0
  220. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/functions/path.py +0 -0
  221. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/functions/random.py +0 -0
  222. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/functions/string.py +0 -0
  223. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/selectable.py +0 -0
  224. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/sqlite/__init__.py +0 -0
  225. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/sqlite/types.py +0 -0
  226. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/sqlite/vector.py +0 -0
  227. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/types.py +0 -0
  228. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/sql/utils.py +0 -0
  229. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/telemetry.py +0 -0
  230. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/toolkit/__init__.py +0 -0
  231. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/toolkit/split.py +0 -0
  232. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/torch/__init__.py +0 -0
  233. {datachain-0.8.9 → datachain-0.8.11}/src/datachain/utils.py +0 -0
  234. {datachain-0.8.9 → datachain-0.8.11}/src/datachain.egg-info/dependency_links.txt +0 -0
  235. {datachain-0.8.9 → datachain-0.8.11}/src/datachain.egg-info/entry_points.txt +0 -0
  236. {datachain-0.8.9 → datachain-0.8.11}/src/datachain.egg-info/top_level.txt +0 -0
  237. {datachain-0.8.9 → datachain-0.8.11}/tests/__init__.py +0 -0
  238. {datachain-0.8.9 → datachain-0.8.11}/tests/benchmarks/__init__.py +0 -0
  239. {datachain-0.8.9 → datachain-0.8.11}/tests/benchmarks/conftest.py +0 -0
  240. {datachain-0.8.9 → datachain-0.8.11}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  241. {datachain-0.8.9 → datachain-0.8.11}/tests/benchmarks/datasets/.dvc/config +0 -0
  242. {datachain-0.8.9 → datachain-0.8.11}/tests/benchmarks/datasets/.gitignore +0 -0
  243. {datachain-0.8.9 → datachain-0.8.11}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  244. {datachain-0.8.9 → datachain-0.8.11}/tests/benchmarks/test_datachain.py +0 -0
  245. {datachain-0.8.9 → datachain-0.8.11}/tests/benchmarks/test_ls.py +0 -0
  246. {datachain-0.8.9 → datachain-0.8.11}/tests/benchmarks/test_version.py +0 -0
  247. {datachain-0.8.9 → datachain-0.8.11}/tests/conftest.py +0 -0
  248. {datachain-0.8.9 → datachain-0.8.11}/tests/data.py +0 -0
  249. {datachain-0.8.9 → datachain-0.8.11}/tests/examples/__init__.py +0 -0
  250. {datachain-0.8.9 → datachain-0.8.11}/tests/examples/test_examples.py +0 -0
  251. {datachain-0.8.9 → datachain-0.8.11}/tests/examples/test_wds_e2e.py +0 -0
  252. {datachain-0.8.9 → datachain-0.8.11}/tests/examples/wds_data.py +0 -0
  253. {datachain-0.8.9 → datachain-0.8.11}/tests/func/__init__.py +0 -0
  254. {datachain-0.8.9 → datachain-0.8.11}/tests/func/fake-service-account-credentials.json +0 -0
  255. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_dataset_query.py +0 -0
  256. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_feature_pickling.py +0 -0
  257. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_ls.py +0 -0
  258. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_meta_formats.py +0 -0
  259. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_metrics.py +0 -0
  260. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_pull.py +0 -0
  261. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_session.py +0 -0
  262. {datachain-0.8.9 → datachain-0.8.11}/tests/func/test_toolkit.py +0 -0
  263. {datachain-0.8.9 → datachain-0.8.11}/tests/scripts/feature_class.py +0 -0
  264. {datachain-0.8.9 → datachain-0.8.11}/tests/scripts/feature_class_exception.py +0 -0
  265. {datachain-0.8.9 → datachain-0.8.11}/tests/scripts/feature_class_parallel.py +0 -0
  266. {datachain-0.8.9 → datachain-0.8.11}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  267. {datachain-0.8.9 → datachain-0.8.11}/tests/test_atomicity.py +0 -0
  268. {datachain-0.8.9 → datachain-0.8.11}/tests/test_cli_e2e.py +0 -0
  269. {datachain-0.8.9 → datachain-0.8.11}/tests/test_query_e2e.py +0 -0
  270. {datachain-0.8.9 → datachain-0.8.11}/tests/test_telemetry.py +0 -0
  271. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/__init__.py +0 -0
  272. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/__init__.py +0 -0
  273. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/conftest.py +0 -0
  274. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_arrow.py +0 -0
  275. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_clip.py +0 -0
  276. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_feature.py +0 -0
  277. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_feature_utils.py +0 -0
  278. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_image.py +0 -0
  279. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_listing_info.py +0 -0
  280. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_models.py +0 -0
  281. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_schema.py +0 -0
  282. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_signal_schema.py +0 -0
  283. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_sql_to_python.py +0 -0
  284. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_text.py +0 -0
  285. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_udf_signature.py +0 -0
  286. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/lib/test_webdataset.py +0 -0
  287. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/sql/__init__.py +0 -0
  288. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/sql/sqlite/__init__.py +0 -0
  289. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/sql/sqlite/test_types.py +0 -0
  290. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/sql/sqlite/test_utils.py +0 -0
  291. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/sql/test_array.py +0 -0
  292. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/sql/test_path.py +0 -0
  293. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/sql/test_random.py +0 -0
  294. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/sql/test_selectable.py +0 -0
  295. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/sql/test_string.py +0 -0
  296. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_asyn.py +0 -0
  297. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_catalog.py +0 -0
  298. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_catalog_loader.py +0 -0
  299. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_cli_parsing.py +0 -0
  300. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_client_gcs.py +0 -0
  301. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_client_s3.py +0 -0
  302. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_config.py +0 -0
  303. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_database_engine.py +0 -0
  304. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_dataset.py +0 -0
  305. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_dispatch.py +0 -0
  306. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_fileslice.py +0 -0
  307. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_metastore.py +0 -0
  308. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_module_exports.py +0 -0
  309. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_query_metrics.py +0 -0
  310. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_query_params.py +0 -0
  311. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_serializer.py +0 -0
  312. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_session.py +0 -0
  313. {datachain-0.8.9 → datachain-0.8.11}/tests/unit/test_utils.py +0 -0
  314. {datachain-0.8.9 → datachain-0.8.11}/tests/utils.py +0 -0
@@ -3,7 +3,7 @@ name: Tests
3
3
  on:
4
4
  push:
5
5
  branches: [main]
6
- pull_request_target:
6
+ pull_request:
7
7
  workflow_dispatch:
8
8
 
9
9
  env:
@@ -14,15 +14,7 @@ concurrency:
14
14
  cancel-in-progress: true
15
15
 
16
16
  jobs:
17
- authorize:
18
- environment: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository && 'external' || 'internal' }}
19
- runs-on: ubuntu-latest
20
- steps:
21
- - run: true
22
-
23
17
  lint:
24
- needs: authorize
25
-
26
18
  runs-on: ubuntu-latest
27
19
  steps:
28
20
  - name: Check out the repository
@@ -62,8 +54,6 @@ jobs:
62
54
  run: nox -s lint
63
55
 
64
56
  datachain:
65
- needs: authorize
66
-
67
57
  timeout-minutes: 40
68
58
  runs-on: ${{ matrix.os }}
69
59
  strategy:
@@ -112,7 +102,11 @@ jobs:
112
102
  run: echo 'DISABLE_REMOTES_ARG=--disable-remotes=azure,gs' >> $env:GITHUB_ENV
113
103
 
114
104
  - name: Run tests
115
- run: nox -s tests-${{ matrix.pyv }} -- $DISABLE_REMOTES_ARG
105
+ run: nox -s tests-${{ matrix.pyv }} -- -m "not e2e and not examples" $DISABLE_REMOTES_ARG
106
+ shell: bash
107
+
108
+ - name: Run E2E tests
109
+ run: nox -s tests-${{ matrix.pyv }} -- -m "e2e" --cov-append $DISABLE_REMOTES_ARG
116
110
  shell: bash
117
111
 
118
112
  - name: Upload coverage report
@@ -129,8 +123,6 @@ jobs:
129
123
  run: nox -s docs
130
124
 
131
125
  examples:
132
- needs: authorize
133
-
134
126
  runs-on: ${{ matrix.os }}
135
127
  timeout-minutes: 60
136
128
  strategy:
@@ -24,13 +24,13 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.9.1'
27
+ rev: 'v0.9.3'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
31
31
  - id: ruff-format
32
32
  - repo: https://github.com/codespell-project/codespell
33
- rev: v2.3.0
33
+ rev: v2.4.0
34
34
  hooks:
35
35
  - id: codespell
36
36
  additional_dependencies: ["tomli"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.8.9
3
+ Version: 0.8.11
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -78,7 +78,6 @@ Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
78
78
  Requires-Dist: virtualenv; extra == "tests"
79
79
  Requires-Dist: dulwich; extra == "tests"
80
80
  Requires-Dist: hypothesis; extra == "tests"
81
- Requires-Dist: open_clip_torch; extra == "tests"
82
81
  Requires-Dist: aiotools>=1.7.0; extra == "tests"
83
82
  Requires-Dist: requests-mock; extra == "tests"
84
83
  Requires-Dist: scipy; extra == "tests"
@@ -94,12 +93,9 @@ Provides-Extra: examples
94
93
  Requires-Dist: datachain[tests]; extra == "examples"
95
94
  Requires-Dist: defusedxml; extra == "examples"
96
95
  Requires-Dist: accelerate; extra == "examples"
97
- Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
98
- Requires-Dist: unstructured[pdf]<0.16.12; extra == "examples"
99
- Requires-Dist: pdfplumber==0.11.5; extra == "examples"
100
96
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
101
- Requires-Dist: onnx==1.16.1; extra == "examples"
102
- Requires-Dist: ultralytics==8.3.61; extra == "examples"
97
+ Requires-Dist: ultralytics==8.3.68; extra == "examples"
98
+ Requires-Dist: open_clip_torch; extra == "examples"
103
99
 
104
100
  ================
105
101
  |logo| DataChain
@@ -8,6 +8,16 @@
8
8
  <script type="text/javascript">
9
9
  !function () { var e, t, n; e = "14ffd92a6cbf5f2", t = function () { Reo.init({ clientID: "14ffd92a6cbf5f2" }) }, (n = document.createElement("script")).src = "https://static.reo.dev/" + e + "/reo.js", n.async = !0, n.onload = t, document.head.appendChild(n) }();
10
10
  </script>
11
+ <script>
12
+ function initApollo() {
13
+ var n = Math.random().toString(36).substring(7), o = document.createElement("script");
14
+ o.src = "https://assets.apollo.io/micro/website-tracker/tracker.iife.js?nocache=" + n, o.async = !0, o.defer = !0,
15
+ o.onload = function () { window.trackingFunctions.onLoad({ appId: "66315101e9aa7501c79140d9" }) },
16
+ document.head.appendChild(o)
17
+ };
18
+ initApollo();
19
+ </script>
20
+
11
21
 
12
22
 
13
23
  {% endblock %}
@@ -1,4 +1,5 @@
1
1
  from huggingface_hub import InferenceClient
2
+ from requests import HTTPError
2
3
 
3
4
  from datachain import C, DataChain, DataModel
4
5
 
@@ -20,15 +21,20 @@ def eval_dialog(
20
21
  user_input: str,
21
22
  bot_response: str,
22
23
  ) -> DialogEval:
23
- completion = client.chat_completion(
24
- messages=[
25
- {
26
- "role": "user",
27
- "content": f"{PROMPT}\n\nUser: {user_input}\nBot: {bot_response}",
28
- },
29
- ],
30
- response_format={"type": "json", "value": DialogEval.model_json_schema()},
31
- )
24
+ try:
25
+ completion = client.chat_completion(
26
+ messages=[
27
+ {
28
+ "role": "user",
29
+ "content": f"{PROMPT}\n\nUser: {user_input}\nBot: {bot_response}",
30
+ },
31
+ ],
32
+ response_format={"type": "json", "value": DialogEval.model_json_schema()},
33
+ )
34
+ except HTTPError:
35
+ return DialogEval(
36
+ result="Error", reason="Error while interacting with the Hugging Face API."
37
+ )
32
38
 
33
39
  message = completion.choices[0].message
34
40
  try:
@@ -89,7 +89,6 @@ tests = [
89
89
  "virtualenv",
90
90
  "dulwich",
91
91
  "hypothesis",
92
- "open_clip_torch",
93
92
  "aiotools>=1.7.0",
94
93
  "requests-mock",
95
94
  "scipy"
@@ -107,12 +106,9 @@ examples = [
107
106
  "datachain[tests]",
108
107
  "defusedxml",
109
108
  "accelerate",
110
- "unstructured_ingest[embed-huggingface]",
111
- "unstructured[pdf]<0.16.12",
112
- "pdfplumber==0.11.5",
113
109
  "huggingface_hub[hf_transfer]",
114
- "onnx==1.16.1",
115
- "ultralytics==8.3.61"
110
+ "ultralytics==8.3.68",
111
+ "open_clip_torch"
116
112
  ]
117
113
 
118
114
  [project.urls]
@@ -22,15 +22,15 @@ def try_scandir(path):
22
22
  pass
23
23
 
24
24
 
25
- def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "DataChainCache":
25
+ def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "Cache":
26
26
  cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
27
- return DataChainCache(cache_dir, tmp_dir=tmp_dir)
27
+ return Cache(cache_dir, tmp_dir=tmp_dir)
28
28
 
29
29
 
30
30
  @contextmanager
31
31
  def temporary_cache(
32
32
  tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
33
- ) -> Iterator["DataChainCache"]:
33
+ ) -> Iterator["Cache"]:
34
34
  cache = get_temp_cache(tmp_dir, prefix=prefix)
35
35
  try:
36
36
  yield cache
@@ -39,7 +39,7 @@ def temporary_cache(
39
39
  cache.destroy()
40
40
 
41
41
 
42
- class DataChainCache:
42
+ class Cache:
43
43
  def __init__(self, cache_dir: str, tmp_dir: str):
44
44
  self.odb = LocalHashFileDB(
45
45
  LocalFileSystem(),
@@ -3,7 +3,6 @@ from .catalog import (
3
3
  QUERY_SCRIPT_CANCELED_EXIT_CODE,
4
4
  QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
5
5
  Catalog,
6
- parse_edatachain_file,
7
6
  )
8
7
  from .loader import get_catalog
9
8
 
@@ -13,5 +12,4 @@ __all__ = [
13
12
  "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
14
13
  "Catalog",
15
14
  "get_catalog",
16
- "parse_edatachain_file",
17
15
  ]
@@ -4,6 +4,7 @@ import logging
4
4
  import os
5
5
  import os.path
6
6
  import posixpath
7
+ import signal
7
8
  import subprocess
8
9
  import sys
9
10
  import time
@@ -26,11 +27,10 @@ from uuid import uuid4
26
27
 
27
28
  import requests
28
29
  import sqlalchemy as sa
29
- import yaml
30
30
  from sqlalchemy import Column
31
31
  from tqdm.auto import tqdm
32
32
 
33
- from datachain.cache import DataChainCache
33
+ from datachain.cache import Cache
34
34
  from datachain.client import Client
35
35
  from datachain.dataset import (
36
36
  DATASET_PREFIX,
@@ -57,7 +57,7 @@ from datachain.node import DirType, Node, NodeWithPath
57
57
  from datachain.nodes_thread_pool import NodesThreadPool
58
58
  from datachain.remote.studio import StudioClient
59
59
  from datachain.sql.types import DateTime, SQLType
60
- from datachain.utils import DataChainDir, datachain_paths_join
60
+ from datachain.utils import DataChainDir
61
61
 
62
62
  from .datasource import DataSource
63
63
 
@@ -73,7 +73,6 @@ if TYPE_CHECKING:
73
73
  logger = logging.getLogger("datachain")
74
74
 
75
75
  DEFAULT_DATASET_DIR = "dataset"
76
- DATASET_FILE_SUFFIX = ".edatachain"
77
76
 
78
77
  TTL_INT = 4 * 60 * 60
79
78
 
@@ -99,6 +98,47 @@ def noop(_: str):
99
98
  pass
100
99
 
101
100
 
101
+ class TerminationSignal(RuntimeError): # noqa: N818
102
+ def __init__(self, signal):
103
+ self.signal = signal
104
+ super().__init__("Received termination signal", signal)
105
+
106
+ def __repr__(self):
107
+ return f"{self.__class__.__name__}({self.signal})"
108
+
109
+
110
+ if sys.platform == "win32":
111
+ SIGINT = signal.CTRL_C_EVENT
112
+ else:
113
+ SIGINT = signal.SIGINT
114
+
115
+
116
+ def shutdown_process(
117
+ proc: subprocess.Popen,
118
+ interrupt_timeout: Optional[int] = None,
119
+ terminate_timeout: Optional[int] = None,
120
+ ) -> int:
121
+ """Shut down the process gracefully with SIGINT -> SIGTERM -> SIGKILL."""
122
+
123
+ logger.info("sending interrupt signal to the process %s", proc.pid)
124
+ proc.send_signal(SIGINT)
125
+
126
+ logger.info("waiting for the process %s to finish", proc.pid)
127
+ try:
128
+ return proc.wait(interrupt_timeout)
129
+ except subprocess.TimeoutExpired:
130
+ logger.info(
131
+ "timed out waiting, sending terminate signal to the process %s", proc.pid
132
+ )
133
+ proc.terminate()
134
+ try:
135
+ return proc.wait(terminate_timeout)
136
+ except subprocess.TimeoutExpired:
137
+ logger.info("timed out waiting, killing the process %s", proc.pid)
138
+ proc.kill()
139
+ return proc.wait()
140
+
141
+
102
142
  def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
103
143
  buffer = b""
104
144
  while byt := stream.read(1): # Read one byte at a time
@@ -247,7 +287,6 @@ class NodeGroup:
247
287
  # The source path within the bucket
248
288
  # (not including the bucket name or s3:// prefix)
249
289
  source_path: str = ""
250
- is_edatachain: bool = False
251
290
  dataset_name: Optional[str] = None
252
291
  dataset_version: Optional[int] = None
253
292
  instantiated_nodes: Optional[list[NodeWithPath]] = None
@@ -272,55 +311,11 @@ class NodeGroup:
272
311
  self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)
273
312
 
274
313
 
275
- def check_output_dataset_file(
276
- output: str,
277
- force: bool = False,
278
- dataset_filename: Optional[str] = None,
279
- skip_check_edatachain: bool = False,
280
- ) -> str:
281
- """
282
- Checks the dataset filename for existence or if it should be force-overwritten.
283
- """
284
- dataset_file = (
285
- dataset_filename if dataset_filename else output + DATASET_FILE_SUFFIX
286
- )
287
- if not skip_check_edatachain and os.path.exists(dataset_file):
288
- if force:
289
- os.remove(dataset_file)
290
- else:
291
- raise RuntimeError(f"Output dataset file already exists: {dataset_file}")
292
- return dataset_file
293
-
294
-
295
- def parse_edatachain_file(filename: str) -> list[dict[str, Any]]:
296
- with open(filename, encoding="utf-8") as f:
297
- contents = yaml.safe_load(f)
298
-
299
- if not isinstance(contents, list):
300
- contents = [contents]
301
-
302
- for entry in contents:
303
- if not isinstance(entry, dict):
304
- raise TypeError(
305
- "Failed parsing EDataChain file, "
306
- "each data source entry must be a dictionary"
307
- )
308
- if "data-source" not in entry or "files" not in entry:
309
- raise ValueError(
310
- "Failed parsing EDataChain file, "
311
- "each data source entry must contain the "
312
- '"data-source" and "files" keys'
313
- )
314
-
315
- return contents
316
-
317
-
318
314
  def prepare_output_for_cp(
319
315
  node_groups: list[NodeGroup],
320
316
  output: str,
321
317
  force: bool = False,
322
- edatachain_only: bool = False,
323
- no_edatachain_file: bool = False,
318
+ no_cp: bool = False,
324
319
  ) -> tuple[bool, Optional[str]]:
325
320
  total_node_count = 0
326
321
  for node_group in node_groups:
@@ -333,7 +328,7 @@ def prepare_output_for_cp(
333
328
  always_copy_dir_contents = False
334
329
  copy_to_filename = None
335
330
 
336
- if edatachain_only:
331
+ if no_cp:
337
332
  return always_copy_dir_contents, copy_to_filename
338
333
 
339
334
  if not os.path.isdir(output):
@@ -358,10 +353,6 @@ def prepare_output_for_cp(
358
353
  copy_to_filename = output
359
354
  else:
360
355
  raise FileNotFoundError(f"Is not a directory: {output}")
361
-
362
- if copy_to_filename and not no_edatachain_file:
363
- raise RuntimeError("File to file cp not supported with .edatachain files!")
364
-
365
356
  return always_copy_dir_contents, copy_to_filename
366
357
 
367
358
 
@@ -465,8 +456,6 @@ def instantiate_node_groups(
465
456
  copy_to_filename,
466
457
  recursive,
467
458
  copy_dir_contents,
468
- source_path,
469
- node_group.is_edatachain,
470
459
  node_group.is_dataset,
471
460
  )
472
461
  if not virtual_only:
@@ -484,24 +473,6 @@ def instantiate_node_groups(
484
473
  instantiate_progress_bar.close()
485
474
 
486
475
 
487
- def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
488
- metafile_data = []
489
- for node_group in node_groups:
490
- if not node_group.sources:
491
- continue
492
- listing: Listing = node_group.listing
493
- metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
494
- for node in node_group.instantiated_nodes:
495
- if not node.n.is_dir:
496
- metafile_group["files"].append( # type: ignore [attr-defined]
497
- node.get_metafile_data()
498
- )
499
- if metafile_group["files"]:
500
- metafile_data.append(metafile_group)
501
-
502
- return metafile_data
503
-
504
-
505
476
  def find_column_to_str( # noqa: PLR0911
506
477
  row: tuple[Any, ...], field_lookup: dict[str, int], src: DataSource, column: str
507
478
  ) -> str:
@@ -536,7 +507,7 @@ def find_column_to_str( # noqa: PLR0911
536
507
  return ""
537
508
 
538
509
 
539
- def clone_catalog_with_cache(catalog: "Catalog", cache: "DataChainCache") -> "Catalog":
510
+ def clone_catalog_with_cache(catalog: "Catalog", cache: "Cache") -> "Catalog":
540
511
  clone = catalog.copy()
541
512
  clone.cache = cache
542
513
  return clone
@@ -559,7 +530,7 @@ class Catalog:
559
530
  datachain_dir.init()
560
531
  self.metastore = metastore
561
532
  self._warehouse = warehouse
562
- self.cache = DataChainCache(datachain_dir.cache, datachain_dir.tmp)
533
+ self.cache = Cache(datachain_dir.cache, datachain_dir.tmp)
563
534
  self.client_config = client_config if client_config is not None else {}
564
535
  self._init_params = {
565
536
  "cache_dir": cache_dir,
@@ -703,22 +674,8 @@ class Catalog:
703
674
  enlisted_sources: list[tuple[bool, bool, Any]] = []
704
675
  client_config = client_config or self.client_config
705
676
  for src in sources: # Opt: parallel
706
- if src.endswith(DATASET_FILE_SUFFIX) and os.path.isfile(src):
707
- # TODO: Also allow using EDataChain files from cloud locations?
708
- edatachain_data = parse_edatachain_file(src)
709
- indexed_sources = []
710
- for ds in edatachain_data:
711
- listing, _, source_path = self.enlist_source(
712
- ds["data-source"]["uri"],
713
- update,
714
- client_config=client_config,
715
- )
716
- paths = datachain_paths_join(
717
- source_path, (f["name"] for f in ds["files"])
718
- )
719
- indexed_sources.append((listing, source_path, paths))
720
- enlisted_sources.append((True, False, indexed_sources))
721
- elif src.startswith("ds://"):
677
+ listing: Optional[Listing]
678
+ if src.startswith("ds://"):
722
679
  ds_name, ds_version = parse_dataset_uri(src)
723
680
  dataset = self.get_dataset(ds_name)
724
681
  if not ds_version:
@@ -796,7 +753,6 @@ class Catalog:
796
753
  listing.client,
797
754
  dsrc,
798
755
  source_path,
799
- is_edatachain=True,
800
756
  )
801
757
  )
802
758
  else:
@@ -1360,8 +1316,6 @@ class Catalog:
1360
1316
  local_ds_version: Optional[int] = None,
1361
1317
  cp: bool = False,
1362
1318
  force: bool = False,
1363
- edatachain: bool = False,
1364
- edatachain_file: Optional[str] = None,
1365
1319
  *,
1366
1320
  client_config=None,
1367
1321
  ) -> None:
@@ -1373,8 +1327,6 @@ class Catalog:
1373
1327
  [ds_uri],
1374
1328
  output,
1375
1329
  force=force,
1376
- no_edatachain_file=not edatachain,
1377
- edatachain_file=edatachain_file,
1378
1330
  client_config=client_config,
1379
1331
  )
1380
1332
  print(f"Dataset {ds_uri} instantiated locally to {output}")
@@ -1541,8 +1493,6 @@ class Catalog:
1541
1493
  recursive: bool = False,
1542
1494
  no_glob: bool = False,
1543
1495
  no_cp: bool = False,
1544
- edatachain: bool = False,
1545
- edatachain_file: Optional[str] = None,
1546
1496
  *,
1547
1497
  client_config=None,
1548
1498
  ) -> None:
@@ -1551,9 +1501,8 @@ class Catalog:
1551
1501
  them into the dataset folder.
1552
1502
  It also adds those files to a dataset in database, which is
1553
1503
  created if doesn't exist yet
1554
- Optionally, it creates a .edatachain file
1555
1504
  """
1556
- if not no_cp or edatachain:
1505
+ if not no_cp:
1557
1506
  self.cp(
1558
1507
  sources,
1559
1508
  output,
@@ -1561,9 +1510,7 @@ class Catalog:
1561
1510
  update=update,
1562
1511
  recursive=recursive,
1563
1512
  no_glob=no_glob,
1564
- edatachain_only=no_cp,
1565
- no_edatachain_file=not edatachain,
1566
- edatachain_file=edatachain_file,
1513
+ no_cp=no_cp,
1567
1514
  client_config=client_config,
1568
1515
  )
1569
1516
  else:
@@ -1588,6 +1535,8 @@ class Catalog:
1588
1535
  output_hook: Callable[[str], None] = noop,
1589
1536
  params: Optional[dict[str, str]] = None,
1590
1537
  job_id: Optional[str] = None,
1538
+ interrupt_timeout: Optional[int] = None,
1539
+ terminate_timeout: Optional[int] = None,
1591
1540
  ) -> None:
1592
1541
  cmd = [python_executable, "-c", query_script]
1593
1542
  env = dict(env or os.environ)
@@ -1601,13 +1550,48 @@ class Catalog:
1601
1550
  if capture_output:
1602
1551
  popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
1603
1552
 
1553
+ def raise_termination_signal(sig: int, _: Any) -> NoReturn:
1554
+ raise TerminationSignal(sig)
1555
+
1556
+ thread: Optional[Thread] = None
1604
1557
  with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc: # noqa: S603
1605
- if capture_output:
1606
- args = (proc.stdout, output_hook)
1607
- thread = Thread(target=_process_stream, args=args, daemon=True)
1608
- thread.start()
1609
- thread.join() # wait for the reader thread
1558
+ logger.info("Starting process %s", proc.pid)
1559
+
1560
+ orig_sigint_handler = signal.getsignal(signal.SIGINT)
1561
+ # ignore SIGINT in the main process.
1562
+ # In the terminal, SIGINTs are received by all the processes in
1563
+ # the foreground process group, so the script will receive the signal too.
1564
+ # (If we forward the signal to the child, it will receive it twice.)
1565
+ signal.signal(signal.SIGINT, signal.SIG_IGN)
1610
1566
 
1567
+ orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
1568
+ signal.signal(signal.SIGTERM, raise_termination_signal)
1569
+ try:
1570
+ if capture_output:
1571
+ args = (proc.stdout, output_hook)
1572
+ thread = Thread(target=_process_stream, args=args, daemon=True)
1573
+ thread.start()
1574
+
1575
+ proc.wait()
1576
+ except TerminationSignal as exc:
1577
+ signal.signal(signal.SIGTERM, orig_sigterm_handler)
1578
+ signal.signal(signal.SIGINT, orig_sigint_handler)
1579
+ logging.info("Shutting down process %s, received %r", proc.pid, exc)
1580
+ # Rather than forwarding the signal to the child, we try to shut it down
1581
+ # gracefully. This is because we consider the script to be interactive
1582
+ # and special, so we give it time to cleanup before exiting.
1583
+ shutdown_process(proc, interrupt_timeout, terminate_timeout)
1584
+ if proc.returncode:
1585
+ raise QueryScriptCancelError(
1586
+ "Query script was canceled by user", return_code=proc.returncode
1587
+ ) from exc
1588
+ finally:
1589
+ signal.signal(signal.SIGTERM, orig_sigterm_handler)
1590
+ signal.signal(signal.SIGINT, orig_sigint_handler)
1591
+ if thread:
1592
+ thread.join() # wait for the reader thread
1593
+
1594
+ logging.info("Process %s exited with return code %s", proc.pid, proc.returncode)
1611
1595
  if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
1612
1596
  raise QueryScriptCancelError(
1613
1597
  "Query script was canceled by user",
@@ -1626,17 +1610,14 @@ class Catalog:
1626
1610
  force: bool = False,
1627
1611
  update: bool = False,
1628
1612
  recursive: bool = False,
1629
- edatachain_file: Optional[str] = None,
1630
- edatachain_only: bool = False,
1631
- no_edatachain_file: bool = False,
1613
+ no_cp: bool = False,
1632
1614
  no_glob: bool = False,
1633
1615
  *,
1634
- client_config=None,
1635
- ) -> list[dict[str, Any]]:
1616
+ client_config: Optional["dict"] = None,
1617
+ ) -> None:
1636
1618
  """
1637
1619
  This function copies files from cloud sources to local destination directory
1638
1620
  If cloud source is not indexed, or has expired index, it runs indexing
1639
- It also creates .edatachain file by default, if not specified differently
1640
1621
  """
1641
1622
  client_config = client_config or self.client_config
1642
1623
  node_groups = self.enlist_sources_grouped(
@@ -1647,17 +1628,11 @@ class Catalog:
1647
1628
  )
1648
1629
 
1649
1630
  always_copy_dir_contents, copy_to_filename = prepare_output_for_cp(
1650
- node_groups, output, force, edatachain_only, no_edatachain_file
1631
+ node_groups, output, force, no_cp
1651
1632
  )
1652
- dataset_file = check_output_dataset_file(
1653
- output, force, edatachain_file, no_edatachain_file
1654
- )
1655
-
1656
1633
  total_size, total_files = collect_nodes_for_cp(node_groups, recursive)
1657
-
1658
- if total_files == 0:
1659
- # Nothing selected to cp
1660
- return []
1634
+ if not total_files:
1635
+ return
1661
1636
 
1662
1637
  desc_max_len = max(len(output) + 16, 19)
1663
1638
  bar_format = (
@@ -1667,7 +1642,7 @@ class Catalog:
1667
1642
  "[{elapsed}<{remaining}, {rate_fmt:>8}]"
1668
1643
  )
1669
1644
 
1670
- if not edatachain_only:
1645
+ if not no_cp:
1671
1646
  with get_download_bar(bar_format, total_size) as pbar:
1672
1647
  for node_group in node_groups:
1673
1648
  node_group.download(recursive=recursive, pbar=pbar)
@@ -1679,21 +1654,10 @@ class Catalog:
1679
1654
  total_files,
1680
1655
  force,
1681
1656
  recursive,
1682
- edatachain_only,
1657
+ no_cp,
1683
1658
  always_copy_dir_contents,
1684
1659
  copy_to_filename,
1685
1660
  )
1686
- if no_edatachain_file:
1687
- return []
1688
-
1689
- metafile_data = compute_metafile_data(node_groups)
1690
- if metafile_data:
1691
- # Don't write the metafile if nothing was copied
1692
- print(f"Creating '{dataset_file}'")
1693
- with open(dataset_file, "w", encoding="utf-8") as fd:
1694
- yaml.dump(metafile_data, fd, sort_keys=False)
1695
-
1696
- return metafile_data
1697
1661
 
1698
1662
  def du(
1699
1663
  self,