datachain 0.21.1__tar.gz → 0.22.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (397)
  1. {datachain-0.21.1 → datachain-0.22.0}/.github/workflows/tests-studio.yml +1 -0
  2. {datachain-0.21.1 → datachain-0.22.0}/.pre-commit-config.yaml +1 -1
  3. {datachain-0.21.1 → datachain-0.22.0}/PKG-INFO +2 -2
  4. datachain-0.22.0/docs/guide/db_migrations.md +114 -0
  5. datachain-0.22.0/docs/guide/env.md +18 -0
  6. {datachain-0.21.1 → datachain-0.22.0}/docs/guide/index.md +3 -0
  7. datachain-0.22.0/docs/guide/namespaces.md +119 -0
  8. {datachain-0.21.1 → datachain-0.22.0}/examples/get_started/json-csv-reader.py +1 -1
  9. {datachain-0.21.1 → datachain-0.22.0}/examples/incremental_processing/delta.py +1 -1
  10. {datachain-0.21.1 → datachain-0.22.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +15 -5
  11. {datachain-0.21.1 → datachain-0.22.0}/mkdocs.yml +3 -0
  12. {datachain-0.21.1 → datachain-0.22.0}/pyproject.toml +3 -2
  13. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/__init__.py +2 -0
  14. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cache.py +2 -2
  15. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/catalog/catalog.py +180 -65
  16. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/__init__.py +0 -7
  17. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/commands/datasets.py +43 -28
  18. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/commands/ls.py +2 -2
  19. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/parser/__init__.py +1 -35
  20. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/client/fsspec.py +5 -3
  21. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/client/hf.py +10 -0
  22. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/client/local.py +4 -4
  23. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/data_storage/metastore.py +422 -37
  24. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/data_storage/sqlite.py +136 -7
  25. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/data_storage/warehouse.py +26 -7
  26. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/dataset.py +126 -12
  27. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/delta.py +11 -7
  28. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/error.py +36 -0
  29. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/func/func.py +1 -1
  30. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/arrow.py +3 -3
  31. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dataset_info.py +4 -0
  32. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/datachain.py +260 -92
  33. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/datasets.py +104 -50
  34. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/listings.py +3 -3
  35. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/records.py +1 -0
  36. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/storage.py +38 -40
  37. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/file.py +77 -23
  38. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/listing.py +3 -1
  39. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/meta_formats.py +1 -1
  40. datachain-0.22.0/src/datachain/lib/namespaces.py +71 -0
  41. datachain-0.22.0/src/datachain/lib/projects.py +86 -0
  42. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/pytorch.py +1 -1
  43. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/settings.py +10 -0
  44. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/tar.py +1 -2
  45. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/udf.py +1 -1
  46. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/udf_signature.py +1 -1
  47. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/webdataset.py +30 -20
  48. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/listing.py +3 -1
  49. datachain-0.22.0/src/datachain/namespace.py +65 -0
  50. datachain-0.22.0/src/datachain/project.py +78 -0
  51. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/query/dataset.py +71 -46
  52. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/query/session.py +1 -1
  53. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/remote/studio.py +61 -26
  54. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/studio.py +23 -6
  55. {datachain-0.21.1 → datachain-0.22.0}/src/datachain.egg-info/PKG-INFO +2 -2
  56. {datachain-0.21.1 → datachain-0.22.0}/src/datachain.egg-info/SOURCES.txt +9 -0
  57. {datachain-0.21.1 → datachain-0.22.0}/src/datachain.egg-info/requires.txt +1 -1
  58. {datachain-0.21.1 → datachain-0.22.0}/tests/conftest.py +86 -4
  59. {datachain-0.21.1 → datachain-0.22.0}/tests/examples/test_examples.py +2 -0
  60. {datachain-0.21.1 → datachain-0.22.0}/tests/examples/test_wds_e2e.py +5 -5
  61. {datachain-0.21.1 → datachain-0.22.0}/tests/func/functions/test_aggregate.py +7 -9
  62. {datachain-0.21.1 → datachain-0.22.0}/tests/func/functions/test_array.py +20 -21
  63. {datachain-0.21.1 → datachain-0.22.0}/tests/func/functions/test_conditional.py +6 -7
  64. {datachain-0.21.1 → datachain-0.22.0}/tests/func/functions/test_numeric.py +4 -5
  65. {datachain-0.21.1 → datachain-0.22.0}/tests/func/functions/test_path.py +6 -8
  66. {datachain-0.21.1 → datachain-0.22.0}/tests/func/functions/test_random.py +3 -6
  67. {datachain-0.21.1 → datachain-0.22.0}/tests/func/functions/test_string.py +6 -7
  68. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_batching.py +5 -5
  69. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_datachain.py +31 -36
  70. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_dataset_query.py +20 -2
  71. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_datasets.py +113 -81
  72. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_delta.py +15 -29
  73. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_file.py +33 -7
  74. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_listing.py +1 -1
  75. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_metastore.py +30 -10
  76. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_pull.py +68 -18
  77. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_retry.py +6 -8
  78. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_toolkit.py +2 -2
  79. {datachain-0.21.1 → datachain-0.22.0}/tests/test_atomicity.py +3 -0
  80. {datachain-0.21.1 → datachain-0.22.0}/tests/test_cli_e2e.py +43 -10
  81. {datachain-0.21.1 → datachain-0.22.0}/tests/test_cli_studio.py +40 -29
  82. {datachain-0.21.1 → datachain-0.22.0}/tests/test_import_time.py +2 -2
  83. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_datachain.py +231 -110
  84. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_datachain_bootstrap.py +3 -3
  85. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_datachain_merge.py +11 -11
  86. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_diff.py +43 -45
  87. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_feature_utils.py +2 -2
  88. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_file.py +50 -8
  89. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_listing_info.py +7 -2
  90. datachain-0.22.0/tests/unit/lib/test_namespace.py +79 -0
  91. datachain-0.22.0/tests/unit/lib/test_project.py +157 -0
  92. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_schema.py +1 -4
  93. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_dataset.py +43 -1
  94. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_func.py +149 -125
  95. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_listing.py +20 -4
  96. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_metastore.py +35 -3
  97. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_session.py +31 -9
  98. {datachain-0.21.1 → datachain-0.22.0}/tests/utils.py +2 -2
  99. {datachain-0.21.1 → datachain-0.22.0}/.cruft.json +0 -0
  100. {datachain-0.21.1 → datachain-0.22.0}/.gitattributes +0 -0
  101. {datachain-0.21.1 → datachain-0.22.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  102. {datachain-0.21.1 → datachain-0.22.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  103. {datachain-0.21.1 → datachain-0.22.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  104. {datachain-0.21.1 → datachain-0.22.0}/.github/codecov.yaml +0 -0
  105. {datachain-0.21.1 → datachain-0.22.0}/.github/dependabot.yml +0 -0
  106. {datachain-0.21.1 → datachain-0.22.0}/.github/workflows/benchmarks.yml +0 -0
  107. {datachain-0.21.1 → datachain-0.22.0}/.github/workflows/release.yml +0 -0
  108. {datachain-0.21.1 → datachain-0.22.0}/.github/workflows/tests.yml +0 -0
  109. {datachain-0.21.1 → datachain-0.22.0}/.github/workflows/update-template.yaml +0 -0
  110. {datachain-0.21.1 → datachain-0.22.0}/.gitignore +0 -0
  111. {datachain-0.21.1 → datachain-0.22.0}/CODE_OF_CONDUCT.rst +0 -0
  112. {datachain-0.21.1 → datachain-0.22.0}/LICENSE +0 -0
  113. {datachain-0.21.1 → datachain-0.22.0}/README.rst +0 -0
  114. {datachain-0.21.1 → datachain-0.22.0}/docs/assets/captioned_cartoons.png +0 -0
  115. {datachain-0.21.1 → datachain-0.22.0}/docs/assets/datachain-white.svg +0 -0
  116. {datachain-0.21.1 → datachain-0.22.0}/docs/assets/datachain.svg +0 -0
  117. {datachain-0.21.1 → datachain-0.22.0}/docs/commands/auth/login.md +0 -0
  118. {datachain-0.21.1 → datachain-0.22.0}/docs/commands/auth/logout.md +0 -0
  119. {datachain-0.21.1 → datachain-0.22.0}/docs/commands/auth/team.md +0 -0
  120. {datachain-0.21.1 → datachain-0.22.0}/docs/commands/auth/token.md +0 -0
  121. {datachain-0.21.1 → datachain-0.22.0}/docs/commands/index.md +0 -0
  122. {datachain-0.21.1 → datachain-0.22.0}/docs/commands/job/cancel.md +0 -0
  123. {datachain-0.21.1 → datachain-0.22.0}/docs/commands/job/clusters.md +0 -0
  124. {datachain-0.21.1 → datachain-0.22.0}/docs/commands/job/logs.md +0 -0
  125. {datachain-0.21.1 → datachain-0.22.0}/docs/commands/job/ls.md +0 -0
  126. {datachain-0.21.1 → datachain-0.22.0}/docs/commands/job/run.md +0 -0
  127. {datachain-0.21.1 → datachain-0.22.0}/docs/contributing.md +0 -0
  128. {datachain-0.21.1 → datachain-0.22.0}/docs/css/github-permalink-style.css +0 -0
  129. {datachain-0.21.1 → datachain-0.22.0}/docs/examples.md +0 -0
  130. {datachain-0.21.1 → datachain-0.22.0}/docs/guide/delta.md +0 -0
  131. {datachain-0.21.1 → datachain-0.22.0}/docs/guide/processing.md +0 -0
  132. {datachain-0.21.1 → datachain-0.22.0}/docs/guide/remotes.md +0 -0
  133. {datachain-0.21.1 → datachain-0.22.0}/docs/guide/retry.md +0 -0
  134. {datachain-0.21.1 → datachain-0.22.0}/docs/index.md +0 -0
  135. {datachain-0.21.1 → datachain-0.22.0}/docs/overrides/main.html +0 -0
  136. {datachain-0.21.1 → datachain-0.22.0}/docs/quick-start.md +0 -0
  137. {datachain-0.21.1 → datachain-0.22.0}/docs/references/data-types/arrowrow.md +0 -0
  138. {datachain-0.21.1 → datachain-0.22.0}/docs/references/data-types/bbox.md +0 -0
  139. {datachain-0.21.1 → datachain-0.22.0}/docs/references/data-types/file.md +0 -0
  140. {datachain-0.21.1 → datachain-0.22.0}/docs/references/data-types/imagefile.md +0 -0
  141. {datachain-0.21.1 → datachain-0.22.0}/docs/references/data-types/index.md +0 -0
  142. {datachain-0.21.1 → datachain-0.22.0}/docs/references/data-types/pose.md +0 -0
  143. {datachain-0.21.1 → datachain-0.22.0}/docs/references/data-types/segment.md +0 -0
  144. {datachain-0.21.1 → datachain-0.22.0}/docs/references/data-types/tarvfile.md +0 -0
  145. {datachain-0.21.1 → datachain-0.22.0}/docs/references/data-types/textfile.md +0 -0
  146. {datachain-0.21.1 → datachain-0.22.0}/docs/references/data-types/videofile.md +0 -0
  147. {datachain-0.21.1 → datachain-0.22.0}/docs/references/datachain.md +0 -0
  148. {datachain-0.21.1 → datachain-0.22.0}/docs/references/func.md +0 -0
  149. {datachain-0.21.1 → datachain-0.22.0}/docs/references/index.md +0 -0
  150. {datachain-0.21.1 → datachain-0.22.0}/docs/references/toolkit.md +0 -0
  151. {datachain-0.21.1 → datachain-0.22.0}/docs/references/torch.md +0 -0
  152. {datachain-0.21.1 → datachain-0.22.0}/docs/references/udf.md +0 -0
  153. {datachain-0.21.1 → datachain-0.22.0}/docs/tutorials.md +0 -0
  154. {datachain-0.21.1 → datachain-0.22.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  155. {datachain-0.21.1 → datachain-0.22.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  156. {datachain-0.21.1 → datachain-0.22.0}/examples/computer_vision/openimage-detect.py +0 -0
  157. {datachain-0.21.1 → datachain-0.22.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
  158. {datachain-0.21.1 → datachain-0.22.0}/examples/computer_vision/ultralytics-pose.py +0 -0
  159. {datachain-0.21.1 → datachain-0.22.0}/examples/computer_vision/ultralytics-segment.py +0 -0
  160. {datachain-0.21.1 → datachain-0.22.0}/examples/get_started/common_sql_functions.py +0 -0
  161. {datachain-0.21.1 → datachain-0.22.0}/examples/get_started/torch-loader.py +0 -0
  162. {datachain-0.21.1 → datachain-0.22.0}/examples/get_started/udfs/parallel.py +0 -0
  163. {datachain-0.21.1 → datachain-0.22.0}/examples/get_started/udfs/simple.py +0 -0
  164. {datachain-0.21.1 → datachain-0.22.0}/examples/get_started/udfs/stateful.py +0 -0
  165. {datachain-0.21.1 → datachain-0.22.0}/examples/incremental_processing/retry.py +0 -0
  166. {datachain-0.21.1 → datachain-0.22.0}/examples/incremental_processing/utils.py +0 -0
  167. {datachain-0.21.1 → datachain-0.22.0}/examples/llm_and_nlp/claude-query.py +0 -0
  168. {datachain-0.21.1 → datachain-0.22.0}/examples/multimodal/clip_inference.py +0 -0
  169. {datachain-0.21.1 → datachain-0.22.0}/examples/multimodal/hf_pipeline.py +0 -0
  170. {datachain-0.21.1 → datachain-0.22.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
  171. {datachain-0.21.1 → datachain-0.22.0}/examples/multimodal/wds.py +0 -0
  172. {datachain-0.21.1 → datachain-0.22.0}/examples/multimodal/wds_filtered.py +0 -0
  173. {datachain-0.21.1 → datachain-0.22.0}/noxfile.py +0 -0
  174. {datachain-0.21.1 → datachain-0.22.0}/setup.cfg +0 -0
  175. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/__main__.py +0 -0
  176. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/asyn.py +0 -0
  177. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/catalog/__init__.py +0 -0
  178. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/catalog/datasource.py +0 -0
  179. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/catalog/loader.py +0 -0
  180. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/commands/__init__.py +0 -0
  181. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/commands/du.py +0 -0
  182. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/commands/index.py +0 -0
  183. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/commands/misc.py +0 -0
  184. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/commands/query.py +0 -0
  185. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/commands/show.py +0 -0
  186. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/parser/job.py +0 -0
  187. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/parser/studio.py +0 -0
  188. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/parser/utils.py +0 -0
  189. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/cli/utils.py +0 -0
  190. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/client/__init__.py +0 -0
  191. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/client/azure.py +0 -0
  192. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/client/fileslice.py +0 -0
  193. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/client/gcs.py +0 -0
  194. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/client/s3.py +0 -0
  195. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/config.py +0 -0
  196. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/data_storage/__init__.py +0 -0
  197. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/data_storage/db_engine.py +0 -0
  198. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/data_storage/job.py +0 -0
  199. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/data_storage/schema.py +0 -0
  200. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/data_storage/serializer.py +0 -0
  201. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/diff/__init__.py +0 -0
  202. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/fs/__init__.py +0 -0
  203. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/fs/reference.py +0 -0
  204. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/fs/utils.py +0 -0
  205. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/func/__init__.py +0 -0
  206. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/func/aggregate.py +0 -0
  207. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/func/array.py +0 -0
  208. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/func/base.py +0 -0
  209. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/func/conditional.py +0 -0
  210. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/func/numeric.py +0 -0
  211. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/func/path.py +0 -0
  212. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/func/random.py +0 -0
  213. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/func/string.py +0 -0
  214. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/func/window.py +0 -0
  215. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/job.py +0 -0
  216. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/__init__.py +0 -0
  217. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/clip.py +0 -0
  218. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/convert/__init__.py +0 -0
  219. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/convert/flatten.py +0 -0
  220. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
  221. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
  222. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/convert/unflatten.py +0 -0
  223. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  224. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/data_model.py +0 -0
  225. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/__init__.py +0 -0
  226. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/csv.py +0 -0
  227. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/database.py +0 -0
  228. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/hf.py +0 -0
  229. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/json.py +0 -0
  230. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/pandas.py +0 -0
  231. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/parquet.py +0 -0
  232. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/utils.py +0 -0
  233. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/dc/values.py +0 -0
  234. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/hf.py +0 -0
  235. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/image.py +0 -0
  236. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/listing_info.py +0 -0
  237. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/model_store.py +0 -0
  238. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/signal_schema.py +0 -0
  239. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/text.py +0 -0
  240. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/utils.py +0 -0
  241. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/video.py +0 -0
  242. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/lib/webdataset_laion.py +0 -0
  243. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/model/__init__.py +0 -0
  244. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/model/bbox.py +0 -0
  245. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/model/pose.py +0 -0
  246. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/model/segment.py +0 -0
  247. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/model/ultralytics/__init__.py +0 -0
  248. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/model/ultralytics/bbox.py +0 -0
  249. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/model/ultralytics/pose.py +0 -0
  250. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/model/ultralytics/segment.py +0 -0
  251. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/model/utils.py +0 -0
  252. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/node.py +0 -0
  253. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/nodes_fetcher.py +0 -0
  254. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/nodes_thread_pool.py +0 -0
  255. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/progress.py +0 -0
  256. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/py.typed +0 -0
  257. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/query/__init__.py +0 -0
  258. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/query/batch.py +0 -0
  259. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/query/dispatch.py +0 -0
  260. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/query/metrics.py +0 -0
  261. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/query/params.py +0 -0
  262. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/query/queue.py +0 -0
  263. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/query/schema.py +0 -0
  264. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/query/udf.py +0 -0
  265. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/query/utils.py +0 -0
  266. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/remote/__init__.py +0 -0
  267. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/script_meta.py +0 -0
  268. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/semver.py +0 -0
  269. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/__init__.py +0 -0
  270. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/default/__init__.py +0 -0
  271. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/default/base.py +0 -0
  272. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/functions/__init__.py +0 -0
  273. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/functions/aggregate.py +0 -0
  274. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/functions/array.py +0 -0
  275. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/functions/conditional.py +0 -0
  276. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/functions/numeric.py +0 -0
  277. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/functions/path.py +0 -0
  278. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/functions/random.py +0 -0
  279. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/functions/string.py +0 -0
  280. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/selectable.py +0 -0
  281. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/sqlite/__init__.py +0 -0
  282. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/sqlite/base.py +0 -0
  283. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/sqlite/types.py +0 -0
  284. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/sqlite/vector.py +0 -0
  285. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/types.py +0 -0
  286. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/sql/utils.py +0 -0
  287. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/telemetry.py +0 -0
  288. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/toolkit/__init__.py +0 -0
  289. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/toolkit/split.py +0 -0
  290. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/torch/__init__.py +0 -0
  291. {datachain-0.21.1 → datachain-0.22.0}/src/datachain/utils.py +0 -0
  292. {datachain-0.21.1 → datachain-0.22.0}/src/datachain.egg-info/dependency_links.txt +0 -0
  293. {datachain-0.21.1 → datachain-0.22.0}/src/datachain.egg-info/entry_points.txt +0 -0
  294. {datachain-0.21.1 → datachain-0.22.0}/src/datachain.egg-info/top_level.txt +0 -0
  295. {datachain-0.21.1 → datachain-0.22.0}/tests/__init__.py +0 -0
  296. {datachain-0.21.1 → datachain-0.22.0}/tests/benchmarks/__init__.py +0 -0
  297. {datachain-0.21.1 → datachain-0.22.0}/tests/benchmarks/conftest.py +0 -0
  298. {datachain-0.21.1 → datachain-0.22.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  299. {datachain-0.21.1 → datachain-0.22.0}/tests/benchmarks/datasets/.dvc/config +0 -0
  300. {datachain-0.21.1 → datachain-0.22.0}/tests/benchmarks/datasets/.gitignore +0 -0
  301. {datachain-0.21.1 → datachain-0.22.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  302. {datachain-0.21.1 → datachain-0.22.0}/tests/benchmarks/test_datachain.py +0 -0
  303. {datachain-0.21.1 → datachain-0.22.0}/tests/benchmarks/test_ls.py +0 -0
  304. {datachain-0.21.1 → datachain-0.22.0}/tests/benchmarks/test_version.py +0 -0
  305. {datachain-0.21.1 → datachain-0.22.0}/tests/data.py +0 -0
  306. {datachain-0.21.1 → datachain-0.22.0}/tests/examples/__init__.py +0 -0
  307. {datachain-0.21.1 → datachain-0.22.0}/tests/examples/wds_data.py +0 -0
  308. {datachain-0.21.1 → datachain-0.22.0}/tests/func/__init__.py +0 -0
  309. {datachain-0.21.1 → datachain-0.22.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  310. {datachain-0.21.1 → datachain-0.22.0}/tests/func/data/lena.jpg +0 -0
  311. {datachain-0.21.1 → datachain-0.22.0}/tests/func/fake-service-account-credentials.json +0 -0
  312. {datachain-0.21.1 → datachain-0.22.0}/tests/func/functions/__init__.py +0 -0
  313. {datachain-0.21.1 → datachain-0.22.0}/tests/func/model/__init__.py +0 -0
  314. {datachain-0.21.1 → datachain-0.22.0}/tests/func/model/data/running-mask0.png +0 -0
  315. {datachain-0.21.1 → datachain-0.22.0}/tests/func/model/data/running-mask1.png +0 -0
  316. {datachain-0.21.1 → datachain-0.22.0}/tests/func/model/data/running.jpg +0 -0
  317. {datachain-0.21.1 → datachain-0.22.0}/tests/func/model/data/ships.jpg +0 -0
  318. {datachain-0.21.1 → datachain-0.22.0}/tests/func/model/test_yolo.py +0 -0
  319. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_catalog.py +0 -0
  320. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_client.py +0 -0
  321. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_cloud_transfer.py +0 -0
  322. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_data_storage.py +0 -0
  323. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_datachain_merge.py +0 -0
  324. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_feature_pickling.py +0 -0
  325. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_hf.py +0 -0
  326. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_hidden_field.py +0 -0
  327. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_image.py +0 -0
  328. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_ls.py +0 -0
  329. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_meta_formats.py +0 -0
  330. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_metrics.py +0 -0
  331. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_pytorch.py +0 -0
  332. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_query.py +0 -0
  333. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_read_database.py +0 -0
  334. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_session.py +0 -0
  335. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_video.py +0 -0
  336. {datachain-0.21.1 → datachain-0.22.0}/tests/func/test_warehouse.py +0 -0
  337. {datachain-0.21.1 → datachain-0.22.0}/tests/scripts/feature_class.py +0 -0
  338. {datachain-0.21.1 → datachain-0.22.0}/tests/scripts/feature_class_exception.py +0 -0
  339. {datachain-0.21.1 → datachain-0.22.0}/tests/scripts/feature_class_parallel.py +0 -0
  340. {datachain-0.21.1 → datachain-0.22.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  341. {datachain-0.21.1 → datachain-0.22.0}/tests/scripts/name_len_slow.py +0 -0
  342. {datachain-0.21.1 → datachain-0.22.0}/tests/test_query_e2e.py +0 -0
  343. {datachain-0.21.1 → datachain-0.22.0}/tests/test_telemetry.py +0 -0
  344. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/__init__.py +0 -0
  345. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/__init__.py +0 -0
  346. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/conftest.py +0 -0
  347. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_arrow.py +0 -0
  348. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_clip.py +0 -0
  349. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_feature.py +0 -0
  350. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_hf.py +0 -0
  351. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_image.py +0 -0
  352. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_python_to_sql.py +0 -0
  353. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_signal_schema.py +0 -0
  354. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_sql_to_python.py +0 -0
  355. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_text.py +0 -0
  356. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_udf.py +0 -0
  357. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_udf_signature.py +0 -0
  358. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_utils.py +0 -0
  359. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/lib/test_webdataset.py +0 -0
  360. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/model/__init__.py +0 -0
  361. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/model/test_bbox.py +0 -0
  362. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/model/test_pose.py +0 -0
  363. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/model/test_segment.py +0 -0
  364. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/model/test_utils.py +0 -0
  365. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/sql/__init__.py +0 -0
  366. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/sql/sqlite/__init__.py +0 -0
  367. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/sql/sqlite/test_types.py +0 -0
  368. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
  369. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/sql/test_array.py +0 -0
  370. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/sql/test_conditional.py +0 -0
  371. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/sql/test_path.py +0 -0
  372. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/sql/test_random.py +0 -0
  373. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/sql/test_selectable.py +0 -0
  374. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/sql/test_string.py +0 -0
  375. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_asyn.py +0 -0
  376. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_cache.py +0 -0
  377. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_catalog.py +0 -0
  378. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_catalog_loader.py +0 -0
  379. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_cli_parsing.py +0 -0
  380. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_client.py +0 -0
  381. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_client_gcs.py +0 -0
  382. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_client_s3.py +0 -0
  383. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_config.py +0 -0
  384. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_data_storage.py +0 -0
  385. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_database_engine.py +0 -0
  386. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_dispatch.py +0 -0
  387. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_fileslice.py +0 -0
  388. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_module_exports.py +0 -0
  389. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_pytorch.py +0 -0
  390. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_query.py +0 -0
  391. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_query_metrics.py +0 -0
  392. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_query_params.py +0 -0
  393. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_script_meta.py +0 -0
  394. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_semver.py +0 -0
  395. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_serializer.py +0 -0
  396. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_utils.py +0 -0
  397. {datachain-0.21.1 → datachain-0.22.0}/tests/unit/test_warehouse.py +0 -0
@@ -98,6 +98,7 @@ jobs:
98
98
  - name: Run tests
99
99
  # Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
100
100
  run: >
101
+ DATACHAIN_METASTORE_ARG_PROJECT=john
101
102
  PYTHONPATH="$(pwd)/..:${PYTHONPATH}"
102
103
  pytest
103
104
  --config-file=pyproject.toml -rs
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.11.13'
27
+ rev: 'v0.12.0'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.21.1
3
+ Version: 0.22.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -94,7 +94,7 @@ Requires-Dist: scipy; extra == "tests"
94
94
  Requires-Dist: ultralytics; extra == "tests"
95
95
  Provides-Extra: dev
96
96
  Requires-Dist: datachain[docs,tests]; extra == "dev"
97
- Requires-Dist: mypy==1.16.0; extra == "dev"
97
+ Requires-Dist: mypy==1.16.1; extra == "dev"
98
98
  Requires-Dist: types-python-dateutil; extra == "dev"
99
99
  Requires-Dist: types-pytz; extra == "dev"
100
100
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -0,0 +1,114 @@
1
+ # Handling Local Database Migrations (CLI)
2
+
3
+ When using the DataChain CLI, datasets are stored in a local SQLite database located at:
4
+
5
+ ```
6
+ .datachain/db
7
+ ```
8
+
9
+ Unlike the SaaS version (Studio), the CLI does **not** support automatic database migrations. This means that after upgrading the DataChain CLI, the local database schema may become incompatible with the updated codebase.
10
+
11
+ ## Schema Mismatch Detection
12
+
13
+ The CLI automatically checks for schema compatibility. If a mismatch is detected, you’ll see an error like:
14
+
15
+ ```
16
+ OutdatedDatabaseSchemaError: You have an old version of the database schema. Please refer to the documentation for more information.
17
+ ```
18
+
19
+ This typically happens after upgrading the CLI to a newer version.
20
+
21
+ ## How to Fix It
22
+
23
+ The recommended fix is to **delete the local database** and let the CLI recreate it. To avoid losing datasets, you should **export them before removing the database**.
24
+
25
+ Before deleting the file, we strongly recommend making a backup of your current database:
26
+
27
+ ```bash
28
+ cp .datachain/db .datachain/db.backup
29
+ ```
30
+
31
+ This allows you to recover data manually if needed later.
32
+
33
+ ---
34
+
35
+ ## Exporting and Re-Importing All Local Datasets
36
+
37
+ **Important:** Exporting datasets must be done **before upgrading** to a new DataChain version. Export with the old version to avoid the `OutdatedDatabaseSchemaError` during export. After deleting the database file, upgrade/install the new DataChain version.
38
+
39
+ ### Step 1: Export All Datasets to Parquet
40
+
41
+ Export all datasets into a folder named `exported_datasets` (created if it doesn't exist). Each dataset will be saved to a file in the format:
42
+
43
+ ```
44
+ <dataset_name>.<dataset_version>.parquet
45
+ ```
46
+
47
+ Example: `metrics.1.0.1.parquet`
48
+
49
+ ```python
50
+ import os
51
+ import datachain as dc
52
+
53
+ export_dir = "exported_datasets"
54
+ os.makedirs(export_dir, exist_ok=True)
55
+
56
+ # dc.datasets() returns a chain of DatasetInfo objects
57
+ for ds_info in dc.datasets(column="dataset").to_values("dataset"):
58
+ ds = dc.read_dataset(ds_info.name, version=ds_info.version)
59
+ filename = f"{ds_info.name}.{ds_info.version}.parquet"
60
+ filepath = os.path.join(export_dir, filename)
61
+ ds.to_parquet(filepath)
62
+ ```
63
+
64
+ ### Step 2: Delete Local Database
65
+
66
+ Make sure you've backed it up (see above), then:
67
+
68
+ ```bash
69
+ rm .datachain/db
70
+ ```
71
+
72
+ ### Step 3: Re-import All Datasets from Parquet (In Correct Version Order)
73
+
74
+ To avoid import errors due to semantic versioning constraints, datasets must be imported in ascending order by version for each dataset name.
75
+
76
+ ```python
77
+ import os
78
+ import datachain as dc
79
+ from packaging.version import Version
80
+
81
+ import_dir = "exported_datasets"
82
+
83
+ # Gather all dataset files
84
+ datasets = []
85
+
86
+ for fname in os.listdir(import_dir):
87
+ if not fname.endswith(".parquet"):
88
+ continue
89
+ base = fname[:-8] # remove '.parquet'
90
+ name, version = base.split('.', 1) # split on first dot
91
+ filepath = os.path.join(import_dir, fname)
92
+ datasets.append((name, Version(version), filepath))
93
+
94
+ # Sort by dataset name and then by version ascending
95
+ datasets.sort(key=lambda x: (x[0], x[1]))
96
+
97
+ # Import datasets in order
98
+ for name, version, filepath in datasets:
99
+ dc.read_parquet(filepath).save(name, version=str(version))
100
+ ```
101
+
102
+ **Note:** While exporting and importing datasets to Parquet files preserves the datasets and their data, some metadata — such as dataset dependencies — will **not** be preserved. This information will be lost during this process.
103
+
104
+ ---
105
+
106
+ ## Notes
107
+
108
+ - This limitation only applies to the **CLI**, which uses a local SQLite database.
109
+ - The **Studio (SaaS)** version handles all schema migrations automatically — no manual steps are required.
110
+ - The CLI only supports the default namespace/project: `local.local`.
111
+
112
+ ---
113
+
114
+ This export/import workflow is the recommended way to preserve your datasets during local CLI upgrades that involve database schema changes.
@@ -0,0 +1,18 @@
1
+ # Environment Variables
2
+
3
+ List of environment variables used to configure DataChain behavior.
4
+
5
+ ### Core Configuration
6
+
7
+ - `DATACHAIN_ROOT_DIR` – Specifies the root directory where DataChain will create the `.datachain` folder to store its internal data (default: the current working directory).
8
+ - `DATACHAIN_SYSTEM_CONFIG_DIR` – Overrides the system-wide configuration directory (default depends on the platform).
9
+ - `DATACHAIN_GLOBAL_CONFIG_DIR` – Overrides the user's global configuration directory (default depends on the platform).
10
+ - `DATACHAIN_NO_ANALYTICS` – Disables telemetry.
11
+
12
+ ### Studio Integration
13
+
14
+ - `DATACHAIN_STUDIO_URL` – Custom Studio URL.
15
+ - `DATACHAIN_STUDIO_TOKEN` – Authentication token for Studio.
16
+ - `DATACHAIN_STUDIO_TEAM` – Studio team name.
17
+
18
+ Note: Some environment variables are used internally and may not be documented here. For the most up-to-date list, refer to the source code.
@@ -10,3 +10,6 @@ Welcome to the DataChain User Guide! This section provides comprehensive documen
10
10
  - [Data Processing Overview](./processing.md) - Discover DataChain's specialized data processing features.
11
11
  - [Delta Processing](./delta.md) - Incremental data processing to efficiently handle large datasets that change over time.
12
12
  - [Error Handling and Retries](./retry.md) - Learn how to handle processing errors and selectively reprocess problematic records.
13
+ - [Environment Variables](./env.md) - Configure DataChain's behavior using environment variables.
14
+ - [Namespaces](./namespaces.md) - Learn more about namespaces and projects.
15
+ - [Local DB Migrations](./db_migrations.md) - Learn how to handle local DB migrations after upgrading datachain.
@@ -0,0 +1,119 @@
1
+ # Organizing Datasets with Namespace and Project
2
+
3
+ DataChain allows you to organize datasets using namespaces and projects. These provide an additional structure for managing data across different workflows, use cases, or organizational structures.
4
+
5
+ A dataset in DataChain is organized as:
6
+
7
+ ```
8
+ <namespace>.<project>.<dataset>
9
+ ```
10
+
11
+ For example:
12
+
13
+ ```
14
+ dev.analytics.metrics
15
+ ```
16
+
17
+ ## Default Namespace and Project
18
+
19
+ If no namespace or project is specified, DataChain uses defaults depending on whether you're using **Studio** or the **CLI**.
20
+
21
+ ### Studio
22
+
23
+ - **Namespace:** `users`
24
+ - **Project:** your username (e.g. `jondoe`)
25
+ - Saving without namespace/project:
26
+
27
+ ```python
28
+ dc.read_values(scores=[1.2, 3.4, 2.5]).save("metrics")
29
+ # Saved as users.jondoe.metrics
30
+ ```
31
+
32
+ ### CLI
33
+
34
+ - **Namespace:** `local`
35
+ - **Project:** `local`
36
+ - Saving without namespace/project:
37
+
38
+ ```python
39
+ dc.read_values(scores=[2.0, 2.2, 2.8]).save("metrics")
40
+ # Saved as local.local.metrics
41
+ ```
42
+
43
+ In the CLI, you cannot create or use any namespaces or projects other than the default `local.local`.
44
+
45
+ ## Creating a Project (Studio only)
46
+
47
+ In Studio, you can explicitly create a project and namespace using:
48
+
49
+ ```python
50
+ import datachain as dc
51
+
52
+ dc.create_project("dev", "analytics")
53
+ ```
54
+
55
+ This creates the `dev` namespace (if it doesn't exist) and a project called `analytics` inside it.
56
+
57
+ **Note:** Creating custom namespaces and projects is only supported in **Studio**. In the **CLI**, only the default `local` namespace and `local` project are available.
58
+
59
+ ## Saving a Dataset Using a Fully Qualified Name
60
+
61
+ You can implicitly create and use namespaces and projects by saving a dataset using a fully qualified name:
62
+
63
+ ```python
64
+ dc.read_values(scores=[1.2, 3.4, 2.5]).save("dev.analytics.metrics")
65
+ ```
66
+
67
+ In Studio, this automatically creates the namespace and project if they don’t already exist.
68
+
69
+ In CLI, only `local.local.<dataset>` is supported. Using any other namespace or project will result in an error.
70
+
71
+ ## Using `.settings()` to Set Namespace and Project
72
+
73
+ You can also set the namespace and project using `.settings()`:
74
+
75
+ ```python
76
+ dc.read_values(scores=[1.2, 3.4, 2.5]) \
77
+ .settings(namespace="dev", project="analytics") \
78
+ .save("metrics")
79
+ ```
80
+
81
+ This is equivalent to saving to `dev.analytics.metrics`.
82
+
83
+ In CLI, `.settings()` is only supported when both `namespace` and `project` are set to `"local"`.
84
+
85
+ ## Reading a Dataset from a Project
86
+
87
+ To read a dataset from a specific namespace and project:
88
+
89
+ ```python
90
+ ds = dc.read_dataset("dev.analytics.metrics")
91
+ ```
92
+
93
+ In CLI, this only works for datasets saved in the default `local.local` project.
94
+
95
+
96
+ ## Example (Studio)
97
+
98
+ ```python
99
+ import datachain as dc
100
+
101
+ dc.create_project("prod", "analytics")
102
+
103
+ dc.read_csv("gs://bucket/metrics.csv") \
104
+ .save("prod.analytics.metrics")
105
+
106
+ ds = dc.read_dataset("prod.analytics.metrics")
107
+ ds.show()
108
+ ```
109
+
110
+ ## Example (CLI – default only)
111
+
112
+ ```python
113
+ import datachain as dc
114
+
115
+ dc.read_values(scores=[0.8, 1.5, 2.1]).save("metrics")
116
+
117
+ ds = dc.read_dataset("local.local.metrics")
118
+ ds.show()
119
+ ```
@@ -48,7 +48,7 @@ def main():
48
48
 
49
49
  # Print JSON schema in Pydantic format from main COCO annotation
50
50
  chain = dc.read_storage(uri, anon="True").filter(dc.C("file.path").glob("*.json"))
51
- file = next(chain.limit(1).collect("file"))
51
+ file = chain.limit(1).to_values("file")[0]
52
52
  print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))
53
53
 
54
54
  # Static JSON schema test parsing 3/7 objects
@@ -47,7 +47,7 @@ def process_files_with_delta():
47
47
  print("\nDataset versions:")
48
48
  test_dataset = dc.datasets().filter(C("name") == "test_files")
49
49
 
50
- for version in test_dataset.collect("version"):
50
+ for version in test_dataset.to_iter("version"):
51
51
  print(f"- Version: {version}")
52
52
 
53
53
  # Show the last 3 records to demonstrate the incremental processing
@@ -1,3 +1,5 @@
1
+ import os
2
+
1
3
  from huggingface_hub import InferenceClient
2
4
  from requests import HTTPError
3
5
 
@@ -23,6 +25,7 @@ def eval_dialog(
23
25
  ) -> DialogEval:
24
26
  try:
25
27
  completion = client.chat_completion(
28
+ model="meta-llama/Llama-3.3-70B-Instruct",
26
29
  messages=[
27
30
  {
28
31
  "role": "user",
@@ -31,9 +34,10 @@ def eval_dialog(
31
34
  ],
32
35
  response_format={"type": "json", "value": DialogEval.model_json_schema()},
33
36
  )
34
- except HTTPError:
37
+ except HTTPError as e:
35
38
  return DialogEval(
36
- result="Error", reason="Error while interacting with the Hugging Face API."
39
+ result="Error",
40
+ reason=f"Error while interacting with the Hugging Face API. {e}",
37
41
  )
38
42
 
39
43
  message = completion.choices[0].message
@@ -48,9 +52,15 @@ def eval_dialog(
48
52
  # Save to HF as Parquet. Dataset can be previewed here:
49
53
  # https://huggingface.co/datasets/dvcorg/test-datachain-llm-eval/viewer
50
54
  (
51
- dc.read_csv("hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv")
52
- .settings(parallel=10)
53
- .setup(client=lambda: InferenceClient("meta-llama/Llama-3.1-70B-Instruct"))
55
+ dc.read_csv(
56
+ "hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv", source=False
57
+ )
58
+ .settings(parallel=True)
59
+ .setup(
60
+ client=lambda: InferenceClient(
61
+ provider="hf-inference", api_key=os.environ["HF_TOKEN"]
62
+ )
63
+ )
54
64
  .map(response=eval_dialog)
55
65
  .to_parquet("hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet")
56
66
  )
@@ -105,6 +105,9 @@ nav:
105
105
  - Overview: guide/processing.md
106
106
  - Delta Processing: guide/delta.md
107
107
  - Errors Handling and Retries: guide/retry.md
108
+ - Environment Variables: guide/env.md
109
+ - Namespaces: guide/namespaces.md
110
+ - Local DB Migrations: guide/db_migrations.md
108
111
  - 🤝 Contributing: contributing.md
109
112
 
110
113
  - DataChain Website ↗: https://datachain.ai" target="_blank"
@@ -108,7 +108,7 @@ tests = [
108
108
  ]
109
109
  dev = [
110
110
  "datachain[docs,tests]",
111
- "mypy==1.16.0",
111
+ "mypy==1.16.1",
112
112
  "types-python-dateutil",
113
113
  "types-pytz",
114
114
  "types-PyYAML",
@@ -221,7 +221,8 @@ ignore = [
221
221
  "PERF203", # perflint - try-except-in-loop, irrelevant for Python>=3.11
222
222
  "PERF401",
223
223
  "D100", # undocumented-public-module
224
- "D205" # one-blank-line-after-class
224
+ "D205", # missing-blank-line-after-summary
225
+ "PLC0415" # import-outside-top-level
225
226
  ]
226
227
  select = [
227
228
  "B", # flake8-bugbear
@@ -32,6 +32,7 @@ from datachain.lib.file import (
32
32
  VideoFrame,
33
33
  )
34
34
  from datachain.lib.model_store import ModelStore
35
+ from datachain.lib.projects import create as create_project
35
36
  from datachain.lib.udf import Aggregator, Generator, Mapper
36
37
  from datachain.lib.utils import AbstractUDF, DataChainError
37
38
  from datachain.query import metrics, param
@@ -62,6 +63,7 @@ __all__ = [
62
63
  "VideoFile",
63
64
  "VideoFragment",
64
65
  "VideoFrame",
66
+ "create_project",
65
67
  "datasets",
66
68
  "delete_dataset",
67
69
  "is_chain_type",
@@ -39,7 +39,7 @@ def temporary_cache(
39
39
  cache.destroy()
40
40
 
41
41
 
42
- class Cache:
42
+ class Cache: # noqa: PLW1641
43
43
  def __init__(self, cache_dir: str, tmp_dir: str):
44
44
  self.odb = LocalHashFileDB(
45
45
  LocalFileSystem(),
@@ -76,9 +76,9 @@ class Cache:
76
76
  async def download(
77
77
  self, file: "File", client: "Client", callback: Optional[Callback] = None
78
78
  ) -> None:
79
- from_path = f"{file.source}/{file.path}"
80
79
  from dvc_objects.fs.utils import tmp_fname
81
80
 
81
+ from_path = file.get_uri()
82
82
  odb_fs = self.odb.fs
83
83
  tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
84
84
  size = file.size