datachain 0.15.0__tar.gz → 0.16.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (355) hide show
  1. {datachain-0.15.0/src/datachain.egg-info → datachain-0.16.1}/PKG-INFO +1 -1
  2. {datachain-0.15.0 → datachain-0.16.1}/docs/examples.md +5 -5
  3. {datachain-0.15.0 → datachain-0.16.1}/docs/quick-start.md +3 -3
  4. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/catalog/catalog.py +9 -9
  5. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/__init__.py +1 -1
  6. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/datasets.py +3 -3
  7. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/show.py +2 -2
  8. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/parser/__init__.py +2 -2
  9. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/metastore.py +5 -5
  10. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/dataset.py +8 -8
  11. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/aggregate.py +3 -3
  12. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/values_to_tuples.py +6 -8
  13. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dataset_info.py +18 -0
  14. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/datachain.py +20 -13
  15. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/datasets.py +9 -0
  16. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/records.py +16 -10
  17. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/utils.py +2 -2
  18. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/signal_schema.py +1 -10
  19. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/udf.py +2 -1
  20. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/dataset.py +15 -8
  21. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/schema.py +1 -4
  22. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/remote/studio.py +2 -2
  23. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/studio.py +2 -2
  24. {datachain-0.15.0 → datachain-0.16.1/src/datachain.egg-info}/PKG-INFO +1 -1
  25. {datachain-0.15.0 → datachain-0.16.1}/tests/conftest.py +7 -7
  26. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_datachain.py +4 -4
  27. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_datasets.py +7 -7
  28. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_pull.py +1 -1
  29. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_read_database.py +31 -17
  30. {datachain-0.15.0 → datachain-0.16.1}/tests/test_cli_studio.py +4 -4
  31. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_datachain.py +35 -0
  32. {datachain-0.15.0 → datachain-0.16.1}/.cruft.json +0 -0
  33. {datachain-0.15.0 → datachain-0.16.1}/.gitattributes +0 -0
  34. {datachain-0.15.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  35. {datachain-0.15.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  36. {datachain-0.15.0 → datachain-0.16.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  37. {datachain-0.15.0 → datachain-0.16.1}/.github/codecov.yaml +0 -0
  38. {datachain-0.15.0 → datachain-0.16.1}/.github/dependabot.yml +0 -0
  39. {datachain-0.15.0 → datachain-0.16.1}/.github/workflows/benchmarks.yml +0 -0
  40. {datachain-0.15.0 → datachain-0.16.1}/.github/workflows/release.yml +0 -0
  41. {datachain-0.15.0 → datachain-0.16.1}/.github/workflows/tests-studio.yml +0 -0
  42. {datachain-0.15.0 → datachain-0.16.1}/.github/workflows/tests.yml +0 -0
  43. {datachain-0.15.0 → datachain-0.16.1}/.github/workflows/update-template.yaml +0 -0
  44. {datachain-0.15.0 → datachain-0.16.1}/.gitignore +0 -0
  45. {datachain-0.15.0 → datachain-0.16.1}/.pre-commit-config.yaml +0 -0
  46. {datachain-0.15.0 → datachain-0.16.1}/CODE_OF_CONDUCT.rst +0 -0
  47. {datachain-0.15.0 → datachain-0.16.1}/LICENSE +0 -0
  48. {datachain-0.15.0 → datachain-0.16.1}/README.rst +0 -0
  49. {datachain-0.15.0 → datachain-0.16.1}/docs/assets/captioned_cartoons.png +0 -0
  50. {datachain-0.15.0 → datachain-0.16.1}/docs/assets/datachain-white.svg +0 -0
  51. {datachain-0.15.0 → datachain-0.16.1}/docs/assets/datachain.svg +0 -0
  52. {datachain-0.15.0 → datachain-0.16.1}/docs/contributing.md +0 -0
  53. {datachain-0.15.0 → datachain-0.16.1}/docs/css/github-permalink-style.css +0 -0
  54. {datachain-0.15.0 → datachain-0.16.1}/docs/index.md +0 -0
  55. {datachain-0.15.0 → datachain-0.16.1}/docs/overrides/main.html +0 -0
  56. {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/arrowrow.md +0 -0
  57. {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/bbox.md +0 -0
  58. {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/file.md +0 -0
  59. {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/imagefile.md +0 -0
  60. {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/index.md +0 -0
  61. {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/pose.md +0 -0
  62. {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/segment.md +0 -0
  63. {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/tarvfile.md +0 -0
  64. {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/textfile.md +0 -0
  65. {datachain-0.15.0 → datachain-0.16.1}/docs/references/data-types/videofile.md +0 -0
  66. {datachain-0.15.0 → datachain-0.16.1}/docs/references/datachain.md +0 -0
  67. {datachain-0.15.0 → datachain-0.16.1}/docs/references/func.md +0 -0
  68. {datachain-0.15.0 → datachain-0.16.1}/docs/references/index.md +0 -0
  69. {datachain-0.15.0 → datachain-0.16.1}/docs/references/remotes.md +0 -0
  70. {datachain-0.15.0 → datachain-0.16.1}/docs/references/toolkit.md +0 -0
  71. {datachain-0.15.0 → datachain-0.16.1}/docs/references/torch.md +0 -0
  72. {datachain-0.15.0 → datachain-0.16.1}/docs/references/udf.md +0 -0
  73. {datachain-0.15.0 → datachain-0.16.1}/docs/tutorials.md +0 -0
  74. {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  75. {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  76. {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/openimage-detect.py +0 -0
  77. {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
  78. {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-pose.py +0 -0
  79. {datachain-0.15.0 → datachain-0.16.1}/examples/computer_vision/ultralytics-segment.py +0 -0
  80. {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/common_sql_functions.py +0 -0
  81. {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/json-csv-reader.py +0 -0
  82. {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/torch-loader.py +0 -0
  83. {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/udfs/parallel.py +0 -0
  84. {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/udfs/simple.py +0 -0
  85. {datachain-0.15.0 → datachain-0.16.1}/examples/get_started/udfs/stateful.py +0 -0
  86. {datachain-0.15.0 → datachain-0.16.1}/examples/llm_and_nlp/claude-query.py +0 -0
  87. {datachain-0.15.0 → datachain-0.16.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
  88. {datachain-0.15.0 → datachain-0.16.1}/examples/multimodal/clip_inference.py +0 -0
  89. {datachain-0.15.0 → datachain-0.16.1}/examples/multimodal/hf_pipeline.py +0 -0
  90. {datachain-0.15.0 → datachain-0.16.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
  91. {datachain-0.15.0 → datachain-0.16.1}/examples/multimodal/wds.py +0 -0
  92. {datachain-0.15.0 → datachain-0.16.1}/examples/multimodal/wds_filtered.py +0 -0
  93. {datachain-0.15.0 → datachain-0.16.1}/mkdocs.yml +0 -0
  94. {datachain-0.15.0 → datachain-0.16.1}/noxfile.py +0 -0
  95. {datachain-0.15.0 → datachain-0.16.1}/pyproject.toml +0 -0
  96. {datachain-0.15.0 → datachain-0.16.1}/setup.cfg +0 -0
  97. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/__init__.py +0 -0
  98. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/__main__.py +0 -0
  99. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/asyn.py +0 -0
  100. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cache.py +0 -0
  101. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/catalog/__init__.py +0 -0
  102. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/catalog/datasource.py +0 -0
  103. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/catalog/loader.py +0 -0
  104. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/__init__.py +0 -0
  105. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/du.py +0 -0
  106. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/index.py +0 -0
  107. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/ls.py +0 -0
  108. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/misc.py +0 -0
  109. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/commands/query.py +0 -0
  110. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/parser/job.py +0 -0
  111. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/parser/studio.py +0 -0
  112. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/parser/utils.py +0 -0
  113. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/cli/utils.py +0 -0
  114. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/__init__.py +0 -0
  115. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/azure.py +0 -0
  116. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/fileslice.py +0 -0
  117. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/fsspec.py +0 -0
  118. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/gcs.py +0 -0
  119. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/hf.py +0 -0
  120. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/local.py +0 -0
  121. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/client/s3.py +0 -0
  122. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/config.py +0 -0
  123. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/__init__.py +0 -0
  124. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/db_engine.py +0 -0
  125. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/job.py +0 -0
  126. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/schema.py +0 -0
  127. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/serializer.py +0 -0
  128. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/sqlite.py +0 -0
  129. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/data_storage/warehouse.py +0 -0
  130. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/diff/__init__.py +0 -0
  131. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/error.py +0 -0
  132. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/fs/__init__.py +0 -0
  133. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/fs/reference.py +0 -0
  134. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/fs/utils.py +0 -0
  135. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/__init__.py +0 -0
  136. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/array.py +0 -0
  137. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/base.py +0 -0
  138. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/conditional.py +0 -0
  139. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/func.py +0 -0
  140. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/numeric.py +0 -0
  141. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/path.py +0 -0
  142. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/random.py +0 -0
  143. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/string.py +0 -0
  144. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/func/window.py +0 -0
  145. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/job.py +0 -0
  146. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/__init__.py +0 -0
  147. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/arrow.py +0 -0
  148. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/clip.py +0 -0
  149. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/__init__.py +0 -0
  150. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/flatten.py +0 -0
  151. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
  152. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
  153. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/convert/unflatten.py +0 -0
  154. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/data_model.py +0 -0
  155. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/__init__.py +0 -0
  156. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/csv.py +0 -0
  157. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/database.py +0 -0
  158. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/hf.py +0 -0
  159. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/json.py +0 -0
  160. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/listings.py +0 -0
  161. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/pandas.py +0 -0
  162. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/parquet.py +0 -0
  163. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/storage.py +0 -0
  164. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/dc/values.py +0 -0
  165. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/file.py +0 -0
  166. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/hf.py +0 -0
  167. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/image.py +0 -0
  168. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/listing.py +0 -0
  169. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/listing_info.py +0 -0
  170. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/meta_formats.py +0 -0
  171. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/model_store.py +0 -0
  172. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/pytorch.py +0 -0
  173. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/settings.py +0 -0
  174. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/tar.py +0 -0
  175. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/text.py +0 -0
  176. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/udf_signature.py +0 -0
  177. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/utils.py +0 -0
  178. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/video.py +0 -0
  179. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/webdataset.py +0 -0
  180. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/lib/webdataset_laion.py +0 -0
  181. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/listing.py +0 -0
  182. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/__init__.py +0 -0
  183. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/bbox.py +0 -0
  184. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/pose.py +0 -0
  185. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/segment.py +0 -0
  186. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/ultralytics/__init__.py +0 -0
  187. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/ultralytics/bbox.py +0 -0
  188. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/ultralytics/pose.py +0 -0
  189. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/ultralytics/segment.py +0 -0
  190. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/model/utils.py +0 -0
  191. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/node.py +0 -0
  192. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/nodes_fetcher.py +0 -0
  193. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/nodes_thread_pool.py +0 -0
  194. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/progress.py +0 -0
  195. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/py.typed +0 -0
  196. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/__init__.py +0 -0
  197. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/batch.py +0 -0
  198. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/dispatch.py +0 -0
  199. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/metrics.py +0 -0
  200. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/params.py +0 -0
  201. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/queue.py +0 -0
  202. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/session.py +0 -0
  203. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/udf.py +0 -0
  204. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/query/utils.py +0 -0
  205. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/remote/__init__.py +0 -0
  206. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/script_meta.py +0 -0
  207. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/__init__.py +0 -0
  208. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/default/__init__.py +0 -0
  209. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/default/base.py +0 -0
  210. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/__init__.py +0 -0
  211. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/aggregate.py +0 -0
  212. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/array.py +0 -0
  213. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/conditional.py +0 -0
  214. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/numeric.py +0 -0
  215. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/path.py +0 -0
  216. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/random.py +0 -0
  217. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/functions/string.py +0 -0
  218. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/selectable.py +0 -0
  219. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/sqlite/__init__.py +0 -0
  220. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/sqlite/base.py +0 -0
  221. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/sqlite/types.py +0 -0
  222. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/sqlite/vector.py +0 -0
  223. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/types.py +0 -0
  224. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/sql/utils.py +0 -0
  225. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/telemetry.py +0 -0
  226. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/toolkit/__init__.py +0 -0
  227. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/toolkit/split.py +0 -0
  228. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/torch/__init__.py +0 -0
  229. {datachain-0.15.0 → datachain-0.16.1}/src/datachain/utils.py +0 -0
  230. {datachain-0.15.0 → datachain-0.16.1}/src/datachain.egg-info/SOURCES.txt +0 -0
  231. {datachain-0.15.0 → datachain-0.16.1}/src/datachain.egg-info/dependency_links.txt +0 -0
  232. {datachain-0.15.0 → datachain-0.16.1}/src/datachain.egg-info/entry_points.txt +0 -0
  233. {datachain-0.15.0 → datachain-0.16.1}/src/datachain.egg-info/requires.txt +0 -0
  234. {datachain-0.15.0 → datachain-0.16.1}/src/datachain.egg-info/top_level.txt +0 -0
  235. {datachain-0.15.0 → datachain-0.16.1}/tests/__init__.py +0 -0
  236. {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/__init__.py +0 -0
  237. {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/conftest.py +0 -0
  238. {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  239. {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/datasets/.dvc/config +0 -0
  240. {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/datasets/.gitignore +0 -0
  241. {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  242. {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/test_datachain.py +0 -0
  243. {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/test_ls.py +0 -0
  244. {datachain-0.15.0 → datachain-0.16.1}/tests/benchmarks/test_version.py +0 -0
  245. {datachain-0.15.0 → datachain-0.16.1}/tests/data.py +0 -0
  246. {datachain-0.15.0 → datachain-0.16.1}/tests/examples/__init__.py +0 -0
  247. {datachain-0.15.0 → datachain-0.16.1}/tests/examples/test_examples.py +0 -0
  248. {datachain-0.15.0 → datachain-0.16.1}/tests/examples/test_wds_e2e.py +0 -0
  249. {datachain-0.15.0 → datachain-0.16.1}/tests/examples/wds_data.py +0 -0
  250. {datachain-0.15.0 → datachain-0.16.1}/tests/func/__init__.py +0 -0
  251. {datachain-0.15.0 → datachain-0.16.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
  252. {datachain-0.15.0 → datachain-0.16.1}/tests/func/data/lena.jpg +0 -0
  253. {datachain-0.15.0 → datachain-0.16.1}/tests/func/fake-service-account-credentials.json +0 -0
  254. {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/__init__.py +0 -0
  255. {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/data/running-mask0.png +0 -0
  256. {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/data/running-mask1.png +0 -0
  257. {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/data/running.jpg +0 -0
  258. {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/data/ships.jpg +0 -0
  259. {datachain-0.15.0 → datachain-0.16.1}/tests/func/model/test_yolo.py +0 -0
  260. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_catalog.py +0 -0
  261. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_client.py +0 -0
  262. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_cloud_transfer.py +0 -0
  263. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_data_storage.py +0 -0
  264. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_datachain_merge.py +0 -0
  265. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_dataset_query.py +0 -0
  266. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_feature_pickling.py +0 -0
  267. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_file.py +0 -0
  268. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_hf.py +0 -0
  269. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_hidden_field.py +0 -0
  270. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_image.py +0 -0
  271. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_listing.py +0 -0
  272. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_ls.py +0 -0
  273. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_meta_formats.py +0 -0
  274. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_metrics.py +0 -0
  275. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_pytorch.py +0 -0
  276. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_query.py +0 -0
  277. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_session.py +0 -0
  278. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_toolkit.py +0 -0
  279. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_video.py +0 -0
  280. {datachain-0.15.0 → datachain-0.16.1}/tests/func/test_warehouse.py +0 -0
  281. {datachain-0.15.0 → datachain-0.16.1}/tests/scripts/feature_class.py +0 -0
  282. {datachain-0.15.0 → datachain-0.16.1}/tests/scripts/feature_class_exception.py +0 -0
  283. {datachain-0.15.0 → datachain-0.16.1}/tests/scripts/feature_class_parallel.py +0 -0
  284. {datachain-0.15.0 → datachain-0.16.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  285. {datachain-0.15.0 → datachain-0.16.1}/tests/scripts/name_len_slow.py +0 -0
  286. {datachain-0.15.0 → datachain-0.16.1}/tests/test_atomicity.py +0 -0
  287. {datachain-0.15.0 → datachain-0.16.1}/tests/test_cli_e2e.py +0 -0
  288. {datachain-0.15.0 → datachain-0.16.1}/tests/test_import_time.py +0 -0
  289. {datachain-0.15.0 → datachain-0.16.1}/tests/test_query_e2e.py +0 -0
  290. {datachain-0.15.0 → datachain-0.16.1}/tests/test_telemetry.py +0 -0
  291. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/__init__.py +0 -0
  292. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/__init__.py +0 -0
  293. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/conftest.py +0 -0
  294. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_arrow.py +0 -0
  295. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_clip.py +0 -0
  296. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  297. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_datachain_merge.py +0 -0
  298. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_diff.py +0 -0
  299. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_feature.py +0 -0
  300. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_feature_utils.py +0 -0
  301. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_file.py +0 -0
  302. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_hf.py +0 -0
  303. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_image.py +0 -0
  304. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_listing_info.py +0 -0
  305. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_python_to_sql.py +0 -0
  306. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_schema.py +0 -0
  307. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_signal_schema.py +0 -0
  308. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_sql_to_python.py +0 -0
  309. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_text.py +0 -0
  310. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_udf_signature.py +0 -0
  311. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_utils.py +0 -0
  312. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/lib/test_webdataset.py +0 -0
  313. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/model/__init__.py +0 -0
  314. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/model/test_bbox.py +0 -0
  315. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/model/test_pose.py +0 -0
  316. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/model/test_segment.py +0 -0
  317. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/model/test_utils.py +0 -0
  318. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/__init__.py +0 -0
  319. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/sqlite/__init__.py +0 -0
  320. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/sqlite/test_types.py +0 -0
  321. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
  322. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_array.py +0 -0
  323. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_conditional.py +0 -0
  324. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_path.py +0 -0
  325. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_random.py +0 -0
  326. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_selectable.py +0 -0
  327. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/sql/test_string.py +0 -0
  328. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_asyn.py +0 -0
  329. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_cache.py +0 -0
  330. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_catalog.py +0 -0
  331. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_catalog_loader.py +0 -0
  332. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_cli_parsing.py +0 -0
  333. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_client.py +0 -0
  334. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_client_gcs.py +0 -0
  335. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_client_s3.py +0 -0
  336. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_config.py +0 -0
  337. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_data_storage.py +0 -0
  338. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_database_engine.py +0 -0
  339. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_dataset.py +0 -0
  340. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_dispatch.py +0 -0
  341. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_fileslice.py +0 -0
  342. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_func.py +0 -0
  343. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_listing.py +0 -0
  344. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_metastore.py +0 -0
  345. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_module_exports.py +0 -0
  346. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_pytorch.py +0 -0
  347. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_query.py +0 -0
  348. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_query_metrics.py +0 -0
  349. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_query_params.py +0 -0
  350. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_script_meta.py +0 -0
  351. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_serializer.py +0 -0
  352. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_session.py +0 -0
  353. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_utils.py +0 -0
  354. {datachain-0.15.0 → datachain-0.16.1}/tests/unit/test_warehouse.py +0 -0
  355. {datachain-0.15.0 → datachain-0.16.1}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.15.0
3
+ Version: 0.16.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -94,7 +94,7 @@ dc.DataModel.register(MistralModel)
94
94
  chain = (
95
95
  dc
96
96
  .read_storage("gs://datachain-demo/chatbot-KiT/", type="text")
97
- .filter(dc.Column("file.name").glob("*.txt"))
97
+ .filter(dc.Column("file.path").glob("*.txt"))
98
98
  .limit(5)
99
99
  .settings(parallel=4, cache=True)
100
100
  .map(
@@ -228,7 +228,7 @@ Here is an example from MS COCO “captions” JSON which employs separate secti
228
228
 
229
229
  Note how complicated the setup is. Every image is referenced by name, and the metadata for this file is keyed by the “id” field. This same field is referenced later in the “annotations” array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
230
230
 
231
- However, Datachain can easily parse the entire COCO structure via several reading and merging operators:
231
+ However, DataChain can easily parse the entire COCO structure via several reading and merging operators:
232
232
 
233
233
  ```python
234
234
  import datachain as dc
@@ -240,7 +240,7 @@ images = dc.read_storage(images_uri)
240
240
  meta = dc.read_json(captions_uri, jmespath="images")
241
241
  captions = dc.read_json(captions_uri, jmespath="annotations")
242
242
 
243
- images_meta = images.merge(meta, on="file.name", right_on="images.file_name")
243
+ images_meta = images.merge(meta, on="file.path", right_on="images.file_name")
244
244
  captioned_images = images_meta.merge(captions, on="images.id", right_on="annotations.image_id")
245
245
  ```
246
246
 
@@ -248,12 +248,12 @@ The resulting dataset has image entries as files decorated with all the metadata
248
248
 
249
249
  ```python
250
250
  images_with_dogs = captioned_images.filter(dc.Column("annotations.caption").glob("*dog*"))
251
- images_with_dogs.select("annotations", "file.name").show()
251
+ images_with_dogs.select("annotations", "file.path").show()
252
252
  ```
253
253
 
254
254
  ```
255
255
  captions captions captions file
256
- image_id id caption name
256
+ image_id id caption path
257
257
  0 17029 778902 a dog jumping to catch a frisbee in a yard 000000017029.jpg
258
258
  1 17029 779838 A dog jumping to catch a red frisbee in a garden 000000017029.jpg
259
259
  2 17029 781941 The dog is catching the Frisbee in mid air in ... 000000017029.jpg
@@ -184,7 +184,7 @@ chain = (
184
184
  .save("response")
185
185
  )
186
186
 
187
- chain.select("file.name", "status", "response.usage").show(5)
187
+ chain.select("file.path", "status", "response.usage").show(5)
188
188
 
189
189
  success_rate = chain.filter(dc.Column("status") == "success").count() / chain.count()
190
190
  print(f"{100*success_rate:.1f}% dialogs were successful")
@@ -194,7 +194,7 @@ Output:
194
194
 
195
195
  ``` shell
196
196
  file status response response response
197
- name usage usage usage
197
+ path usage usage usage
198
198
  prompt_tokens total_tokens completion_tokens
199
199
  0 1.txt success 547 548 1
200
200
  1 10.txt failure 3576 3578 2
@@ -277,7 +277,7 @@ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
277
277
 
278
278
  chain = (
279
279
  dc.read_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
280
- .map(label=lambda name: name.split(".")[0], params=["file.name"])
280
+ .map(label=lambda name: name.split(".")[0], params=["file.path"])
281
281
  .select("file", "label").to_pytorch(
282
282
  transform=processor.image_processor,
283
283
  tokenizer=processor.tokenizer,
@@ -776,7 +776,7 @@ class Catalog:
776
776
  listing: Optional[bool] = False,
777
777
  uuid: Optional[str] = None,
778
778
  description: Optional[str] = None,
779
- labels: Optional[list[str]] = None,
779
+ attrs: Optional[list[str]] = None,
780
780
  ) -> "DatasetRecord":
781
781
  """
782
782
  Creates new dataset of a specific version.
@@ -794,16 +794,16 @@ class Catalog:
794
794
  dataset = self.get_dataset(name)
795
795
  default_version = dataset.next_version
796
796
 
797
- if (description or labels) and (
798
- dataset.description != description or dataset.labels != labels
797
+ if (description or attrs) and (
798
+ dataset.description != description or dataset.attrs != attrs
799
799
  ):
800
800
  description = description or dataset.description
801
- labels = labels or dataset.labels
801
+ attrs = attrs or dataset.attrs
802
802
 
803
803
  self.update_dataset(
804
804
  dataset,
805
805
  description=description,
806
- labels=labels,
806
+ attrs=attrs,
807
807
  )
808
808
 
809
809
  except DatasetNotFoundError:
@@ -817,7 +817,7 @@ class Catalog:
817
817
  schema=schema,
818
818
  ignore_if_exists=True,
819
819
  description=description,
820
- labels=labels,
820
+ attrs=attrs,
821
821
  )
822
822
 
823
823
  version = version or default_version
@@ -1334,15 +1334,15 @@ class Catalog:
1334
1334
  name: str,
1335
1335
  new_name: Optional[str] = None,
1336
1336
  description: Optional[str] = None,
1337
- labels: Optional[list[str]] = None,
1337
+ attrs: Optional[list[str]] = None,
1338
1338
  ) -> DatasetRecord:
1339
1339
  update_data = {}
1340
1340
  if new_name:
1341
1341
  update_data["name"] = new_name
1342
1342
  if description is not None:
1343
1343
  update_data["description"] = description
1344
- if labels is not None:
1345
- update_data["labels"] = labels # type: ignore[assignment]
1344
+ if attrs is not None:
1345
+ update_data["attrs"] = attrs # type: ignore[assignment]
1346
1346
 
1347
1347
  dataset = self.get_dataset(name)
1348
1348
  return self.update_dataset(dataset, **update_data)
@@ -149,7 +149,7 @@ def handle_dataset_command(args, catalog):
149
149
  args.name,
150
150
  new_name=args.new_name,
151
151
  description=args.description,
152
- labels=args.labels,
152
+ attrs=args.attrs,
153
153
  studio=args.studio,
154
154
  local=args.local,
155
155
  all=args.all,
@@ -154,7 +154,7 @@ def edit_dataset(
154
154
  name: str,
155
155
  new_name: Optional[str] = None,
156
156
  description: Optional[str] = None,
157
- labels: Optional[list[str]] = None,
157
+ attrs: Optional[list[str]] = None,
158
158
  studio: bool = False,
159
159
  local: bool = False,
160
160
  all: bool = True,
@@ -167,9 +167,9 @@ def edit_dataset(
167
167
 
168
168
  if all or local:
169
169
  try:
170
- catalog.edit_dataset(name, new_name, description, labels)
170
+ catalog.edit_dataset(name, new_name, description, attrs)
171
171
  except DatasetNotFoundError:
172
172
  print("Dataset not found in local", file=sys.stderr)
173
173
 
174
174
  if (all or studio) and token:
175
- edit_studio_dataset(team, name, new_name, description, labels)
175
+ edit_studio_dataset(team, name, new_name, description, attrs)
@@ -42,8 +42,8 @@ def show(
42
42
  print("Name: ", name)
43
43
  if dataset.description:
44
44
  print("Description: ", dataset.description)
45
- if dataset.labels:
46
- print("Labels: ", ",".join(dataset.labels))
45
+ if dataset.attrs:
46
+ print("Attributes: ", ",".join(dataset.attrs))
47
47
  print("\n")
48
48
 
49
49
  show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
@@ -217,9 +217,9 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
217
217
  help="Dataset description",
218
218
  )
219
219
  parse_edit_dataset.add_argument(
220
- "--labels",
220
+ "--attrs",
221
221
  nargs="+",
222
- help="Dataset labels",
222
+ help="Dataset attributes",
223
223
  )
224
224
  parse_edit_dataset.add_argument(
225
225
  "--studio",
@@ -120,7 +120,7 @@ class AbstractMetastore(ABC, Serializable):
120
120
  schema: Optional[dict[str, Any]] = None,
121
121
  ignore_if_exists: bool = False,
122
122
  description: Optional[str] = None,
123
- labels: Optional[list[str]] = None,
123
+ attrs: Optional[list[str]] = None,
124
124
  ) -> DatasetRecord:
125
125
  """Creates new dataset."""
126
126
 
@@ -326,7 +326,7 @@ class AbstractDBMetastore(AbstractMetastore):
326
326
  Column("id", Integer, primary_key=True),
327
327
  Column("name", Text, nullable=False),
328
328
  Column("description", Text),
329
- Column("labels", JSON, nullable=True),
329
+ Column("attrs", JSON, nullable=True),
330
330
  Column("status", Integer, nullable=False),
331
331
  Column("feature_schema", JSON, nullable=True),
332
332
  Column("created_at", DateTime(timezone=True)),
@@ -521,7 +521,7 @@ class AbstractDBMetastore(AbstractMetastore):
521
521
  schema: Optional[dict[str, Any]] = None,
522
522
  ignore_if_exists: bool = False,
523
523
  description: Optional[str] = None,
524
- labels: Optional[list[str]] = None,
524
+ attrs: Optional[list[str]] = None,
525
525
  **kwargs, # TODO registered = True / False
526
526
  ) -> DatasetRecord:
527
527
  """Creates new dataset."""
@@ -538,7 +538,7 @@ class AbstractDBMetastore(AbstractMetastore):
538
538
  query_script=query_script,
539
539
  schema=json.dumps(schema or {}),
540
540
  description=description,
541
- labels=json.dumps(labels or []),
541
+ attrs=json.dumps(attrs or []),
542
542
  )
543
543
  if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
544
544
  # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
@@ -621,7 +621,7 @@ class AbstractDBMetastore(AbstractMetastore):
621
621
  dataset_values = {}
622
622
  for field, value in kwargs.items():
623
623
  if field in self._dataset_fields[1:]:
624
- if field in ["labels", "schema"]:
624
+ if field in ["attrs", "schema"]:
625
625
  values[field] = json.dumps(value) if value else None
626
626
  else:
627
627
  values[field] = value
@@ -329,7 +329,7 @@ class DatasetRecord:
329
329
  id: int
330
330
  name: str
331
331
  description: Optional[str]
332
- labels: list[str]
332
+ attrs: list[str]
333
333
  schema: dict[str, Union[SQLType, type[SQLType]]]
334
334
  feature_schema: dict
335
335
  versions: list[DatasetVersion]
@@ -357,7 +357,7 @@ class DatasetRecord:
357
357
  id: int,
358
358
  name: str,
359
359
  description: Optional[str],
360
- labels: str,
360
+ attrs: str,
361
361
  status: int,
362
362
  feature_schema: Optional[str],
363
363
  created_at: datetime,
@@ -387,7 +387,7 @@ class DatasetRecord:
387
387
  version_schema: str,
388
388
  version_job_id: Optional[str] = None,
389
389
  ) -> "DatasetRecord":
390
- labels_lst: list[str] = json.loads(labels) if labels else []
390
+ attrs_lst: list[str] = json.loads(attrs) if attrs else []
391
391
  schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
392
392
  version_schema_dct: dict[str, str] = (
393
393
  json.loads(version_schema) if version_schema else {}
@@ -418,7 +418,7 @@ class DatasetRecord:
418
418
  id,
419
419
  name,
420
420
  description,
421
- labels_lst,
421
+ attrs_lst,
422
422
  cls.parse_schema(schema_dct), # type: ignore[arg-type]
423
423
  json.loads(feature_schema) if feature_schema else {},
424
424
  [dataset_version],
@@ -562,7 +562,7 @@ class DatasetListRecord:
562
562
  id: int
563
563
  name: str
564
564
  description: Optional[str]
565
- labels: list[str]
565
+ attrs: list[str]
566
566
  versions: list[DatasetListVersion]
567
567
  created_at: Optional[datetime] = None
568
568
 
@@ -572,7 +572,7 @@ class DatasetListRecord:
572
572
  id: int,
573
573
  name: str,
574
574
  description: Optional[str],
575
- labels: str,
575
+ attrs: str,
576
576
  created_at: datetime,
577
577
  version_id: int,
578
578
  version_uuid: str,
@@ -588,7 +588,7 @@ class DatasetListRecord:
588
588
  version_query_script: Optional[str],
589
589
  version_job_id: Optional[str] = None,
590
590
  ) -> "DatasetListRecord":
591
- labels_lst: list[str] = json.loads(labels) if labels else []
591
+ attrs_lst: list[str] = json.loads(attrs) if attrs else []
592
592
 
593
593
  dataset_version = DatasetListVersion.parse(
594
594
  version_id,
@@ -610,7 +610,7 @@ class DatasetListRecord:
610
610
  id,
611
611
  name,
612
612
  description,
613
- labels_lst,
613
+ attrs_lst,
614
614
  [dataset_version],
615
615
  created_at,
616
616
  )
@@ -165,7 +165,7 @@ def any_value(col: str) -> Func:
165
165
  Example:
166
166
  ```py
167
167
  dc.group_by(
168
- file_example=func.any_value("file.name"),
168
+ file_example=func.any_value("file.path"),
169
169
  partition_by="signal.category",
170
170
  )
171
171
  ```
@@ -227,7 +227,7 @@ def concat(col: str, separator="") -> Func:
227
227
  Example:
228
228
  ```py
229
229
  dc.group_by(
230
- files=func.concat("file.name", separator=", "),
230
+ files=func.concat("file.path", separator=", "),
231
231
  partition_by="signal.category",
232
232
  )
233
233
  ```
@@ -343,7 +343,7 @@ def first(col: str) -> Func:
343
343
  ```py
344
344
  window = func.window(partition_by="signal.category", order_by="created_at")
345
345
  dc.mutate(
346
- first_file=func.first("file.name").over(window),
346
+ first_file=func.first("file.path").over(window),
347
347
  )
348
348
  ```
349
349
 
@@ -1,6 +1,6 @@
1
1
  import itertools
2
2
  from collections.abc import Sequence
3
- from typing import Any, Optional, Union
3
+ from typing import Any, Union
4
4
 
5
5
  from datachain.lib.data_model import (
6
6
  DataType,
@@ -71,14 +71,13 @@ def values_to_tuples( # noqa: C901, PLR0912
71
71
  # If a non-None value appears early, it won't check the remaining items for
72
72
  # `None` values.
73
73
  try:
74
- pos, first_not_none_element = next(
75
- itertools.dropwhile(lambda pair: pair[1] is None, enumerate(v))
74
+ first_not_none_element = next(
75
+ itertools.dropwhile(lambda i: i is None, v)
76
76
  )
77
77
  except StopIteration:
78
- typ = str # default to str if all values are None or has length 0
79
- nullable = True
78
+ # set default type to `str` if column is empty or all values are `None`
79
+ typ = str
80
80
  else:
81
- nullable = pos > 0
82
81
  typ = type(first_not_none_element) # type: ignore[assignment]
83
82
  if not is_chain_type(typ):
84
83
  raise ValuesToTupleError(
@@ -88,8 +87,7 @@ def values_to_tuples( # noqa: C901, PLR0912
88
87
  )
89
88
  if isinstance(first_not_none_element, list):
90
89
  typ = list[type(first_not_none_element[0])] # type: ignore[assignment, misc]
91
-
92
- types_map[k] = Optional[typ] if nullable else typ # type: ignore[assignment]
90
+ types_map[k] = typ
93
91
 
94
92
  if length < 0:
95
93
  length = len_
@@ -32,11 +32,28 @@ class DatasetInfo(DataModel):
32
32
  metrics: dict[str, Any] = Field(default={})
33
33
  error_message: str = Field(default="")
34
34
  error_stack: str = Field(default="")
35
+ attrs: list[str] = Field(default=[])
35
36
 
36
37
  @property
37
38
  def is_temp(self) -> bool:
38
39
  return Session.is_temp_dataset(self.name)
39
40
 
41
+ def has_attr(self, attr: str) -> bool:
42
+ s = attr.split("=")
43
+ if len(s) == 1:
44
+ return attr in self.attrs
45
+
46
+ name = s[0]
47
+ value = s[1]
48
+ for a in self.attrs:
49
+ s = a.split("=")
50
+ if value == "*" and s[0] == name:
51
+ return True
52
+ if len(s) == 2 and s[0] == name and s[1] == value:
53
+ return True
54
+
55
+ return False
56
+
40
57
  @staticmethod
41
58
  def _validate_dict(
42
59
  v: Optional[Union[str, dict]],
@@ -83,4 +100,5 @@ class DatasetInfo(DataModel):
83
100
  metrics=job.metrics if job else {},
84
101
  error_message=version.error_message,
85
102
  error_stack=version.error_stack,
103
+ attrs=dataset.attrs,
86
104
  )
@@ -459,7 +459,7 @@ class DataChain:
459
459
  name: str,
460
460
  version: Optional[int] = None,
461
461
  description: Optional[str] = None,
462
- labels: Optional[list[str]] = None,
462
+ attrs: Optional[list[str]] = None,
463
463
  **kwargs,
464
464
  ) -> "Self":
465
465
  """Save to a Dataset. It returns the chain itself.
@@ -468,7 +468,8 @@ class DataChain:
468
468
  name : dataset name.
469
469
  version : version of a dataset. Default - the last version that exist.
470
470
  description : description of a dataset.
471
- labels : labels of a dataset.
471
+ attrs : attributes of a dataset. They can be without value, e.g "NLP",
472
+ or with a value, e.g "location=US".
472
473
  """
473
474
  schema = self.signals_schema.clone_without_sys_signals().serialize()
474
475
  return self._evolve(
@@ -476,7 +477,7 @@ class DataChain:
476
477
  name=name,
477
478
  version=version,
478
479
  description=description,
479
- labels=labels,
480
+ attrs=attrs,
480
481
  feature_schema=schema,
481
482
  **kwargs,
482
483
  )
@@ -755,7 +756,7 @@ class DataChain:
755
756
 
756
757
  Example:
757
758
  ```py
758
- dc.distinct("file.parent", "file.name")
759
+ dc.distinct("file.path")
759
760
  ```
760
761
  """
761
762
  return self._evolve(
@@ -881,7 +882,7 @@ class DataChain:
881
882
  ```py
882
883
  dc.mutate(
883
884
  area=Column("image.height") * Column("image.width"),
884
- extension=file_ext(Column("file.name")),
885
+ extension=file_ext(Column("file.path")),
885
886
  dist=cosine_distance(embedding_text, embedding_image)
886
887
  )
887
888
  ```
@@ -1070,13 +1071,13 @@ class DataChain:
1070
1071
 
1071
1072
  Iterating over all rows with selected columns:
1072
1073
  ```py
1073
- for name, size in dc.collect("file.name", "file.size"):
1074
+ for name, size in dc.collect("file.path", "file.size"):
1074
1075
  print(name, size)
1075
1076
  ```
1076
1077
 
1077
1078
  Iterating over a single column:
1078
1079
  ```py
1079
- for file in dc.collect("file.name"):
1080
+ for file in dc.collect("file.path"):
1080
1081
  print(file)
1081
1082
  ```
1082
1083
  """
@@ -1629,7 +1630,7 @@ class DataChain:
1629
1630
  import datachain as dc
1630
1631
 
1631
1632
  chain = dc.read_storage("s3://mybucket")
1632
- chain = chain.filter(dc.C("file.name").glob("*.jsonl"))
1633
+ chain = chain.filter(dc.C("file.path").glob("*.jsonl"))
1633
1634
  chain = chain.parse_tabular(format="json")
1634
1635
  ```
1635
1636
  """
@@ -2088,25 +2089,31 @@ class DataChain:
2088
2089
 
2089
2090
  Using glob to match patterns
2090
2091
  ```py
2091
- dc.filter(C("file.name").glob("*.jpg"))
2092
+ dc.filter(C("file.path").glob("*.jpg"))
2093
+ ```
2094
+
2095
+ Using in to match lists
2096
+ ```py
2097
+ ids = [1,2,3]
2098
+ dc.filter(C("experiment_id").in_(ids))
2092
2099
  ```
2093
2100
 
2094
2101
  Using `datachain.func`
2095
2102
  ```py
2096
2103
  from datachain.func import string
2097
- dc.filter(string.length(C("file.name")) > 5)
2104
+ dc.filter(string.length(C("file.path")) > 5)
2098
2105
  ```
2099
2106
 
2100
2107
  Combining filters with "or"
2101
2108
  ```py
2102
- dc.filter(C("file.name").glob("cat*") | C("file.name").glob("dog*))
2109
+ dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*))
2103
2110
  ```
2104
2111
 
2105
2112
  Combining filters with "and"
2106
2113
  ```py
2107
2114
  dc.filter(
2108
- C("file.name").glob("*.jpg) &
2109
- (string.length(C("file.name")) > 5)
2115
+ C("file.path").glob("*.jpg) &
2116
+ (string.length(C("file.path")) > 5)
2110
2117
  )
2111
2118
  ```
2112
2119
  """
@@ -102,6 +102,7 @@ def datasets(
102
102
  column: Optional[str] = None,
103
103
  include_listing: bool = False,
104
104
  studio: bool = False,
105
+ attrs: Optional[list[str]] = None,
105
106
  ) -> "DataChain":
106
107
  """Generate chain with list of registered datasets.
107
108
 
@@ -114,6 +115,10 @@ def datasets(
114
115
  include_listing: If True, includes listing datasets. Defaults to False.
115
116
  studio: If True, returns datasets from Studio only,
116
117
  otherwise returns all local datasets. Defaults to False.
118
+ attrs: Optional list of attributes to filter datasets on. It can be just
119
+ attribute without value e.g "NLP", or attribute with value
120
+ e.g "location=US". Attribute with value can also accept "*" to target
121
+ all that have specific name e.g "location=*"
117
122
 
118
123
  Returns:
119
124
  DataChain: A new DataChain instance containing dataset information.
@@ -139,6 +144,10 @@ def datasets(
139
144
  ]
140
145
  datasets_values = [d for d in datasets_values if not d.is_temp]
141
146
 
147
+ if attrs:
148
+ for attr in attrs:
149
+ datasets_values = [d for d in datasets_values if d.has_attr(attr)]
150
+
142
151
  if not column:
143
152
  # flattening dataset fields
144
153
  schema = {
@@ -4,12 +4,9 @@ from typing import TYPE_CHECKING, Optional, Union
4
4
  import sqlalchemy
5
5
 
6
6
  from datachain.lib.data_model import DataType
7
- from datachain.lib.file import (
8
- File,
9
- )
7
+ from datachain.lib.file import File
10
8
  from datachain.lib.signal_schema import SignalSchema
11
9
  from datachain.query import Session
12
- from datachain.query.schema import Column
13
10
 
14
11
  if TYPE_CHECKING:
15
12
  from typing_extensions import ParamSpec
@@ -41,6 +38,9 @@ def read_records(
41
38
  single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
42
39
  ```
43
40
  """
41
+ from datachain.query.dataset import adjust_outputs, get_col_types
42
+ from datachain.sql.types import SQLType
43
+
44
44
  from .datasets import read_dataset
45
45
 
46
46
  session = Session.get(session, in_memory=in_memory)
@@ -52,11 +52,10 @@ def read_records(
52
52
 
53
53
  if schema:
54
54
  signal_schema = SignalSchema(schema)
55
- columns = []
56
- for c in signal_schema.db_signals(as_columns=True):
57
- assert isinstance(c, Column)
58
- kw = {"nullable": c.nullable} if c.nullable is not None else {}
59
- columns.append(sqlalchemy.Column(c.name, c.type, **kw))
55
+ columns = [
56
+ sqlalchemy.Column(c.name, c.type) # type: ignore[union-attr]
57
+ for c in signal_schema.db_signals(as_columns=True)
58
+ ]
60
59
  else:
61
60
  columns = [
62
61
  sqlalchemy.Column(name, typ)
@@ -83,6 +82,13 @@ def read_records(
83
82
  warehouse = catalog.warehouse
84
83
  dr = warehouse.dataset_rows(dsr)
85
84
  table = dr.get_table()
86
- warehouse.insert_rows(table, to_insert)
85
+
86
+ # Optimization: Compute row types once, rather than for every row.
87
+ col_types = get_col_types(
88
+ warehouse,
89
+ {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
90
+ )
91
+ records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
92
+ warehouse.insert_rows(table, records)
87
93
  warehouse.insert_rows_done(table)
88
94
  return read_dataset(name=dsr.name, session=session, settings=settings)
@@ -31,8 +31,8 @@ def resolve_columns(
31
31
  ) -> "Callable[Concatenate[D, P], D]":
32
32
  """Decorator that resolvs input column names to their actual DB names. This is
33
33
  specially important for nested columns as user works with them by using dot
34
- notation e.g (file.name) but are actually defined with default delimiter
35
- in DB, e.g file__name.
34
+ notation e.g (file.path) but are actually defined with default delimiter
35
+ in DB, e.g file__path.
36
36
  If there are any sql functions in arguments, they will just be transferred as is
37
37
  to a method.
38
38
  """
@@ -581,11 +581,7 @@ class SignalSchema:
581
581
  signals = [
582
582
  DEFAULT_DELIMITER.join(path)
583
583
  if not as_columns
584
- else Column(
585
- DEFAULT_DELIMITER.join(path),
586
- python_to_sql(_type),
587
- nullable=is_optional(_type),
588
- )
584
+ else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
589
585
  for path, _type, has_subtree, _ in self.get_flat_tree(
590
586
  include_hidden=include_hidden
591
587
  )
@@ -994,8 +990,3 @@ class SignalSchema:
994
990
  }
995
991
 
996
992
  return SignalSchema.deserialize(schema)
997
-
998
-
999
- def is_optional(type_: Any) -> bool:
1000
- """Check if a type is Optional."""
1001
- return get_origin(type_) is Union and type(None) in get_args(type_)
@@ -474,8 +474,9 @@ class Generator(UDFBase):
474
474
  remove_prefetched=bool(self.prefetch) and not cache,
475
475
  )
476
476
  with closing(prepared_inputs):
477
- for row in processed_cb.wrap(prepared_inputs):
477
+ for row in prepared_inputs:
478
478
  yield _process_row(row)
479
+ processed_cb.relative_update(1)
479
480
 
480
481
  self.teardown()
481
482