datachain 0.3.10__tar.gz → 0.3.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (249) hide show
  1. {datachain-0.3.10 → datachain-0.3.12}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.3.10/src/datachain.egg-info → datachain-0.3.12}/PKG-INFO +7 -5
  3. {datachain-0.3.10 → datachain-0.3.12}/README.rst +6 -3
  4. datachain-0.3.12/docs/assets/datachain-white.svg +1 -0
  5. datachain-0.3.12/docs/assets/datachain.svg +24 -0
  6. {datachain-0.3.10 → datachain-0.3.12}/docs/index.md +1 -1
  7. {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/udfs/stateful.py +4 -0
  8. {datachain-0.3.10 → datachain-0.3.12}/examples/multimodal/clip_inference.py +10 -9
  9. {datachain-0.3.10 → datachain-0.3.12}/examples/multimodal/wds.py +11 -12
  10. {datachain-0.3.10 → datachain-0.3.12}/mkdocs.yml +4 -4
  11. {datachain-0.3.10 → datachain-0.3.12}/pyproject.toml +4 -2
  12. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/catalog/catalog.py +50 -230
  13. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/error.py +0 -4
  14. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/job.py +4 -3
  15. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/clip.py +1 -1
  16. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/dc.py +92 -38
  17. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/file.py +9 -8
  18. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/image.py +1 -1
  19. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/meta_formats.py +38 -59
  20. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/model_store.py +6 -1
  21. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/text.py +1 -1
  22. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/webdataset.py +13 -0
  23. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/webdataset_laion.py +13 -0
  24. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/dataset.py +9 -32
  25. {datachain-0.3.10 → datachain-0.3.12/src/datachain.egg-info}/PKG-INFO +7 -5
  26. {datachain-0.3.10 → datachain-0.3.12}/src/datachain.egg-info/SOURCES.txt +3 -2
  27. {datachain-0.3.10 → datachain-0.3.12}/src/datachain.egg-info/requires.txt +0 -1
  28. {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_catalog.py +23 -96
  29. {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_datasets.py +0 -2
  30. datachain-0.3.12/tests/func/test_meta_formats.py +87 -0
  31. {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_pytorch.py +10 -3
  32. datachain-0.3.12/tests/func/test_query.py +173 -0
  33. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_datachain_merge.py +57 -4
  34. datachain-0.3.12/tests/unit/test_catalog.py +28 -0
  35. datachain-0.3.10/docs/assets/datachain.png +0 -0
  36. datachain-0.3.10/src/datachain/catalog/subclass.py +0 -60
  37. datachain-0.3.10/tests/func/test_query.py +0 -385
  38. datachain-0.3.10/tests/unit/test_catalog.py +0 -170
  39. {datachain-0.3.10 → datachain-0.3.12}/.cruft.json +0 -0
  40. {datachain-0.3.10 → datachain-0.3.12}/.gitattributes +0 -0
  41. {datachain-0.3.10 → datachain-0.3.12}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  42. {datachain-0.3.10 → datachain-0.3.12}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  43. {datachain-0.3.10 → datachain-0.3.12}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  44. {datachain-0.3.10 → datachain-0.3.12}/.github/codecov.yaml +0 -0
  45. {datachain-0.3.10 → datachain-0.3.12}/.github/dependabot.yml +0 -0
  46. {datachain-0.3.10 → datachain-0.3.12}/.github/workflows/benchmarks.yml +0 -0
  47. {datachain-0.3.10 → datachain-0.3.12}/.github/workflows/release.yml +0 -0
  48. {datachain-0.3.10 → datachain-0.3.12}/.github/workflows/tests-studio.yml +0 -0
  49. {datachain-0.3.10 → datachain-0.3.12}/.github/workflows/tests.yml +0 -0
  50. {datachain-0.3.10 → datachain-0.3.12}/.github/workflows/update-template.yaml +0 -0
  51. {datachain-0.3.10 → datachain-0.3.12}/.gitignore +0 -0
  52. {datachain-0.3.10 → datachain-0.3.12}/CODE_OF_CONDUCT.rst +0 -0
  53. {datachain-0.3.10 → datachain-0.3.12}/CONTRIBUTING.rst +0 -0
  54. {datachain-0.3.10 → datachain-0.3.12}/LICENSE +0 -0
  55. {datachain-0.3.10 → datachain-0.3.12}/docs/assets/captioned_cartoons.png +0 -0
  56. {datachain-0.3.10 → datachain-0.3.12}/docs/assets/flowchart.png +0 -0
  57. {datachain-0.3.10 → datachain-0.3.12}/docs/references/datachain.md +0 -0
  58. {datachain-0.3.10 → datachain-0.3.12}/docs/references/datatype.md +0 -0
  59. {datachain-0.3.10 → datachain-0.3.12}/docs/references/file.md +0 -0
  60. {datachain-0.3.10 → datachain-0.3.12}/docs/references/index.md +0 -0
  61. {datachain-0.3.10 → datachain-0.3.12}/docs/references/sql.md +0 -0
  62. {datachain-0.3.10 → datachain-0.3.12}/docs/references/torch.md +0 -0
  63. {datachain-0.3.10 → datachain-0.3.12}/docs/references/udf.md +0 -0
  64. {datachain-0.3.10 → datachain-0.3.12}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  65. {datachain-0.3.10 → datachain-0.3.12}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  66. {datachain-0.3.10 → datachain-0.3.12}/examples/computer_vision/openimage-detect.py +0 -0
  67. {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/common_sql_functions.py +0 -0
  68. {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/json-csv-reader.py +0 -0
  69. {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/torch-loader.py +0 -0
  70. {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/udfs/parallel.py +0 -0
  71. {datachain-0.3.10 → datachain-0.3.12}/examples/get_started/udfs/simple.py +0 -0
  72. {datachain-0.3.10 → datachain-0.3.12}/examples/llm_and_nlp/claude-query.py +0 -0
  73. {datachain-0.3.10 → datachain-0.3.12}/examples/llm_and_nlp/unstructured-text.py +0 -0
  74. {datachain-0.3.10 → datachain-0.3.12}/examples/multimodal/hf_pipeline.py +0 -0
  75. {datachain-0.3.10 → datachain-0.3.12}/examples/multimodal/openai_image_desc_lib.py +0 -0
  76. {datachain-0.3.10 → datachain-0.3.12}/examples/multimodal/wds_filtered.py +0 -0
  77. {datachain-0.3.10 → datachain-0.3.12}/noxfile.py +0 -0
  78. {datachain-0.3.10 → datachain-0.3.12}/setup.cfg +0 -0
  79. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/__init__.py +0 -0
  80. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/__main__.py +0 -0
  81. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/asyn.py +0 -0
  82. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/cache.py +0 -0
  83. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/catalog/__init__.py +0 -0
  84. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/catalog/datasource.py +0 -0
  85. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/catalog/loader.py +0 -0
  86. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/cli.py +0 -0
  87. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/cli_utils.py +0 -0
  88. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/__init__.py +0 -0
  89. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/azure.py +0 -0
  90. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/fileslice.py +0 -0
  91. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/fsspec.py +0 -0
  92. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/gcs.py +0 -0
  93. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/hf.py +0 -0
  94. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/local.py +0 -0
  95. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/client/s3.py +0 -0
  96. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/config.py +0 -0
  97. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/__init__.py +0 -0
  98. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/db_engine.py +0 -0
  99. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/id_generator.py +0 -0
  100. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/job.py +0 -0
  101. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/metastore.py +0 -0
  102. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/schema.py +0 -0
  103. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/serializer.py +0 -0
  104. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/sqlite.py +0 -0
  105. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/data_storage/warehouse.py +0 -0
  106. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/dataset.py +0 -0
  107. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/__init__.py +0 -0
  108. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/arrow.py +0 -0
  109. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/__init__.py +0 -0
  110. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/flatten.py +0 -0
  111. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/python_to_sql.py +0 -0
  112. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/sql_to_python.py +0 -0
  113. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/unflatten.py +0 -0
  114. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  115. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/data_model.py +0 -0
  116. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/dataset_info.py +0 -0
  117. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/hf.py +0 -0
  118. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/listing.py +0 -0
  119. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/listing_info.py +0 -0
  120. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/pytorch.py +0 -0
  121. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/settings.py +0 -0
  122. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/signal_schema.py +0 -0
  123. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/udf.py +0 -0
  124. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/udf_signature.py +0 -0
  125. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/utils.py +0 -0
  126. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/lib/vfile.py +0 -0
  127. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/listing.py +0 -0
  128. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/node.py +0 -0
  129. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/nodes_fetcher.py +0 -0
  130. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/nodes_thread_pool.py +0 -0
  131. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/progress.py +0 -0
  132. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/py.typed +0 -0
  133. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/__init__.py +0 -0
  134. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/batch.py +0 -0
  135. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/builtins.py +0 -0
  136. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/dispatch.py +0 -0
  137. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/metrics.py +0 -0
  138. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/params.py +0 -0
  139. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/queue.py +0 -0
  140. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/schema.py +0 -0
  141. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/session.py +0 -0
  142. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/query/udf.py +0 -0
  143. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/remote/__init__.py +0 -0
  144. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/remote/studio.py +0 -0
  145. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/__init__.py +0 -0
  146. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/default/__init__.py +0 -0
  147. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/default/base.py +0 -0
  148. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/__init__.py +0 -0
  149. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/array.py +0 -0
  150. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/conditional.py +0 -0
  151. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/path.py +0 -0
  152. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/random.py +0 -0
  153. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/functions/string.py +0 -0
  154. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/selectable.py +0 -0
  155. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/sqlite/__init__.py +0 -0
  156. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/sqlite/base.py +0 -0
  157. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/sqlite/types.py +0 -0
  158. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/sqlite/vector.py +0 -0
  159. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/types.py +0 -0
  160. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/sql/utils.py +0 -0
  161. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/storage.py +0 -0
  162. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/torch/__init__.py +0 -0
  163. {datachain-0.3.10 → datachain-0.3.12}/src/datachain/utils.py +0 -0
  164. {datachain-0.3.10 → datachain-0.3.12}/src/datachain.egg-info/dependency_links.txt +0 -0
  165. {datachain-0.3.10 → datachain-0.3.12}/src/datachain.egg-info/entry_points.txt +0 -0
  166. {datachain-0.3.10 → datachain-0.3.12}/src/datachain.egg-info/top_level.txt +0 -0
  167. {datachain-0.3.10 → datachain-0.3.12}/tests/__init__.py +0 -0
  168. {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/__init__.py +0 -0
  169. {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/conftest.py +0 -0
  170. {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  171. {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/datasets/.dvc/config +0 -0
  172. {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/datasets/.gitignore +0 -0
  173. {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  174. {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/test_datachain.py +0 -0
  175. {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/test_ls.py +0 -0
  176. {datachain-0.3.10 → datachain-0.3.12}/tests/benchmarks/test_version.py +0 -0
  177. {datachain-0.3.10 → datachain-0.3.12}/tests/conftest.py +0 -0
  178. {datachain-0.3.10 → datachain-0.3.12}/tests/data.py +0 -0
  179. {datachain-0.3.10 → datachain-0.3.12}/tests/examples/__init__.py +0 -0
  180. {datachain-0.3.10 → datachain-0.3.12}/tests/examples/test_examples.py +0 -0
  181. {datachain-0.3.10 → datachain-0.3.12}/tests/examples/test_wds_e2e.py +0 -0
  182. {datachain-0.3.10 → datachain-0.3.12}/tests/examples/wds_data.py +0 -0
  183. {datachain-0.3.10 → datachain-0.3.12}/tests/func/__init__.py +0 -0
  184. {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_client.py +0 -0
  185. {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_datachain.py +0 -0
  186. {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_dataset_query.py +0 -0
  187. {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_feature_pickling.py +0 -0
  188. {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_listing.py +0 -0
  189. {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_ls.py +0 -0
  190. {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_metrics.py +0 -0
  191. {datachain-0.3.10 → datachain-0.3.12}/tests/func/test_pull.py +0 -0
  192. {datachain-0.3.10 → datachain-0.3.12}/tests/scripts/feature_class.py +0 -0
  193. {datachain-0.3.10 → datachain-0.3.12}/tests/scripts/feature_class_parallel.py +0 -0
  194. {datachain-0.3.10 → datachain-0.3.12}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  195. {datachain-0.3.10 → datachain-0.3.12}/tests/scripts/name_len_slow.py +0 -0
  196. {datachain-0.3.10 → datachain-0.3.12}/tests/test_cli_e2e.py +0 -0
  197. {datachain-0.3.10 → datachain-0.3.12}/tests/test_query_e2e.py +0 -0
  198. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/__init__.py +0 -0
  199. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/__init__.py +0 -0
  200. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/conftest.py +0 -0
  201. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_arrow.py +0 -0
  202. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_clip.py +0 -0
  203. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_datachain.py +0 -0
  204. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  205. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_feature.py +0 -0
  206. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_feature_utils.py +0 -0
  207. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_file.py +0 -0
  208. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_hf.py +0 -0
  209. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_image.py +0 -0
  210. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_schema.py +0 -0
  211. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_signal_schema.py +0 -0
  212. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_sql_to_python.py +0 -0
  213. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_text.py +0 -0
  214. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_udf_signature.py +0 -0
  215. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_utils.py +0 -0
  216. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/lib/test_webdataset.py +0 -0
  217. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/__init__.py +0 -0
  218. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/sqlite/__init__.py +0 -0
  219. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/sqlite/test_utils.py +0 -0
  220. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_array.py +0 -0
  221. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_conditional.py +0 -0
  222. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_path.py +0 -0
  223. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_random.py +0 -0
  224. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_selectable.py +0 -0
  225. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/sql/test_string.py +0 -0
  226. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_asyn.py +0 -0
  227. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_cache.py +0 -0
  228. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_catalog_loader.py +0 -0
  229. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_cli_parsing.py +0 -0
  230. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_client.py +0 -0
  231. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_client_s3.py +0 -0
  232. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_data_storage.py +0 -0
  233. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_database_engine.py +0 -0
  234. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_dataset.py +0 -0
  235. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_dispatch.py +0 -0
  236. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_fileslice.py +0 -0
  237. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_id_generator.py +0 -0
  238. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_listing.py +0 -0
  239. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_metastore.py +0 -0
  240. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_module_exports.py +0 -0
  241. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_query_metrics.py +0 -0
  242. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_query_params.py +0 -0
  243. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_serializer.py +0 -0
  244. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_session.py +0 -0
  245. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_storage.py +0 -0
  246. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_udf.py +0 -0
  247. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_utils.py +0 -0
  248. {datachain-0.3.10 → datachain-0.3.12}/tests/unit/test_warehouse.py +0 -0
  249. {datachain-0.3.10 → datachain-0.3.12}/tests/utils.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.6.3'
27
+ rev: 'v0.6.4'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.10
3
+ Version: 0.3.12
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -71,7 +71,6 @@ Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
71
71
  Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
72
72
  Requires-Dist: pytest-servers[all]>=0.5.5; extra == "tests"
73
73
  Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
74
- Requires-Dist: pytest-asyncio>=0.23.2; extra == "tests"
75
74
  Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
76
75
  Requires-Dist: virtualenv; extra == "tests"
77
76
  Requires-Dist: dulwich; extra == "tests"
@@ -96,8 +95,14 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
96
95
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
97
96
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
98
97
 
98
+ ================
99
+ |logo| DataChain
100
+ ================
101
+
99
102
  |PyPI| |Python Version| |Codecov| |Tests|
100
103
 
104
+ .. |logo| image:: docs/assets/datachain.svg
105
+ :height: 24
101
106
  .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
102
107
  :target: https://pypi.org/project/datachain/
103
108
  :alt: PyPI
@@ -111,9 +116,6 @@ Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
111
116
  :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
112
117
  :alt: Tests
113
118
 
114
- AI 🔗 DataChain
115
- ----------------
116
-
117
119
  DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
118
120
  It is made to organize your unstructured data into datasets and wrangle it at scale on
119
121
  your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
@@ -1,5 +1,11 @@
1
+ ================
2
+ |logo| DataChain
3
+ ================
4
+
1
5
  |PyPI| |Python Version| |Codecov| |Tests|
2
6
 
7
+ .. |logo| image:: docs/assets/datachain.svg
8
+ :height: 24
3
9
  .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
4
10
  :target: https://pypi.org/project/datachain/
5
11
  :alt: PyPI
@@ -13,9 +19,6 @@
13
19
  :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
14
20
  :alt: Tests
15
21
 
16
- AI 🔗 DataChain
17
- ----------------
18
-
19
22
  DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
20
23
  It is made to organize your unstructured data into datasets and wrangle it at scale on
21
24
  your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
@@ -0,0 +1 @@
1
+ <svg width="180" height="33" fill="none" xmlns="http://www.w3.org/2000/svg"><style>.prefix__logo-fill{fill:#fff}</style><path fill-rule="evenodd" clip-rule="evenodd" d="M23.997 24.53l2.34-2.342 3.14 3.135-2.357 2.342a5.533 5.533 0 01-7.822 0l-4.704-4.7a5.536 5.536 0 010-7.823l4.76-4.763 3.124 3.14-4.745 4.747a1.106 1.106 0 000 1.57l4.699 4.694a1.107 1.107 0 001.565 0z" fill="url(#prefix__paint0_linear_449_28)"/><path fill-rule="evenodd" clip-rule="evenodd" d="M37.733 10.65a1.184 1.184 0 01-.234.357L26.253 22.255l3.13 3.135 11.234-11.242a5.536 5.536 0 000-7.824l-4.699-4.705a5.534 5.534 0 00-7.822 0l-3.278 3.263 3.134 3.135 3.268-3.268a1.107 1.107 0 011.564 0l4.694 4.694a1.108 1.108 0 01.244 1.208h.011z" fill="url(#prefix__paint1_linear_449_28)"/><path d="M24.54 14.722L22.2 17.063v.016l-2.405 2.388 3.14 3.134 4.741-4.75a5.534 5.534 0 000-7.822l-4.704-4.704a5.535 5.535 0 00-7.824 0l-5.955 5.954 3.14 3.13 5.944-5.945a1.107 1.107 0 011.565 0l4.7 4.694a1.107 1.107 0 010 1.564z" fill="url(#prefix__paint2_linear_449_28)"/><path d="M4.514 22.335c.054-.133.139-.256.24-.357L7.1 19.632l-.005-.011 3.14-3.129 2.147-2.135-3.135-3.14-7.629 7.638a5.534 5.534 0 000 7.822l4.705 4.704a5.536 5.536 0 007.824 0l3.175-3.18-3.134-3.13-3.165 3.165a1.106 1.106 0 01-1.57 0l-4.7-4.693a1.107 1.107 0 01-.24-1.208z" fill="url(#prefix__paint3_linear_449_28)"/><path d="M55.645 26.613c-.994 0-1.908-.182-2.745-.547a6.407 6.407 0 01-2.169-1.538 7.037 7.037 0 01-1.41-2.294 8.126 8.126 0 01-.497-2.867v-.547c0-1.008.157-1.955.47-2.841a7.478 7.478 0 011.36-2.32 6.201 6.201 0 012.116-1.538c.836-.382 1.76-.573 2.77-.573 1.115 0 2.09.243 2.927.73.854.469 1.533 1.181 2.038 2.137.506.956.784 2.155.837 3.597L60.27 16.76V7.117h3.633v19.027h-2.875v-6.02h.627c-.052 1.441-.348 2.649-.888 3.622-.54.956-1.255 1.677-2.143 2.163-.871.47-1.864.704-2.98.704zm.81-3.05c.714 0 1.367-.156 1.96-.469.592-.33 1.063-.799 1.41-1.407.367-.626.55-1.355.55-2.19v-1.042c0-.834-.183-1.53-.55-2.085a3.572 3.572 0 00-1.436-1.303 4.078 4.078 0 00-1.934-.47c-.784 0-1.481.192-2.091.574-.592.365-1.063.886-1.411 1.564-.331.678-.497 1.468-.497 2.372 0 .903.174 1.694.523 2.372.348.66.819 1.172 1.411 1.537.61.365 1.298.548 2.065.548zM76.635 26.144v-4.196h-.6v-4.666c0-.817-.201-1.425-.602-1.824-.4-.4-1.019-.6-1.855-.6a68.629 68.629 0 00-3.423.104c-.61.018-1.16.044-1.647.079v-3.076c.4-.035.854-.07 1.359-.104.505-.035 1.02-.052 1.542-.052.54-.018 1.045-.026 1.515-.026 1.464 0 2.675.19 3.633.573.976.382 1.707.982 2.195 1.799.505.816.758 1.885.758 3.205v8.784h-2.875zm-4.573.365c-1.028 0-1.934-.183-2.718-.547a4.274 4.274 0 01-1.803-1.564c-.418-.678-.627-1.495-.627-2.45 0-1.043.252-1.894.758-2.555.522-.66 1.245-1.155 2.169-1.485.94-.33 2.038-.495 3.293-.495h3.292v2.163h-3.345c-.836 0-1.48.208-1.934.625-.435.4-.653.921-.653 1.564s.218 1.164.653 1.564c.453.4 1.098.6 1.934.6.505 0 .967-.087 1.385-.261a2.413 2.413 0 001.072-.938c.296-.452.462-1.06.496-1.825l.889 1.017c-.087.99-.331 1.824-.732 2.502a3.899 3.899 0 01-1.62 1.564c-.68.347-1.516.52-2.509.52zM89.569 26.326c-1.307 0-2.387-.165-3.24-.495a3.635 3.635 0 01-1.882-1.72c-.419-.817-.628-1.911-.628-3.284l.026-12.824h3.398l-.026 13.058c0 .695.183 1.234.548 1.616.384.365.924.548 1.62.548h2.222v3.101h-2.038zM81.572 14.65V11.99h10.035v2.659H81.572zM103.203 26.144v-4.196h-.601v-4.666c0-.817-.201-1.425-.601-1.824-.401-.4-1.02-.6-1.856-.6a68.629 68.629 0 00-3.423.104c-.61.018-1.159.044-1.647.079v-3.076c.4-.035.854-.07 1.36-.104.504-.035 1.018-.052 1.541-.052.54-.018 1.045-.026 1.516-.026 1.463 0 2.674.19 3.632.573.976.382 1.708.982 2.196 1.799.505.816.757 1.885.757 3.205v8.784h-2.874zm-4.574.365c-1.028 0-1.934-.183-2.718-.547a4.274 4.274 0 01-1.803-1.564c-.418-.678-.627-1.495-.627-2.45 0-1.043.253-1.894.758-2.555.523-.66 1.246-1.155 2.169-1.485.94-.33 2.038-.495 3.293-.495h3.293v2.163h-3.345c-.837 0-1.481.208-1.934.625-.436.4-.654.921-.654 1.564s.218 1.164.654 1.564c.453.4 1.097.6 1.934.6.505 0 .966-.087 1.385-.261a2.417 2.417 0 001.071-.938c.296-.452.462-1.06.497-1.825l.888 1.017c-.087.99-.331 1.824-.732 2.502a3.897 3.897 0 01-1.62 1.564c-.679.347-1.516.52-2.509.52zM116.267 26.64c-1.237 0-2.309-.21-3.215-.626a6.773 6.773 0 01-2.247-1.668A7.117 7.117 0 01109.472 22a8.19 8.19 0 01-.444-2.659v-.495c0-.956.148-1.868.444-2.737a6.905 6.905 0 011.385-2.346 6.488 6.488 0 012.247-1.642c.906-.417 1.952-.625 3.136-.625 1.237 0 2.344.243 3.319.73.976.469 1.751 1.13 2.326 1.98.593.852.924 1.843.993 2.972h-3.528a2.824 2.824 0 00-.941-1.825c-.522-.486-1.245-.73-2.169-.73-.801 0-1.472.192-2.012.574-.523.382-.915.912-1.176 1.59-.261.66-.392 1.425-.392 2.294 0 .834.122 1.59.366 2.267.261.678.653 1.208 1.176 1.59.54.382 1.228.574 2.065.574.627 0 1.167-.114 1.62-.34.453-.225.81-.538 1.071-.938.279-.4.453-.851.523-1.355h3.528c-.07 1.147-.409 2.155-1.019 3.023-.593.852-1.385 1.52-2.378 2.007-.976.487-2.091.73-3.345.73zM125.919 26.144V7.117h3.633v11.104h-.628c0-1.425.183-2.633.549-3.623.366-.99.906-1.747 1.62-2.268.732-.521 1.656-.782 2.771-.782h.156c1.621 0 2.849.556 3.685 1.668.836 1.112 1.255 2.728 1.255 4.848v8.08h-3.633v-8.419c0-.903-.261-1.616-.784-2.137-.505-.521-1.176-.782-2.012-.782-.889 0-1.612.296-2.169.886-.54.574-.81 1.33-.81 2.268v8.184h-3.633zM151.463 26.144v-4.196h-.601v-4.666c0-.817-.201-1.425-.601-1.824-.401-.4-1.02-.6-1.856-.6a68.524 68.524 0 00-3.423.104c-.61.018-1.159.044-1.647.079v-3.076c.401-.035.854-.07 1.359-.104a22.491 22.491 0 011.542-.052c.54-.018 1.045-.026 1.516-.026 1.463 0 2.674.19 3.632.573.976.382 1.708.982 2.196 1.799.505.816.757 1.885.757 3.205v8.784h-2.874zm-4.574.365c-1.027 0-1.933-.183-2.717-.547a4.277 4.277 0 01-1.804-1.564c-.418-.678-.627-1.495-.627-2.45 0-1.043.253-1.894.758-2.555.523-.66 1.246-1.155 2.169-1.485.941-.33 2.038-.495 3.293-.495h3.293v2.163h-3.345c-.837 0-1.481.208-1.934.625-.436.4-.654.921-.654 1.564s.218 1.164.654 1.564c.453.4 1.097.6 1.934.6.505 0 .967-.087 1.385-.261a2.417 2.417 0 001.071-.938c.296-.452.462-1.06.497-1.825l.888 1.017c-.087.99-.331 1.824-.731 2.502a3.905 3.905 0 01-1.621 1.564c-.679.347-1.516.52-2.509.52zM158.908 26.144V11.99h3.632v14.153h-3.632zm-1.986-11.442v-2.71h5.618v2.71h-5.618zm3.319-4.405c-.715 0-1.246-.183-1.594-.547-.331-.383-.497-.86-.497-1.434 0-.573.166-1.042.497-1.407.348-.365.879-.548 1.594-.548.714 0 1.237.183 1.568.548.331.365.496.834.496 1.407 0 .574-.165 1.051-.496 1.434-.331.364-.854.547-1.568.547zM166.727 26.144V11.99h2.875v6.073h-.262c0-1.442.192-2.641.575-3.597.384-.973.95-1.703 1.699-2.19.766-.486 1.716-.729 2.848-.729h.157c1.69 0 2.971.547 3.842 1.642.871 1.077 1.307 2.693 1.307 4.848v8.106h-3.633v-8.419c0-.869-.253-1.572-.758-2.11-.488-.54-1.167-.809-2.038-.809-.889 0-1.612.278-2.169.834-.54.539-.811 1.269-.811 2.19v8.314h-3.632z" class="prefix__logo-fill"/><defs><linearGradient id="prefix__paint0_linear_449_28" x1="36.032" y1="5.404" x2="18.067" y2="23.054" gradientUnits="userSpaceOnUse"><stop stop-color="#F46837"/><stop offset="1" stop-color="#945DD6"/></linearGradient><linearGradient id="prefix__paint1_linear_449_28" x1="36.045" y1="5.607" x2="18.067" y2="23.363" gradientUnits="userSpaceOnUse"><stop stop-color="#F46837"/><stop offset="1" stop-color="#945DD6"/></linearGradient><linearGradient id="prefix__paint2_linear_449_28" x1="5.924" y1="27.432" x2="23.883" y2="10.239" gradientUnits="userSpaceOnUse"><stop stop-color="#13ADC7"/><stop offset="1" stop-color="#945DD6"/></linearGradient><linearGradient id="prefix__paint3_linear_449_28" x1="5.77" y1="27.586" x2="23.574" y2="9.776" gradientUnits="userSpaceOnUse"><stop stop-color="#13ADC7"/><stop offset="1" stop-color="#945DD6"/></linearGradient></defs></svg>
@@ -0,0 +1,24 @@
1
+ <svg width="33" height="33" viewBox="0 0 33 33" fill="none" xmlns="http://www.w3.org/2000/svg">
2
+ <path fill-rule="evenodd" clip-rule="evenodd" d="M18.7492 22.785L20.5786 20.9554L23.0316 23.4046L21.1898 25.2343C20.379 26.0444 19.2798 26.4994 18.1338 26.4994C16.9878 26.4994 15.8887 26.0444 15.0779 25.2343L11.4025 21.5625C10.5926 20.7516 10.1377 19.6523 10.1377 18.5061C10.1377 17.3599 10.5926 16.2606 11.4025 15.4497L15.1222 11.7285L17.5628 14.182L13.8556 17.8906C13.7748 17.971 13.7106 18.0666 13.6668 18.1719C13.6231 18.2771 13.6005 18.39 13.6005 18.504C13.6005 18.618 13.6231 18.7309 13.6668 18.8362C13.7106 18.9414 13.7748 19.037 13.8556 19.1174L17.5268 22.785C17.689 22.9471 17.9088 23.0381 18.138 23.0381C18.3672 23.0381 18.587 22.9471 18.7492 22.785Z" fill="url(#paint0_linear_426_297)"/>
3
+ <path fill-rule="evenodd" clip-rule="evenodd" d="M29.4817 11.941C29.436 12.0491 29.3736 12.1406 29.2988 12.2196L20.5124 21.0074L22.9571 23.4567L31.7352 14.673C32.5451 13.8621 33 12.7628 33 11.6166C33 10.4704 32.5451 9.3711 31.7352 8.5602L28.064 4.88419C27.2532 4.07415 26.1541 3.61914 25.0081 3.61914C23.8621 3.61914 22.7629 4.07415 21.9521 4.88419L19.3906 7.43354L21.8395 9.88282L24.3927 7.32932C24.5549 7.16731 24.7747 7.07631 25.0039 7.07631C25.2331 7.07631 25.4529 7.16731 25.6151 7.32932L29.2822 10.997C29.404 11.1177 29.4873 11.2718 29.5213 11.4399C29.5554 11.608 29.5387 11.7824 29.4734 11.941H29.4776H29.4817Z" fill="url(#paint1_linear_426_297)"/>
4
+ <path d="M19.1743 15.1218L17.3446 16.9511L17.3446 16.9636L15.4656 18.8289L17.919 21.2778L21.6235 17.5665C22.4336 16.7557 22.8886 15.6566 22.8886 14.5106C22.8886 13.3646 22.4336 12.2654 21.6235 11.4547L17.9475 7.77926C17.1366 6.96935 16.0373 6.51442 14.8911 6.51442C13.7449 6.51442 12.6456 6.96935 11.8347 7.77926L7.18188 12.4318L9.63532 14.8765L14.2799 10.2323C14.442 10.0703 14.6619 9.97933 14.8911 9.97933C15.1204 9.97933 15.3402 10.0703 15.5024 10.2323L19.1743 13.8994C19.3363 14.0615 19.4273 14.2814 19.4273 14.5106C19.4273 14.7398 19.3363 14.9596 19.1743 15.1218Z" fill="url(#paint2_linear_426_297)"/>
5
+ <path d="M3.52721 21.0699C3.56879 20.966 3.63532 20.8703 3.71433 20.7913L5.54818 18.9578L5.54402 18.9495L7.99746 16.5048L9.6749 14.8364L7.22562 12.3834L1.26505 18.3508C0.455006 19.1615 -3.99616e-07 20.2607 -3.49523e-07 21.4067C-2.9943e-07 22.5527 0.455006 23.6518 1.26505 24.4626L4.94105 28.138C5.75196 28.9479 6.85127 29.4028 7.99746 29.4028C9.14364 29.4028 10.243 28.9479 11.0539 28.138L13.5353 25.6527L11.086 23.208L8.6129 25.6808C8.53251 25.7616 8.43695 25.8258 8.33168 25.8695C8.22642 25.9133 8.11354 25.9358 7.99954 25.9358C7.88553 25.9358 7.77265 25.9133 7.66739 25.8695C7.56213 25.8258 7.46656 25.7616 7.38618 25.6808L3.71433 22.0137C3.59316 21.8926 3.51069 21.7383 3.47737 21.5702C3.44406 21.4022 3.4614 21.2281 3.52721 21.0699Z" fill="url(#paint3_linear_426_297)"/>
6
+ <defs>
7
+ <linearGradient id="paint0_linear_426_297" x1="28.1527" y1="7.84149" x2="14.1164" y2="21.6319" gradientUnits="userSpaceOnUse">
8
+ <stop stop-color="#F46837"/>
9
+ <stop offset="1" stop-color="#945DD6"/>
10
+ </linearGradient>
11
+ <linearGradient id="paint1_linear_426_297" x1="28.1626" y1="8.00042" x2="14.1164" y2="21.8731" gradientUnits="userSpaceOnUse">
12
+ <stop stop-color="#F46837"/>
13
+ <stop offset="1" stop-color="#945DD6"/>
14
+ </linearGradient>
15
+ <linearGradient id="paint2_linear_426_297" x1="4.62869" y1="25.0522" x2="18.6605" y2="11.619" gradientUnits="userSpaceOnUse">
16
+ <stop stop-color="#13ADC7"/>
17
+ <stop offset="1" stop-color="#945DD6"/>
18
+ </linearGradient>
19
+ <linearGradient id="paint3_linear_426_297" x1="4.50795" y1="25.1728" x2="18.4191" y2="11.2572" gradientUnits="userSpaceOnUse">
20
+ <stop stop-color="#13ADC7"/>
21
+ <stop offset="1" stop-color="#945DD6"/>
22
+ </linearGradient>
23
+ </defs>
24
+ </svg>
@@ -1,4 +1,4 @@
1
- # 🔗 DataChain Getting Started
1
+ # Get Started with DataChain
2
2
 
3
3
  🔨Wrangle unstructured AI data at scale
4
4
 
@@ -5,6 +5,10 @@ To install dependencies:
5
5
 
6
6
  """
7
7
 
8
+ import os
9
+
10
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
11
+
8
12
  import open_clip
9
13
 
10
14
  from datachain import C, DataChain, Mapper
@@ -4,22 +4,23 @@ from torch.nn.functional import cosine_similarity
4
4
  from torch.utils.data import DataLoader
5
5
 
6
6
  from datachain import C, DataChain
7
+ from datachain.sql.functions import path
7
8
 
8
9
  source = "gs://datachain-demo/50k-laion-files/000000/00000000*"
9
10
 
10
11
 
11
12
  def create_dataset():
12
- imgs = (
13
- DataChain.from_storage(source, type="image")
14
- .filter(C("file.path").glob("*.jpg"))
15
- .map(stem=lambda file: file.get_file_stem(), params=["file"], output=str)
13
+ imgs = DataChain.from_storage(source, type="image").filter(
14
+ C("file.path").glob("*.jpg")
16
15
  )
17
- captions = (
18
- DataChain.from_storage(source, type="text")
19
- .filter(C("file.path").glob("*.txt"))
20
- .map(stem=lambda file: file.get_file_stem(), params=["file"], output=str)
16
+ captions = DataChain.from_storage(source, type="text").filter(
17
+ C("file.path").glob("*.txt")
18
+ )
19
+ return imgs.merge(
20
+ captions,
21
+ on=path.file_stem(imgs.c("file.path")),
22
+ right_on=path.file_stem(captions.c("file.path")),
21
23
  )
22
- return imgs.merge(captions, on="stem")
23
24
 
24
25
 
25
26
  if __name__ == "__main__":
@@ -1,6 +1,6 @@
1
1
  import os
2
2
 
3
- from datachain import C, DataChain
3
+ from datachain import DataChain
4
4
  from datachain.lib.webdataset import process_webdataset
5
5
  from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
6
6
  from datachain.sql.functions import path
@@ -16,7 +16,7 @@ NPZ_METADATA = os.getenv(
16
16
  )
17
17
 
18
18
  wds_images = (
19
- DataChain.from_storage(IMAGE_TARS)
19
+ DataChain.from_storage(IMAGE_TARS, type="image")
20
20
  .settings(cache=True)
21
21
  .gen(laion=process_webdataset(spec=WDSLaion), params="file")
22
22
  )
@@ -25,21 +25,20 @@ wds_with_pq = (
25
25
  DataChain.from_parquet(PARQUET_METADATA)
26
26
  .settings(cache=True)
27
27
  .merge(wds_images, on="uid", right_on="laion.json.uid", inner=True)
28
- .mutate(stem=path.file_stem(C("source.file.path")))
29
28
  )
30
29
 
31
- res = (
30
+ wds_npz = (
32
31
  DataChain.from_storage(NPZ_METADATA)
33
32
  .settings(cache=True)
34
33
  .gen(emd=process_laion_meta)
35
- .mutate(stem=path.file_stem(C("emd.file.path")))
36
- .merge(
37
- wds_with_pq,
38
- on=["stem", "emd.index"],
39
- right_on=["stem", "source.index"],
40
- inner=True,
41
- )
42
- .save("wds")
43
34
  )
44
35
 
36
+
37
+ res = wds_npz.merge(
38
+ wds_with_pq,
39
+ on=[path.file_stem(wds_npz.c("emd.file.path")), "emd.index"],
40
+ right_on=[path.file_stem(wds_with_pq.c("source.file.path")), "source.index"],
41
+ inner=True,
42
+ ).save("wds")
43
+
45
44
  res.show(5)
@@ -1,5 +1,5 @@
1
- site_name: DataChain
2
- site_url: https://datachain.dvc.ai
1
+ site_name: ''
2
+ site_url: https://docs.datachain.ai
3
3
  site_description: Wrangle unstructured AI data at scale
4
4
 
5
5
  repo_url: "https://github.com/iterative/datachain"
@@ -15,8 +15,8 @@ validation:
15
15
 
16
16
  theme:
17
17
  name: material
18
- logo: assets/datachain.png
19
- favicon: assets/datachain.png
18
+ logo: assets/datachain-white.svg
19
+ favicon: assets/datachain.svg
20
20
  icon:
21
21
  repo: fontawesome/brands/github
22
22
  features:
@@ -82,7 +82,6 @@ tests = [
82
82
  "pytest-mock>=3.12.0",
83
83
  "pytest-servers[all]>=0.5.5",
84
84
  "pytest-benchmark[histogram]",
85
- "pytest-asyncio>=0.23.2",
86
85
  "pytest-xdist>=3.3.1",
87
86
  "virtualenv",
88
87
  "dulwich",
@@ -136,13 +135,16 @@ markers = [
136
135
  "llm_and_nlp: LLM and NLP examples",
137
136
  "multimodal: Multimodal examples"
138
137
  ]
139
- asyncio_mode = "auto"
140
138
  filterwarnings = [
141
139
  "error::pandas.errors.PerformanceWarning",
142
140
  "error::pydantic.warnings.PydanticDeprecatedSince20",
143
141
  "error::pytest_mock.PytestMockWarning",
144
142
  "error::pytest.PytestCollectionWarning",
145
143
  "error::sqlalchemy.exc.SADeprecationWarning",
144
+ "ignore::DeprecationWarning:timm.*",
145
+ "ignore::DeprecationWarning:botocore.auth",
146
+ "ignore::DeprecationWarning:datasets.utils._dill",
147
+ "ignore::DeprecationWarning:librosa.core.intervals",
146
148
  "ignore:Field name .* shadows an attribute in parent:UserWarning" # datachain.lib.feature
147
149
  ]
148
150
 
@@ -9,11 +9,9 @@ import os.path
9
9
  import posixpath
10
10
  import subprocess
11
11
  import sys
12
- import tempfile
13
12
  import time
14
13
  import traceback
15
14
  from collections.abc import Iterable, Iterator, Mapping, Sequence
16
- from contextlib import contextmanager, nullcontext
17
15
  from copy import copy
18
16
  from dataclasses import dataclass
19
17
  from functools import cached_property, reduce
@@ -24,7 +22,6 @@ from typing import (
24
22
  TYPE_CHECKING,
25
23
  Any,
26
24
  Callable,
27
- NamedTuple,
28
25
  NoReturn,
29
26
  Optional,
30
27
  Union,
@@ -59,7 +56,6 @@ from datachain.error import (
59
56
  PendingIndexingError,
60
57
  QueryScriptCancelError,
61
58
  QueryScriptCompileError,
62
- QueryScriptDatasetNotFound,
63
59
  QueryScriptRunError,
64
60
  )
65
61
  from datachain.listing import Listing
@@ -77,7 +73,6 @@ from datachain.utils import (
77
73
  )
78
74
 
79
75
  from .datasource import DataSource
80
- from .subclass import SubclassFinder
81
76
 
82
77
  if TYPE_CHECKING:
83
78
  from datachain.data_storage import (
@@ -92,7 +87,6 @@ logger = logging.getLogger("datachain")
92
87
 
93
88
  DEFAULT_DATASET_DIR = "dataset"
94
89
  DATASET_FILE_SUFFIX = ".edatachain"
95
- FEATURE_CLASSES = ["DataModel"]
96
90
 
97
91
  TTL_INT = 4 * 60 * 60
98
92
 
@@ -118,44 +112,19 @@ def noop(_: str):
118
112
  pass
119
113
 
120
114
 
121
- @contextmanager
122
- def print_and_capture(
123
- stream: "IO[bytes]|IO[str]", callback: Callable[[str], None] = noop
124
- ) -> "Iterator[list[str]]":
125
- lines: list[str] = []
126
- append = lines.append
115
+ def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
116
+ buffer = b""
117
+ while byt := stream.read(1): # Read one byte at a time
118
+ buffer += byt
127
119
 
128
- def loop() -> None:
129
- buffer = b""
130
- while byt := stream.read(1): # Read one byte at a time
131
- buffer += byt.encode("utf-8") if isinstance(byt, str) else byt
132
-
133
- if byt in (b"\n", b"\r"): # Check for newline or carriage return
134
- line = buffer.decode("utf-8")
135
- print(line, end="")
136
- callback(line)
137
- append(line)
138
- buffer = b"" # Clear buffer for next line
139
-
140
- if buffer: # Handle any remaining data in the buffer
120
+ if byt in (b"\n", b"\r"): # Check for newline or carriage return
141
121
  line = buffer.decode("utf-8")
142
- print(line, end="")
143
122
  callback(line)
144
- append(line)
145
-
146
- thread = Thread(target=loop, daemon=True)
147
- thread.start()
148
-
149
- try:
150
- yield lines
151
- finally:
152
- thread.join()
123
+ buffer = b"" # Clear buffer for next line
153
124
 
154
-
155
- class QueryResult(NamedTuple):
156
- dataset: Optional[DatasetRecord]
157
- version: Optional[int]
158
- output: str
125
+ if buffer: # Handle any remaining data in the buffer
126
+ line = buffer.decode("utf-8")
127
+ callback(line)
159
128
 
160
129
 
161
130
  class DatasetRowsFetcher(NodesThreadPool):
@@ -569,12 +538,6 @@ def find_column_to_str( # noqa: PLR0911
569
538
  return ""
570
539
 
571
540
 
572
- def form_module_source(source_ast):
573
- module = ast.Module(body=source_ast, type_ignores=[])
574
- module = ast.fix_missing_locations(module)
575
- return ast.unparse(module)
576
-
577
-
578
541
  class Catalog:
579
542
  def __init__(
580
543
  self,
@@ -658,34 +621,8 @@ class Catalog:
658
621
  ),
659
622
  ]
660
623
  code_ast.body[-1:] = new_expressions
661
- else:
662
- raise Exception("Last line in a script was not an expression")
663
624
  return code_ast
664
625
 
665
- def compile_query_script(
666
- self, script: str, feature_module_name: str
667
- ) -> tuple[Union[str, None], str]:
668
- code_ast = ast.parse(script)
669
- code_ast = self.attach_query_wrapper(code_ast)
670
- finder = SubclassFinder(FEATURE_CLASSES)
671
- finder.visit(code_ast)
672
-
673
- if not finder.feature_class:
674
- main_module = form_module_source([*finder.imports, *finder.main_body])
675
- return None, main_module
676
-
677
- feature_import = ast.ImportFrom(
678
- module=feature_module_name,
679
- names=[ast.alias(name="*", asname=None)],
680
- level=0,
681
- )
682
- feature_module = form_module_source([*finder.imports, *finder.feature_class])
683
- main_module = form_module_source(
684
- [*finder.imports, feature_import, *finder.main_body]
685
- )
686
-
687
- return feature_module, main_module
688
-
689
626
  def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
690
627
  config = config or self.client_config
691
628
  return Client.parse_url(uri, self.cache, **config)
@@ -1416,7 +1353,8 @@ class Catalog:
1416
1353
 
1417
1354
  for d in datasets:
1418
1355
  yield from (
1419
- (d, v, jobs.get(v.job_id) if v.job_id else None) for v in d.versions
1356
+ (d, v, jobs.get(str(v.job_id)) if v.job_id else None)
1357
+ for v in d.versions
1420
1358
  )
1421
1359
 
1422
1360
  def ls_dataset_rows(
@@ -1834,14 +1772,15 @@ class Catalog:
1834
1772
  def query(
1835
1773
  self,
1836
1774
  query_script: str,
1837
- envs: Optional[Mapping[str, str]] = None,
1838
- python_executable: Optional[str] = None,
1775
+ env: Optional[Mapping[str, str]] = None,
1776
+ python_executable: str = sys.executable,
1839
1777
  save: bool = False,
1840
1778
  capture_output: bool = True,
1841
1779
  output_hook: Callable[[str], None] = noop,
1842
1780
  params: Optional[dict[str, str]] = None,
1843
1781
  job_id: Optional[str] = None,
1844
- ) -> QueryResult:
1782
+ _execute_last_expression: bool = False,
1783
+ ) -> None:
1845
1784
  """
1846
1785
  Method to run custom user Python script to run a query and, as result,
1847
1786
  creates new dataset from the results of a query.
@@ -1864,170 +1803,51 @@ class Catalog:
1864
1803
  C.size > 1000
1865
1804
  )
1866
1805
  """
1867
-
1868
- feature_file = tempfile.NamedTemporaryFile( # noqa: SIM115
1869
- dir=os.getcwd(), suffix=".py", delete=False
1870
- )
1871
- _, feature_module = os.path.split(feature_file.name)
1872
-
1873
- try:
1874
- lines, proc, response_text = self.run_query(
1875
- python_executable or sys.executable,
1876
- query_script,
1877
- envs,
1878
- feature_file,
1879
- capture_output,
1880
- feature_module,
1881
- output_hook,
1882
- params,
1883
- save,
1884
- job_id,
1885
- )
1886
- finally:
1887
- feature_file.close()
1888
- os.unlink(feature_file.name)
1889
-
1890
- output = "".join(lines)
1891
-
1892
- if proc.returncode:
1893
- if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
1894
- raise QueryScriptCancelError(
1895
- "Query script was canceled by user",
1896
- return_code=proc.returncode,
1897
- output=output,
1898
- )
1899
- if proc.returncode == QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE:
1900
- raise QueryScriptRunError(
1901
- "Last line in a script was not an instance of DataChain",
1902
- return_code=proc.returncode,
1903
- output=output,
1904
- )
1905
- raise QueryScriptRunError(
1906
- f"Query script exited with error code {proc.returncode}",
1907
- return_code=proc.returncode,
1908
- output=output,
1909
- )
1910
-
1911
- try:
1912
- result = json.loads(response_text)
1913
- except ValueError:
1914
- result = None
1915
-
1916
- dataset: Optional[DatasetRecord] = None
1917
- version: Optional[int] = None
1918
- if save:
1919
- dataset, version = self.save_result(
1920
- query_script, result, output, version, job_id
1921
- )
1922
-
1923
- return QueryResult(dataset=dataset, version=version, output=output)
1924
-
1925
- def run_query(
1926
- self,
1927
- python_executable: str,
1928
- query_script: str,
1929
- envs: Optional[Mapping[str, str]],
1930
- feature_file: IO[bytes],
1931
- capture_output: bool,
1932
- feature_module: str,
1933
- output_hook: Callable[[str], None],
1934
- params: Optional[dict[str, str]],
1935
- save: bool,
1936
- job_id: Optional[str],
1937
- ) -> tuple[list[str], subprocess.Popen, str]:
1938
- try:
1939
- feature_code, query_script_compiled = self.compile_query_script(
1940
- query_script, feature_module[:-3]
1941
- )
1942
- if feature_code:
1943
- feature_file.write(feature_code.encode())
1944
- feature_file.flush()
1945
-
1946
- except Exception as exc:
1947
- raise QueryScriptCompileError(
1948
- f"Query script failed to compile, reason: {exc}"
1949
- ) from exc
1950
- r, w = os.pipe()
1951
- if os.name == "nt":
1952
- import msvcrt
1953
-
1954
- os.set_inheritable(w, True)
1955
-
1956
- startupinfo = subprocess.STARTUPINFO() # type: ignore[attr-defined]
1957
- handle = msvcrt.get_osfhandle(w) # type: ignore[attr-defined]
1958
- startupinfo.lpAttributeList["handle_list"].append(handle)
1959
- kwargs: dict[str, Any] = {"startupinfo": startupinfo}
1806
+ if _execute_last_expression:
1807
+ try:
1808
+ code_ast = ast.parse(query_script)
1809
+ code_ast = self.attach_query_wrapper(code_ast)
1810
+ query_script_compiled = ast.unparse(code_ast)
1811
+ except Exception as exc:
1812
+ raise QueryScriptCompileError(
1813
+ f"Query script failed to compile, reason: {exc}"
1814
+ ) from exc
1960
1815
  else:
1961
- handle = w
1962
- kwargs = {"pass_fds": [w]}
1963
- envs = dict(envs or os.environ)
1964
- if feature_code:
1965
- envs["DATACHAIN_FEATURE_CLASS_SOURCE"] = json.dumps(
1966
- {feature_module: feature_code}
1967
- )
1968
- envs.update(
1816
+ query_script_compiled = query_script
1817
+ assert not save
1818
+
1819
+ env = dict(env or os.environ)
1820
+ env.update(
1969
1821
  {
1970
1822
  "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
1971
1823
  "PYTHONPATH": os.getcwd(), # For local imports
1972
1824
  "DATACHAIN_QUERY_SAVE": "1" if save else "",
1973
1825
  "PYTHONUNBUFFERED": "1",
1974
- "DATACHAIN_OUTPUT_FD": str(handle),
1975
1826
  "DATACHAIN_JOB_ID": job_id or "",
1976
1827
  },
1977
1828
  )
1978
- with subprocess.Popen( # noqa: S603
1979
- [python_executable, "-c", query_script_compiled],
1980
- env=envs,
1981
- stdout=subprocess.PIPE if capture_output else None,
1982
- stderr=subprocess.STDOUT if capture_output else None,
1983
- bufsize=1,
1984
- text=False,
1985
- **kwargs,
1986
- ) as proc:
1987
- os.close(w)
1988
-
1989
- out = proc.stdout
1990
- _lines: list[str] = []
1991
- ctx = print_and_capture(out, output_hook) if out else nullcontext(_lines)
1992
-
1993
- with ctx as lines, open(r) as f:
1994
- response_text = ""
1995
- while proc.poll() is None:
1996
- response_text += f.readline()
1997
- time.sleep(0.1)
1998
- response_text += f.readline()
1999
- return lines, proc, response_text
2000
-
2001
- def save_result(self, query_script, exec_result, output, version, job_id):
2002
- if not exec_result:
2003
- raise QueryScriptDatasetNotFound(
2004
- "No dataset found after running Query script",
2005
- output=output,
1829
+ popen_kwargs = {}
1830
+ if capture_output:
1831
+ popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
1832
+
1833
+ cmd = [python_executable, "-c", query_script_compiled]
1834
+ with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc: # type: ignore[call-overload] # noqa: S603
1835
+ if capture_output:
1836
+ args = (proc.stdout, output_hook)
1837
+ thread = Thread(target=_process_stream, args=args, daemon=True)
1838
+ thread.start()
1839
+ thread.join() # wait for the reader thread
1840
+
1841
+ if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
1842
+ raise QueryScriptCancelError(
1843
+ "Query script was canceled by user",
1844
+ return_code=proc.returncode,
1845
+ )
1846
+ if proc.returncode:
1847
+ raise QueryScriptRunError(
1848
+ f"Query script exited with error code {proc.returncode}",
1849
+ return_code=proc.returncode,
2006
1850
  )
2007
- name, version = exec_result
2008
- # finding returning dataset
2009
- try:
2010
- dataset = self.get_dataset(name)
2011
- dataset.get_version(version)
2012
- except (DatasetNotFoundError, ValueError) as e:
2013
- raise QueryScriptDatasetNotFound(
2014
- "No dataset found after running Query script",
2015
- output=output,
2016
- ) from e
2017
- dataset = self.update_dataset(
2018
- dataset,
2019
- script_output=output,
2020
- query_script=query_script,
2021
- )
2022
- self.update_dataset_version_with_warehouse_info(
2023
- dataset,
2024
- version,
2025
- script_output=output,
2026
- query_script=query_script,
2027
- job_id=job_id,
2028
- is_job_result=True,
2029
- )
2030
- return dataset, version
2031
1851
 
2032
1852
  def cp(
2033
1853
  self,
@@ -42,10 +42,6 @@ class QueryScriptRunError(Exception):
42
42
  super().__init__(self.message)
43
43
 
44
44
 
45
- class QueryScriptDatasetNotFound(QueryScriptRunError): # noqa: N818
46
- pass
47
-
48
-
49
45
  class QueryScriptCancelError(QueryScriptRunError):
50
46
  pass
51
47