datachain 0.1.10__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (233) hide show
  1. {datachain-0.1.10/src/datachain.egg-info → datachain-0.1.12}/PKG-INFO +3 -1
  2. {datachain-0.1.10 → datachain-0.1.12}/examples/clip.py +8 -12
  3. datachain-0.1.12/examples/json-csv-reader.py +87 -0
  4. {datachain-0.1.10 → datachain-0.1.12}/examples/torch-loader.py +4 -9
  5. {datachain-0.1.10 → datachain-0.1.12}/pyproject.toml +3 -1
  6. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/_version.py +2 -2
  7. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/catalog/catalog.py +47 -3
  8. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/metastore.py +2 -0
  9. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/dataset.py +5 -7
  10. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/dc.py +150 -7
  11. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/feature.py +0 -10
  12. datachain-0.1.12/src/datachain/lib/meta_formats.py +164 -0
  13. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/pytorch.py +33 -4
  14. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/signal_schema.py +63 -6
  15. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/dataset.py +1 -1
  16. {datachain-0.1.10 → datachain-0.1.12/src/datachain.egg-info}/PKG-INFO +3 -1
  17. {datachain-0.1.10 → datachain-0.1.12}/src/datachain.egg-info/SOURCES.txt +2 -0
  18. {datachain-0.1.10 → datachain-0.1.12}/src/datachain.egg-info/requires.txt +2 -0
  19. {datachain-0.1.10 → datachain-0.1.12}/tests/conftest.py +1 -0
  20. {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_catalog.py +67 -0
  21. {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_dataset_query.py +4 -24
  22. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_datachain.py +66 -20
  23. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_datachain_merge.py +23 -9
  24. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_feature_utils.py +2 -2
  25. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_signal_schema.py +41 -6
  26. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_id_generator.py +2 -2
  27. {datachain-0.1.10 → datachain-0.1.12}/tests/utils.py +15 -0
  28. {datachain-0.1.10 → datachain-0.1.12}/.cruft.json +0 -0
  29. {datachain-0.1.10 → datachain-0.1.12}/.gitattributes +0 -0
  30. {datachain-0.1.10 → datachain-0.1.12}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  31. {datachain-0.1.10 → datachain-0.1.12}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  32. {datachain-0.1.10 → datachain-0.1.12}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  33. {datachain-0.1.10 → datachain-0.1.12}/.github/codecov.yaml +0 -0
  34. {datachain-0.1.10 → datachain-0.1.12}/.github/dependabot.yml +0 -0
  35. {datachain-0.1.10 → datachain-0.1.12}/.github/workflows/benchmarks.yml +0 -0
  36. {datachain-0.1.10 → datachain-0.1.12}/.github/workflows/release.yml +0 -0
  37. {datachain-0.1.10 → datachain-0.1.12}/.github/workflows/tests.yml +0 -0
  38. {datachain-0.1.10 → datachain-0.1.12}/.github/workflows/update-template.yaml +0 -0
  39. {datachain-0.1.10 → datachain-0.1.12}/.gitignore +0 -0
  40. {datachain-0.1.10 → datachain-0.1.12}/.pre-commit-config.yaml +0 -0
  41. {datachain-0.1.10 → datachain-0.1.12}/.reuse/dep5 +0 -0
  42. {datachain-0.1.10 → datachain-0.1.12}/CODE_OF_CONDUCT.rst +0 -0
  43. {datachain-0.1.10 → datachain-0.1.12}/CONTRIBUTING.rst +0 -0
  44. {datachain-0.1.10 → datachain-0.1.12}/LICENSE +0 -0
  45. {datachain-0.1.10 → datachain-0.1.12}/LICENSES/Apache-2.0.txt +0 -0
  46. {datachain-0.1.10 → datachain-0.1.12}/LICENSES/BSD-3-Clause.txt +0 -0
  47. {datachain-0.1.10 → datachain-0.1.12}/LICENSES/Python-2.0.txt +0 -0
  48. {datachain-0.1.10 → datachain-0.1.12}/README.rst +0 -0
  49. {datachain-0.1.10 → datachain-0.1.12}/docs/cv_intro.md +0 -0
  50. {datachain-0.1.10 → datachain-0.1.12}/docs/udfs.md +0 -0
  51. {datachain-0.1.10 → datachain-0.1.12}/examples/blip2_image_desc_lib.py +0 -0
  52. {datachain-0.1.10 → datachain-0.1.12}/examples/common_sql_functions.py +0 -0
  53. {datachain-0.1.10 → datachain-0.1.12}/examples/dir_expansion.py +0 -0
  54. {datachain-0.1.10 → datachain-0.1.12}/examples/hf_pipeline.py +0 -0
  55. {datachain-0.1.10 → datachain-0.1.12}/examples/iptc_exif_xmp_lib.py +0 -0
  56. {datachain-0.1.10 → datachain-0.1.12}/examples/llava2_image_desc_lib.py +0 -0
  57. {datachain-0.1.10 → datachain-0.1.12}/examples/llm-claude-aggregate-query.py +0 -0
  58. {datachain-0.1.10 → datachain-0.1.12}/examples/llm-claude-simple-query.py +0 -0
  59. {datachain-0.1.10 → datachain-0.1.12}/examples/llm-claude.py +0 -0
  60. {datachain-0.1.10 → datachain-0.1.12}/examples/loader.py +0 -0
  61. {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/README +0 -0
  62. {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/distance_to_query.py +0 -0
  63. {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/llm_chat.py +0 -0
  64. {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/requirements.txt +0 -0
  65. {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/single_query.py +0 -0
  66. {datachain-0.1.10 → datachain-0.1.12}/examples/neurips/text_loaders.py +0 -0
  67. {datachain-0.1.10 → datachain-0.1.12}/examples/notebooks/clip_fine_tuning.ipynb +0 -0
  68. {datachain-0.1.10 → datachain-0.1.12}/examples/openai_image_desc_lib.py +0 -0
  69. {datachain-0.1.10 → datachain-0.1.12}/examples/openimage-detect.py +0 -0
  70. {datachain-0.1.10 → datachain-0.1.12}/examples/pose_detection.py +0 -0
  71. {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/batching.py +0 -0
  72. {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/image_transformation.py +0 -0
  73. {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/parallel.py +0 -0
  74. {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/simple.py +0 -0
  75. {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/stateful.py +0 -0
  76. {datachain-0.1.10 → datachain-0.1.12}/examples/udfs/stateful_similarity.py +0 -0
  77. {datachain-0.1.10 → datachain-0.1.12}/examples/unstructured-text.py +0 -0
  78. {datachain-0.1.10 → datachain-0.1.12}/examples/wds.py +0 -0
  79. {datachain-0.1.10 → datachain-0.1.12}/examples/wds_filtered.py +0 -0
  80. {datachain-0.1.10 → datachain-0.1.12}/examples/zalando/zalando_clip.py +0 -0
  81. {datachain-0.1.10 → datachain-0.1.12}/examples/zalando/zalando_dir_as_class.py +0 -0
  82. {datachain-0.1.10 → datachain-0.1.12}/examples/zalando/zalando_splits_and_classes_ds.py +0 -0
  83. {datachain-0.1.10 → datachain-0.1.12}/examples/zalando/zalando_splits_and_classes_output.py +0 -0
  84. {datachain-0.1.10 → datachain-0.1.12}/noxfile.py +0 -0
  85. {datachain-0.1.10 → datachain-0.1.12}/setup.cfg +0 -0
  86. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/__init__.py +0 -0
  87. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/__main__.py +0 -0
  88. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/asyn.py +0 -0
  89. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/cache.py +0 -0
  90. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/catalog/__init__.py +0 -0
  91. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/catalog/datasource.py +0 -0
  92. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/catalog/loader.py +0 -0
  93. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/catalog/subclass.py +0 -0
  94. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/cli.py +0 -0
  95. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/cli_utils.py +0 -0
  96. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/__init__.py +0 -0
  97. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/azure.py +0 -0
  98. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/fileslice.py +0 -0
  99. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/fsspec.py +0 -0
  100. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/gcs.py +0 -0
  101. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/local.py +0 -0
  102. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/client/s3.py +0 -0
  103. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/config.py +0 -0
  104. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/__init__.py +0 -0
  105. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/db_engine.py +0 -0
  106. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/id_generator.py +0 -0
  107. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/job.py +0 -0
  108. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/schema.py +0 -0
  109. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/serializer.py +0 -0
  110. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/sqlite.py +0 -0
  111. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/data_storage/warehouse.py +0 -0
  112. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/error.py +0 -0
  113. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/__init__.py +0 -0
  114. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/cached_stream.py +0 -0
  115. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/claude.py +0 -0
  116. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/feature_registry.py +0 -0
  117. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/feature_utils.py +0 -0
  118. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/file.py +0 -0
  119. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/gpt4_vision.py +0 -0
  120. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/hf_image_to_text.py +0 -0
  121. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/hf_pipeline.py +0 -0
  122. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/image.py +0 -0
  123. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/image_transform.py +0 -0
  124. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/iptc_exif_xmp.py +0 -0
  125. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/parquet.py +0 -0
  126. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/reader.py +0 -0
  127. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/settings.py +0 -0
  128. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/text.py +0 -0
  129. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/udf.py +0 -0
  130. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/udf_signature.py +0 -0
  131. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/unstructured.py +0 -0
  132. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/utils.py +0 -0
  133. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/vfile.py +0 -0
  134. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/webdataset.py +0 -0
  135. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/lib/webdataset_laion.py +0 -0
  136. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/listing.py +0 -0
  137. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/node.py +0 -0
  138. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/nodes_fetcher.py +0 -0
  139. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/nodes_thread_pool.py +0 -0
  140. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/progress.py +0 -0
  141. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/py.typed +0 -0
  142. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/__init__.py +0 -0
  143. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/batch.py +0 -0
  144. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/builtins.py +0 -0
  145. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/dispatch.py +0 -0
  146. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/params.py +0 -0
  147. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/schema.py +0 -0
  148. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/session.py +0 -0
  149. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/query/udf.py +0 -0
  150. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/remote/__init__.py +0 -0
  151. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/remote/studio.py +0 -0
  152. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/__init__.py +0 -0
  153. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/default/__init__.py +0 -0
  154. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/default/base.py +0 -0
  155. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/__init__.py +0 -0
  156. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/array.py +0 -0
  157. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/conditional.py +0 -0
  158. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/path.py +0 -0
  159. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/random.py +0 -0
  160. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/functions/string.py +0 -0
  161. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/selectable.py +0 -0
  162. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/sqlite/__init__.py +0 -0
  163. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/sqlite/base.py +0 -0
  164. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/sqlite/types.py +0 -0
  165. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/sqlite/vector.py +0 -0
  166. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/types.py +0 -0
  167. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/sql/utils.py +0 -0
  168. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/storage.py +0 -0
  169. {datachain-0.1.10 → datachain-0.1.12}/src/datachain/utils.py +0 -0
  170. {datachain-0.1.10 → datachain-0.1.12}/src/datachain.egg-info/dependency_links.txt +0 -0
  171. {datachain-0.1.10 → datachain-0.1.12}/src/datachain.egg-info/entry_points.txt +0 -0
  172. {datachain-0.1.10 → datachain-0.1.12}/src/datachain.egg-info/top_level.txt +0 -0
  173. {datachain-0.1.10 → datachain-0.1.12}/tests/__init__.py +0 -0
  174. {datachain-0.1.10 → datachain-0.1.12}/tests/benchmarks/__init__.py +0 -0
  175. {datachain-0.1.10 → datachain-0.1.12}/tests/benchmarks/conftest.py +0 -0
  176. {datachain-0.1.10 → datachain-0.1.12}/tests/benchmarks/test_ls.py +0 -0
  177. {datachain-0.1.10 → datachain-0.1.12}/tests/benchmarks/test_version.py +0 -0
  178. {datachain-0.1.10 → datachain-0.1.12}/tests/data.py +0 -0
  179. {datachain-0.1.10 → datachain-0.1.12}/tests/func/__init__.py +0 -0
  180. {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_client.py +0 -0
  181. {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_datasets.py +0 -0
  182. {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_ls.py +0 -0
  183. {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_pull.py +0 -0
  184. {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_pytorch.py +0 -0
  185. {datachain-0.1.10 → datachain-0.1.12}/tests/func/test_query.py +0 -0
  186. {datachain-0.1.10 → datachain-0.1.12}/tests/scripts/feature_class.py +0 -0
  187. {datachain-0.1.10 → datachain-0.1.12}/tests/scripts/feature_class_parallel.py +0 -0
  188. {datachain-0.1.10 → datachain-0.1.12}/tests/scripts/name_len_normal.py +0 -0
  189. {datachain-0.1.10 → datachain-0.1.12}/tests/scripts/name_len_slow.py +0 -0
  190. {datachain-0.1.10 → datachain-0.1.12}/tests/test_cli_e2e.py +0 -0
  191. {datachain-0.1.10 → datachain-0.1.12}/tests/test_query_e2e.py +0 -0
  192. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/__init__.py +0 -0
  193. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/__init__.py +0 -0
  194. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_cached_stream.py +0 -0
  195. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_feature.py +0 -0
  196. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_file.py +0 -0
  197. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_image.py +0 -0
  198. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_parquet.py +0 -0
  199. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_reader.py +0 -0
  200. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_text.py +0 -0
  201. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_udf_signature.py +0 -0
  202. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_utils.py +0 -0
  203. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/lib/test_webdataset.py +0 -0
  204. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/__init__.py +0 -0
  205. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/sqlite/__init__.py +0 -0
  206. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/sqlite/test_utils.py +0 -0
  207. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_array.py +0 -0
  208. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_conditional.py +0 -0
  209. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_path.py +0 -0
  210. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_random.py +0 -0
  211. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_selectable.py +0 -0
  212. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/sql/test_string.py +0 -0
  213. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_asyn.py +0 -0
  214. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_cache.py +0 -0
  215. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_catalog.py +0 -0
  216. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_catalog_loader.py +0 -0
  217. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_cli_parsing.py +0 -0
  218. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_client.py +0 -0
  219. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_client_s3.py +0 -0
  220. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_data_storage.py +0 -0
  221. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_database_engine.py +0 -0
  222. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_dataset.py +0 -0
  223. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_dispatch.py +0 -0
  224. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_fileslice.py +0 -0
  225. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_listing.py +0 -0
  226. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_metastore.py +0 -0
  227. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_query_params.py +0 -0
  228. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_serializer.py +0 -0
  229. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_session.py +0 -0
  230. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_storage.py +0 -0
  231. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_udf.py +0 -0
  232. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_utils.py +0 -0
  233. {datachain-0.1.10 → datachain-0.1.12}/tests/unit/test_warehouse.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.1.10
3
+ Version: 0.1.12
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -36,6 +36,8 @@ Requires-Dist: multiprocess==0.70.16
36
36
  Requires-Dist: dill==0.3.8
37
37
  Requires-Dist: ujson>=5.9.0
38
38
  Requires-Dist: pydantic<3,>=2
39
+ Requires-Dist: jmespath>=1.0
40
+ Requires-Dist: datamodel-code-generator>=0.25
39
41
  Provides-Extra: cv
40
42
  Requires-Dist: Pillow<11,>=10.0.0; extra == "cv"
41
43
  Requires-Dist: torch>=2.1.0; extra == "cv"
@@ -4,26 +4,22 @@ from torch.nn.functional import cosine_similarity
4
4
  from torch.utils.data import DataLoader
5
5
 
6
6
  from datachain.lib.dc import C, DataChain
7
- from datachain.lib.image import ImageReader
8
- from datachain.lib.text import TextReader
9
- from datachain.sql.functions import path
10
7
 
11
8
  source = "gs://dvcx-50k-laion-files/000000/00000000*"
12
9
 
13
10
 
14
11
  def create_dataset():
15
12
  imgs = (
16
- DataChain(source)
13
+ DataChain.from_storage(source, type="image")
17
14
  .filter(C.name.glob("*.jpg"))
18
- .mutate(stem=path.file_stem(C.name))
15
+ .map(stem=lambda name: name.split(".")[0], output=str)
19
16
  )
20
17
  captions = (
21
- DataChain.from_storage(source, is_text=True)
18
+ DataChain.from_storage(source, type="text")
22
19
  .filter(C.name.glob("*.txt"))
23
- .mutate(stem=path.file_stem(C.name))
24
- .map(lambda file: file.get_value(), output={"caption": str})
20
+ .map(stem=lambda name: name.split(".")[0], output=str)
25
21
  )
26
- return imgs.join(captions.select("stem", "text"), "stem").save("laion-50k")
22
+ return imgs.merge(captions, on="stem")
27
23
 
28
24
 
29
25
  if __name__ == "__main__":
@@ -34,9 +30,9 @@ if __name__ == "__main__":
34
30
  )
35
31
  tokenizer = open_clip.get_tokenizer("ViT-B-32")
36
32
 
37
- ds = q.to_pytorch(
38
- ImageReader(transform=preprocess),
39
- TextReader("text", tokenizer=tokenizer),
33
+ ds = q.select("file", "right_file").to_pytorch(
34
+ transform=preprocess,
35
+ tokenizer=tokenizer,
40
36
  )
41
37
  loader = DataLoader(ds, batch_size=16)
42
38
 
@@ -0,0 +1,87 @@
1
+ #
2
+ # TODO:
3
+ # refactor lib/meta_formats/read_schema into a DataChain method
4
+ #
5
+ # ER: add support for Optional fields in read_schema()
6
+ # ER: add support for headless CSV within static schema only
7
+ # ER: fix the bug in datamodel-codegen failing to recognize csv float and int columns
8
+ #
9
+ # Open issues:
10
+ # 1. A single filename cannot be passed as schema source (#1563)
11
+ # 2. Need syntax like "file.open(encoding='utf-8')" to avoid "type=text" (#1614)
12
+ # 3. Need syntax like "datachain.collate(func -> Any)" (#1615)
13
+ # 4. "Feature" does not tolerate creating a class twice (#1617)
14
+ # 5. Unsure how to deal with 'folder' pseudo-files in cloud systems (#1618)
15
+ # 6. There should be exec() method to force-run the existing chain (#1616)
16
+ # 7. data-model-codegenerator: datamodel-codegen reports all CSV fields as 'str'.
17
+ # 8. from_json and from_csv methods do not filter empty files from AWS
18
+ # dependencies:
19
+ # pip install datamodel-code-generator
20
+ # pip install jmespath
21
+
22
+ from typing import Optional
23
+
24
+ from pydantic import BaseModel
25
+
26
+ from datachain.lib.dc import C, DataChain
27
+ from datachain.lib.feature_utils import pydantic_to_feature
28
+ from datachain.lib.meta_formats import read_schema
29
+
30
+
31
+ # Sample model for static JSON model
32
+ class LicenseModel(BaseModel):
33
+ url: str
34
+ id: int
35
+ name: str
36
+
37
+
38
+ LicenseFeature = pydantic_to_feature(LicenseModel)
39
+
40
+
41
+ # Sample model for static CSV model
42
+ class ChatDialog(BaseModel):
43
+ id: Optional[int] = None
44
+ count: Optional[int] = None
45
+ sender: Optional[str] = None
46
+ text: Optional[str] = None
47
+
48
+
49
+ ChatFeature = pydantic_to_feature(ChatDialog)
50
+
51
+
52
+ def main():
53
+ uri = "gs://datachain-demo/coco2017/annotations_captions/"
54
+
55
+ print("Reading schema from the root COCO annotation")
56
+ chain = (
57
+ DataChain.from_storage(uri)
58
+ .filter(C.name.glob("*.json"))
59
+ .limit(1)
60
+ .map( # dummy column created (#1615)
61
+ meta_schema=lambda file: read_schema(file, data_type="json"), output=str
62
+ )
63
+ )
64
+ # dummy executor (#1616)
65
+ chain.save()
66
+
67
+ print("static JSON schema test parsing 7 objects")
68
+ static_json_ds = DataChain.from_json(uri, jmespath="licenses", spec=LicenseFeature)
69
+ print(static_json_ds.to_pandas())
70
+
71
+ print("dynamic JSON schema test parsing 5K objects")
72
+ dynamic_json_ds = DataChain.from_json(uri, jmespath="images", show_schema=True)
73
+ print(dynamic_json_ds.to_pandas())
74
+
75
+ uri = "gs://datachain-demo/chatbot-csv/"
76
+ print("static CSV with header schema test parsing 3.5K objects")
77
+ static_csv_ds = DataChain.from_csv(uri, spec=ChatFeature)
78
+ print(static_csv_ds.to_pandas())
79
+
80
+ uri = "gs://datachain-demo/laion-aesthetics-csv"
81
+ print("dynamic CSV with header schema test parsing 3M objects")
82
+ dynamic_csv_ds = DataChain.from_csv(uri, show_schema=True)
83
+ print(dynamic_csv_ds.to_pandas())
84
+
85
+
86
+ if __name__ == "__main__":
87
+ main()
@@ -6,8 +6,7 @@ from torch.utils.data import DataLoader
6
6
  from torchvision.transforms import v2
7
7
 
8
8
  from datachain.lib.dc import C, DataChain
9
- from datachain.lib.image import ImageReader
10
- from datachain.lib.reader import LabelReader
9
+ from datachain.lib.pytorch import label_to_int
11
10
 
12
11
  STORAGE = "gs://dvcx-datalakes/dogs-and-cats/"
13
12
 
@@ -45,17 +44,13 @@ class CNN(nn.Module):
45
44
 
46
45
  if __name__ == "__main__":
47
46
  ds = (
48
- DataChain(STORAGE)
47
+ DataChain.from_storage(STORAGE, type="image")
49
48
  .filter(C.name.glob("*.jpg"))
50
- .map(lambda name: (name[:3],), output={"label": str})
49
+ .map(label=lambda name: label_to_int(name[:3], CLASSES), output=int)
51
50
  )
52
51
 
53
52
  train_loader = DataLoader(
54
- ds.to_pytorch(
55
- ImageReader(),
56
- LabelReader("label", classes=CLASSES),
57
- transform=transform,
58
- ),
53
+ ds.to_pytorch(transform=transform),
59
54
  batch_size=16,
60
55
  num_workers=2,
61
56
  )
@@ -39,7 +39,9 @@ dependencies = [
39
39
  "multiprocess==0.70.16",
40
40
  "dill==0.3.8",
41
41
  "ujson>=5.9.0",
42
- "pydantic>=2,<3"
42
+ "pydantic>=2,<3",
43
+ "jmespath>=1.0",
44
+ "datamodel-code-generator>=0.25"
43
45
  ]
44
46
 
45
47
  [project.optional-dependencies]
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.1.10'
16
- __version_tuple__ = version_tuple = (0, 1, 10)
15
+ __version__ = version = '0.1.12'
16
+ __version_tuple__ = version_tuple = (0, 1, 12)
@@ -1580,10 +1580,54 @@ class Catalog:
1580
1580
 
1581
1581
  return dst
1582
1582
 
1583
- def open_object(self, row: RowDict, use_cache: bool = True, **config: Any):
1583
+ def get_file_signals(
1584
+ self, dataset_name: str, dataset_version: int, row: RowDict
1585
+ ) -> Optional[dict]:
1586
+ """
1587
+ Function that returns file signals from dataset row.
1588
+ Note that signal names are without prefix, so if there was 'laion__file__source'
1589
+ in original row, result will have just 'source'
1590
+ Example output:
1591
+ {
1592
+ "source": "s3://ldb-public",
1593
+ "parent": "animals/dogs",
1594
+ "name": "dog.jpg",
1595
+ ...
1596
+ }
1597
+ """
1598
+ from datachain.lib.signal_schema import SignalSchema
1599
+
1600
+ version = self.get_dataset(dataset_name).get_version(dataset_version)
1601
+
1602
+ file_signals_values = SignalSchema.deserialize(
1603
+ version.feature_schema
1604
+ ).get_file_signals_values(row)
1605
+ if not file_signals_values:
1606
+ return None
1607
+
1608
+ # there can be multiple file signals in a schema, but taking the first
1609
+ # one for now. In future we might add ability to choose from which one
1610
+ # to open object
1611
+ return next(iter(file_signals_values.values()))
1612
+
1613
+ def open_object(
1614
+ self,
1615
+ dataset_name: str,
1616
+ dataset_version: int,
1617
+ row: RowDict,
1618
+ use_cache: bool = True,
1619
+ **config: Any,
1620
+ ):
1621
+ file_signals = self.get_file_signals(dataset_name, dataset_version, row)
1622
+ if not file_signals:
1623
+ raise RuntimeError("Cannot open object without file signals")
1624
+
1584
1625
  config = config or self.client_config
1585
- client = self.get_client(row["source"], **config)
1586
- return client.open_object(self._get_row_uid(row), use_cache=use_cache)
1626
+ client = self.get_client(file_signals["source"], **config)
1627
+ return client.open_object(
1628
+ self._get_row_uid(file_signals), # type: ignore [arg-type]
1629
+ use_cache=use_cache,
1630
+ )
1587
1631
 
1588
1632
  def _get_row_uid(self, row: RowDict) -> UniqueId:
1589
1633
  return UniqueId(
@@ -1142,6 +1142,8 @@ class AbstractDBMetastore(AbstractMetastore):
1142
1142
  if field == "schema":
1143
1143
  dataset_version.update(**{field: DatasetRecord.parse_schema(value)})
1144
1144
  values[field] = json.dumps(value) if value else None
1145
+ elif field == "feature_schema":
1146
+ values[field] = json.dumps(value) if value else None
1145
1147
  elif field == "preview" and isinstance(value, list):
1146
1148
  values[field] = json.dumps(value, cls=JSONSerialize)
1147
1149
  else:
@@ -157,7 +157,7 @@ class DatasetVersion:
157
157
  dataset_id: int
158
158
  version: int
159
159
  status: int
160
- feature_schema: Optional[str]
160
+ feature_schema: dict
161
161
  created_at: datetime
162
162
  finished_at: Optional[datetime]
163
163
  error_message: str
@@ -199,7 +199,7 @@ class DatasetVersion:
199
199
  dataset_id,
200
200
  version,
201
201
  status,
202
- feature_schema,
202
+ json.loads(feature_schema) if feature_schema else {},
203
203
  created_at,
204
204
  finished_at,
205
205
  error_message,
@@ -263,9 +263,9 @@ class DatasetRecord:
263
263
  labels: list[str]
264
264
  shadow: bool
265
265
  schema: dict[str, Union[SQLType, type[SQLType]]]
266
+ feature_schema: dict
266
267
  versions: list[DatasetVersion]
267
268
  status: int = DatasetStatus.CREATED
268
- feature_schema: Optional[dict] = None
269
269
  created_at: Optional[datetime] = None
270
270
  finished_at: Optional[datetime] = None
271
271
  error_message: str = ""
@@ -320,8 +320,6 @@ class DatasetRecord:
320
320
  version_job_id: Optional[str] = None,
321
321
  version_is_job_result: bool = False,
322
322
  ) -> "DatasetRecord":
323
- fr_schema = json.loads(feature_schema) if feature_schema else {}
324
-
325
323
  labels_lst: list[str] = json.loads(labels) if labels else []
326
324
  schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
327
325
  version_schema_dct: dict[str, str] = (
@@ -333,7 +331,7 @@ class DatasetRecord:
333
331
  version_dataset_id,
334
332
  version,
335
333
  version_status,
336
- fr_schema,
334
+ version_feature_schema,
337
335
  version_created_at,
338
336
  version_finished_at,
339
337
  version_error_message,
@@ -356,9 +354,9 @@ class DatasetRecord:
356
354
  labels_lst,
357
355
  bool(shadow),
358
356
  cls.parse_schema(schema_dct), # type: ignore[arg-type]
357
+ json.loads(feature_schema) if feature_schema else {},
359
358
  [dataset_version],
360
359
  status,
361
- fr_schema,
362
360
  created_at,
363
361
  finished_at,
364
362
  error_message,
@@ -6,6 +6,7 @@ import sqlalchemy
6
6
  from datachain.lib.feature import Feature, FeatureType
7
7
  from datachain.lib.feature_utils import features_to_tuples
8
8
  from datachain.lib.file import File, get_file
9
+ from datachain.lib.meta_formats import read_meta
9
10
  from datachain.lib.settings import Settings
10
11
  from datachain.lib.signal_schema import SignalSchema
11
12
  from datachain.lib.udf import (
@@ -219,6 +220,89 @@ class DataChain(DatasetQuery):
219
220
  """
220
221
  return DataChain(name=name, version=version)
221
222
 
223
+ @classmethod
224
+ def from_csv(
225
+ cls,
226
+ path,
227
+ type: Literal["binary", "text", "image"] = "text",
228
+ anon: bool = False,
229
+ spec: Optional[FeatureType] = None,
230
+ schema_from: Optional[str] = "auto",
231
+ show_schema: Optional[bool] = False,
232
+ ) -> "DataChain":
233
+ """Get data from CSV. It returns the chain itself.
234
+
235
+ Parameters
236
+ ----------
237
+ path : storage URI with directory. URI must start with storage prefix such
238
+ as `s3://`, `gs://`, `az://` or "file:///"
239
+ type : read file as "binary", "text", or "image" data. Default is "text".
240
+ anon : use anonymous mode to access the storage.
241
+ spec : optional Data Model
242
+ schema_from : path to sample to infer spec from
243
+ show_schema : print auto-generated schema
244
+
245
+ Examples
246
+ --------
247
+
248
+ >>> chain = DataChain.from_csv("gs://csv")
249
+ """
250
+ if schema_from == "auto":
251
+ schema_from = path
252
+
253
+ chain = DataChain.from_storage(path=path, type=type, anon=anon)
254
+ return chain.gen(
255
+ csv=read_meta(
256
+ schema_from=schema_from,
257
+ meta_type="csv",
258
+ spec=spec,
259
+ show_schema=show_schema,
260
+ )
261
+ )
262
+
263
+ @classmethod
264
+ def from_json(
265
+ cls,
266
+ path,
267
+ type: Literal["binary", "text", "image"] = "text",
268
+ anon: bool = False,
269
+ spec: Optional[FeatureType] = None,
270
+ schema_from: Optional[str] = "auto",
271
+ jmespath: Optional[str] = None,
272
+ show_schema: Optional[bool] = False,
273
+ ) -> "DataChain":
274
+ """Get data from JSON. It returns the chain itself.
275
+
276
+ Parameters
277
+ ----------
278
+ path : storage URI with directory. URI must start with storage prefix such
279
+ as `s3://`, `gs://`, `az://` or "file:///"
280
+ type : read file as "binary", "text", or "image" data. Default is "text".
281
+ anon : use anonymous mode to access the storage.
282
+ spec : optional Data Model
283
+ schema_from : path to sample to infer spec from
284
+ show_schema : print auto-generated schema
285
+ jmespath : JMESPATH expression to reduce JSON
286
+ name : return object name
287
+ Examples
288
+ --------
289
+
290
+ >>> chain = DataChain.from_json("gs://json")
291
+ """
292
+ if schema_from == "auto":
293
+ schema_from = path
294
+
295
+ chain = DataChain.from_storage(path=path, type=type, anon=anon)
296
+ return chain.gen(
297
+ json=read_meta(
298
+ schema_from=schema_from,
299
+ meta_type="json",
300
+ spec=spec,
301
+ show_schema=show_schema,
302
+ jmespath=jmespath,
303
+ )
304
+ )
305
+
222
306
  def save( # type: ignore[override]
223
307
  self, name: Optional[str] = None, version: Optional[int] = None
224
308
  ) -> "DataChain":
@@ -408,16 +492,43 @@ class DataChain(DatasetQuery):
408
492
  chain.signals_schema = new_schema
409
493
  return chain
410
494
 
411
- def get_values(self) -> Iterator[Sequence]:
412
- """Iterate over rows, getting feature values and applying reader calls."""
413
- for features in self.iterate():
414
- yield [fr.get_value() if isinstance(fr, Feature) else fr for fr in features]
495
+ def get_values(self, *cols: str) -> Iterator[list]:
496
+ """Iterate over rows, getting feature values and applying reader calls.
497
+ If columns are specified - limit them to specified columns.
498
+ """
499
+ for features in self.iterate(*cols):
500
+ yield [fr.get_value() if isinstance(fr, Feature) else fr for fr in features] # type: ignore[union-attr,call-arg]
501
+
502
+ def get_one_value(self, col: str) -> Iterator:
503
+ for item in self.get_values(col):
504
+ yield item[0]
415
505
 
416
- def iterate(self) -> Iterator[Sequence[Feature]]:
417
- db_signals = self.signals_schema.db_signals()
506
+ def iterate(self, *cols: str) -> Iterator[list[FeatureType]]:
507
+ """Iterate over rows. If columns are specified - limit them to specified
508
+ columns.
509
+ """
510
+ chain = self.select(*cols) if cols else self
511
+
512
+ db_signals = chain.signals_schema.db_signals()
418
513
  with super().select(*db_signals).as_iterable() as rows_iter:
419
514
  for row in rows_iter:
420
- yield self.signals_schema.row_to_features(row, self.session.catalog)
515
+ yield chain.signals_schema.row_to_features(row, chain.session.catalog)
516
+
517
+ def iterate_one(self, col: str) -> Iterator[FeatureType]:
518
+ for item in self.iterate(col):
519
+ yield item[0]
520
+
521
+ def collect(self, *cols: str) -> list[list[FeatureType]]:
522
+ return list(self.iterate(*cols))
523
+
524
+ def collect_one(self, col: str) -> list[FeatureType]:
525
+ return list(self.iterate_one(col))
526
+
527
+ def collect_values(self, *cols: str) -> list[list]:
528
+ return list(self.get_values(*cols))
529
+
530
+ def collect_one_value(self, col: str) -> list:
531
+ return list(self.get_one_value(col))
421
532
 
422
533
  def to_pytorch(self, **kwargs):
423
534
  """Convert to pytorch dataset format."""
@@ -607,3 +718,35 @@ class DataChain(DatasetQuery):
607
718
 
608
719
  def max(self, fr: FeatureType): # type: ignore[override]
609
720
  return self._extend_features("max", fr)
721
+
722
+ @detach
723
+ def gen_random(self) -> "DataChain":
724
+ from random import getrandbits
725
+
726
+ from datachain.data_storage.warehouse import RANDOM_BITS
727
+
728
+ if "random" not in self.signals_schema.values:
729
+ chain = self.map(random=lambda: getrandbits(RANDOM_BITS), output=int).save()
730
+ return chain.select_except("random")
731
+
732
+ return self
733
+
734
+ @detach
735
+ def shuffle(self) -> "DataChain":
736
+ """Return results in deterministic random order."""
737
+ chain = self.gen_random()
738
+ return DatasetQuery.shuffle(chain)
739
+
740
+ @detach
741
+ def chunk(self, index: int, total: int) -> "DataChain":
742
+ """Split a query into smaller chunks for e.g. parallelization.
743
+ Example:
744
+ >>> dc = DataChain(...)
745
+ >>> chunk_1 = dc.chunk(0, 2)
746
+ >>> chunk_2 = dc.chunk(1, 2)
747
+ Note:
748
+ Bear in mind that `index` is 0-indexed but `total` isn't.
749
+ Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
750
+ """
751
+ chain = self.gen_random()
752
+ return DatasetQuery.chunk(chain, index, total)
@@ -78,16 +78,6 @@ DATACHAIN_TO_TYPE = {
78
78
  JSON: dict,
79
79
  }
80
80
 
81
- NAMES_TO_TYPES = {
82
- "int": int,
83
- "str": str,
84
- "float": float,
85
- "bool": bool,
86
- "list": list,
87
- "dict": dict,
88
- "bytes": bytes,
89
- "datetime": datetime,
90
- }
91
81
 
92
82
  NUMPY_TO_DATACHAIN = {
93
83
  np.dtype("int8"): Int,