datachain 0.1.10__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (233) hide show
  1. {datachain-0.1.10/src/datachain.egg-info → datachain-0.1.11}/PKG-INFO +3 -1
  2. {datachain-0.1.10 → datachain-0.1.11}/examples/clip.py +8 -12
  3. datachain-0.1.11/examples/json-csv-reader.py +87 -0
  4. {datachain-0.1.10 → datachain-0.1.11}/examples/torch-loader.py +4 -9
  5. {datachain-0.1.10 → datachain-0.1.11}/pyproject.toml +3 -1
  6. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/_version.py +2 -2
  7. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/catalog/catalog.py +47 -3
  8. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/data_storage/metastore.py +2 -0
  9. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/dataset.py +5 -7
  10. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/dc.py +117 -1
  11. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/feature.py +0 -10
  12. datachain-0.1.11/src/datachain/lib/meta_formats.py +164 -0
  13. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/pytorch.py +33 -4
  14. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/signal_schema.py +63 -6
  15. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/query/dataset.py +1 -1
  16. {datachain-0.1.10 → datachain-0.1.11/src/datachain.egg-info}/PKG-INFO +3 -1
  17. {datachain-0.1.10 → datachain-0.1.11}/src/datachain.egg-info/SOURCES.txt +2 -0
  18. {datachain-0.1.10 → datachain-0.1.11}/src/datachain.egg-info/requires.txt +2 -0
  19. {datachain-0.1.10 → datachain-0.1.11}/tests/conftest.py +1 -0
  20. {datachain-0.1.10 → datachain-0.1.11}/tests/func/test_catalog.py +64 -0
  21. {datachain-0.1.10 → datachain-0.1.11}/tests/func/test_dataset_query.py +4 -24
  22. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_signal_schema.py +41 -6
  23. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_id_generator.py +2 -2
  24. {datachain-0.1.10 → datachain-0.1.11}/.cruft.json +0 -0
  25. {datachain-0.1.10 → datachain-0.1.11}/.gitattributes +0 -0
  26. {datachain-0.1.10 → datachain-0.1.11}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  27. {datachain-0.1.10 → datachain-0.1.11}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  28. {datachain-0.1.10 → datachain-0.1.11}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  29. {datachain-0.1.10 → datachain-0.1.11}/.github/codecov.yaml +0 -0
  30. {datachain-0.1.10 → datachain-0.1.11}/.github/dependabot.yml +0 -0
  31. {datachain-0.1.10 → datachain-0.1.11}/.github/workflows/benchmarks.yml +0 -0
  32. {datachain-0.1.10 → datachain-0.1.11}/.github/workflows/release.yml +0 -0
  33. {datachain-0.1.10 → datachain-0.1.11}/.github/workflows/tests.yml +0 -0
  34. {datachain-0.1.10 → datachain-0.1.11}/.github/workflows/update-template.yaml +0 -0
  35. {datachain-0.1.10 → datachain-0.1.11}/.gitignore +0 -0
  36. {datachain-0.1.10 → datachain-0.1.11}/.pre-commit-config.yaml +0 -0
  37. {datachain-0.1.10 → datachain-0.1.11}/.reuse/dep5 +0 -0
  38. {datachain-0.1.10 → datachain-0.1.11}/CODE_OF_CONDUCT.rst +0 -0
  39. {datachain-0.1.10 → datachain-0.1.11}/CONTRIBUTING.rst +0 -0
  40. {datachain-0.1.10 → datachain-0.1.11}/LICENSE +0 -0
  41. {datachain-0.1.10 → datachain-0.1.11}/LICENSES/Apache-2.0.txt +0 -0
  42. {datachain-0.1.10 → datachain-0.1.11}/LICENSES/BSD-3-Clause.txt +0 -0
  43. {datachain-0.1.10 → datachain-0.1.11}/LICENSES/Python-2.0.txt +0 -0
  44. {datachain-0.1.10 → datachain-0.1.11}/README.rst +0 -0
  45. {datachain-0.1.10 → datachain-0.1.11}/docs/cv_intro.md +0 -0
  46. {datachain-0.1.10 → datachain-0.1.11}/docs/udfs.md +0 -0
  47. {datachain-0.1.10 → datachain-0.1.11}/examples/blip2_image_desc_lib.py +0 -0
  48. {datachain-0.1.10 → datachain-0.1.11}/examples/common_sql_functions.py +0 -0
  49. {datachain-0.1.10 → datachain-0.1.11}/examples/dir_expansion.py +0 -0
  50. {datachain-0.1.10 → datachain-0.1.11}/examples/hf_pipeline.py +0 -0
  51. {datachain-0.1.10 → datachain-0.1.11}/examples/iptc_exif_xmp_lib.py +0 -0
  52. {datachain-0.1.10 → datachain-0.1.11}/examples/llava2_image_desc_lib.py +0 -0
  53. {datachain-0.1.10 → datachain-0.1.11}/examples/llm-claude-aggregate-query.py +0 -0
  54. {datachain-0.1.10 → datachain-0.1.11}/examples/llm-claude-simple-query.py +0 -0
  55. {datachain-0.1.10 → datachain-0.1.11}/examples/llm-claude.py +0 -0
  56. {datachain-0.1.10 → datachain-0.1.11}/examples/loader.py +0 -0
  57. {datachain-0.1.10 → datachain-0.1.11}/examples/neurips/README +0 -0
  58. {datachain-0.1.10 → datachain-0.1.11}/examples/neurips/distance_to_query.py +0 -0
  59. {datachain-0.1.10 → datachain-0.1.11}/examples/neurips/llm_chat.py +0 -0
  60. {datachain-0.1.10 → datachain-0.1.11}/examples/neurips/requirements.txt +0 -0
  61. {datachain-0.1.10 → datachain-0.1.11}/examples/neurips/single_query.py +0 -0
  62. {datachain-0.1.10 → datachain-0.1.11}/examples/neurips/text_loaders.py +0 -0
  63. {datachain-0.1.10 → datachain-0.1.11}/examples/notebooks/clip_fine_tuning.ipynb +0 -0
  64. {datachain-0.1.10 → datachain-0.1.11}/examples/openai_image_desc_lib.py +0 -0
  65. {datachain-0.1.10 → datachain-0.1.11}/examples/openimage-detect.py +0 -0
  66. {datachain-0.1.10 → datachain-0.1.11}/examples/pose_detection.py +0 -0
  67. {datachain-0.1.10 → datachain-0.1.11}/examples/udfs/batching.py +0 -0
  68. {datachain-0.1.10 → datachain-0.1.11}/examples/udfs/image_transformation.py +0 -0
  69. {datachain-0.1.10 → datachain-0.1.11}/examples/udfs/parallel.py +0 -0
  70. {datachain-0.1.10 → datachain-0.1.11}/examples/udfs/simple.py +0 -0
  71. {datachain-0.1.10 → datachain-0.1.11}/examples/udfs/stateful.py +0 -0
  72. {datachain-0.1.10 → datachain-0.1.11}/examples/udfs/stateful_similarity.py +0 -0
  73. {datachain-0.1.10 → datachain-0.1.11}/examples/unstructured-text.py +0 -0
  74. {datachain-0.1.10 → datachain-0.1.11}/examples/wds.py +0 -0
  75. {datachain-0.1.10 → datachain-0.1.11}/examples/wds_filtered.py +0 -0
  76. {datachain-0.1.10 → datachain-0.1.11}/examples/zalando/zalando_clip.py +0 -0
  77. {datachain-0.1.10 → datachain-0.1.11}/examples/zalando/zalando_dir_as_class.py +0 -0
  78. {datachain-0.1.10 → datachain-0.1.11}/examples/zalando/zalando_splits_and_classes_ds.py +0 -0
  79. {datachain-0.1.10 → datachain-0.1.11}/examples/zalando/zalando_splits_and_classes_output.py +0 -0
  80. {datachain-0.1.10 → datachain-0.1.11}/noxfile.py +0 -0
  81. {datachain-0.1.10 → datachain-0.1.11}/setup.cfg +0 -0
  82. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/__init__.py +0 -0
  83. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/__main__.py +0 -0
  84. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/asyn.py +0 -0
  85. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/cache.py +0 -0
  86. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/catalog/__init__.py +0 -0
  87. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/catalog/datasource.py +0 -0
  88. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/catalog/loader.py +0 -0
  89. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/catalog/subclass.py +0 -0
  90. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/cli.py +0 -0
  91. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/cli_utils.py +0 -0
  92. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/client/__init__.py +0 -0
  93. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/client/azure.py +0 -0
  94. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/client/fileslice.py +0 -0
  95. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/client/fsspec.py +0 -0
  96. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/client/gcs.py +0 -0
  97. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/client/local.py +0 -0
  98. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/client/s3.py +0 -0
  99. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/config.py +0 -0
  100. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/data_storage/__init__.py +0 -0
  101. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/data_storage/db_engine.py +0 -0
  102. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/data_storage/id_generator.py +0 -0
  103. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/data_storage/job.py +0 -0
  104. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/data_storage/schema.py +0 -0
  105. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/data_storage/serializer.py +0 -0
  106. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/data_storage/sqlite.py +0 -0
  107. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/data_storage/warehouse.py +0 -0
  108. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/error.py +0 -0
  109. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/__init__.py +0 -0
  110. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/cached_stream.py +0 -0
  111. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/claude.py +0 -0
  112. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/feature_registry.py +0 -0
  113. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/feature_utils.py +0 -0
  114. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/file.py +0 -0
  115. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/gpt4_vision.py +0 -0
  116. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/hf_image_to_text.py +0 -0
  117. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/hf_pipeline.py +0 -0
  118. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/image.py +0 -0
  119. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/image_transform.py +0 -0
  120. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/iptc_exif_xmp.py +0 -0
  121. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/parquet.py +0 -0
  122. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/reader.py +0 -0
  123. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/settings.py +0 -0
  124. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/text.py +0 -0
  125. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/udf.py +0 -0
  126. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/udf_signature.py +0 -0
  127. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/unstructured.py +0 -0
  128. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/utils.py +0 -0
  129. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/vfile.py +0 -0
  130. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/webdataset.py +0 -0
  131. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/lib/webdataset_laion.py +0 -0
  132. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/listing.py +0 -0
  133. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/node.py +0 -0
  134. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/nodes_fetcher.py +0 -0
  135. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/nodes_thread_pool.py +0 -0
  136. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/progress.py +0 -0
  137. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/py.typed +0 -0
  138. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/query/__init__.py +0 -0
  139. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/query/batch.py +0 -0
  140. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/query/builtins.py +0 -0
  141. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/query/dispatch.py +0 -0
  142. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/query/params.py +0 -0
  143. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/query/schema.py +0 -0
  144. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/query/session.py +0 -0
  145. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/query/udf.py +0 -0
  146. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/remote/__init__.py +0 -0
  147. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/remote/studio.py +0 -0
  148. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/__init__.py +0 -0
  149. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/default/__init__.py +0 -0
  150. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/default/base.py +0 -0
  151. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/functions/__init__.py +0 -0
  152. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/functions/array.py +0 -0
  153. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/functions/conditional.py +0 -0
  154. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/functions/path.py +0 -0
  155. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/functions/random.py +0 -0
  156. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/functions/string.py +0 -0
  157. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/selectable.py +0 -0
  158. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/sqlite/__init__.py +0 -0
  159. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/sqlite/base.py +0 -0
  160. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/sqlite/types.py +0 -0
  161. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/sqlite/vector.py +0 -0
  162. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/types.py +0 -0
  163. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/sql/utils.py +0 -0
  164. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/storage.py +0 -0
  165. {datachain-0.1.10 → datachain-0.1.11}/src/datachain/utils.py +0 -0
  166. {datachain-0.1.10 → datachain-0.1.11}/src/datachain.egg-info/dependency_links.txt +0 -0
  167. {datachain-0.1.10 → datachain-0.1.11}/src/datachain.egg-info/entry_points.txt +0 -0
  168. {datachain-0.1.10 → datachain-0.1.11}/src/datachain.egg-info/top_level.txt +0 -0
  169. {datachain-0.1.10 → datachain-0.1.11}/tests/__init__.py +0 -0
  170. {datachain-0.1.10 → datachain-0.1.11}/tests/benchmarks/__init__.py +0 -0
  171. {datachain-0.1.10 → datachain-0.1.11}/tests/benchmarks/conftest.py +0 -0
  172. {datachain-0.1.10 → datachain-0.1.11}/tests/benchmarks/test_ls.py +0 -0
  173. {datachain-0.1.10 → datachain-0.1.11}/tests/benchmarks/test_version.py +0 -0
  174. {datachain-0.1.10 → datachain-0.1.11}/tests/data.py +0 -0
  175. {datachain-0.1.10 → datachain-0.1.11}/tests/func/__init__.py +0 -0
  176. {datachain-0.1.10 → datachain-0.1.11}/tests/func/test_client.py +0 -0
  177. {datachain-0.1.10 → datachain-0.1.11}/tests/func/test_datasets.py +0 -0
  178. {datachain-0.1.10 → datachain-0.1.11}/tests/func/test_ls.py +0 -0
  179. {datachain-0.1.10 → datachain-0.1.11}/tests/func/test_pull.py +0 -0
  180. {datachain-0.1.10 → datachain-0.1.11}/tests/func/test_pytorch.py +0 -0
  181. {datachain-0.1.10 → datachain-0.1.11}/tests/func/test_query.py +0 -0
  182. {datachain-0.1.10 → datachain-0.1.11}/tests/scripts/feature_class.py +0 -0
  183. {datachain-0.1.10 → datachain-0.1.11}/tests/scripts/feature_class_parallel.py +0 -0
  184. {datachain-0.1.10 → datachain-0.1.11}/tests/scripts/name_len_normal.py +0 -0
  185. {datachain-0.1.10 → datachain-0.1.11}/tests/scripts/name_len_slow.py +0 -0
  186. {datachain-0.1.10 → datachain-0.1.11}/tests/test_cli_e2e.py +0 -0
  187. {datachain-0.1.10 → datachain-0.1.11}/tests/test_query_e2e.py +0 -0
  188. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/__init__.py +0 -0
  189. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/__init__.py +0 -0
  190. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_cached_stream.py +0 -0
  191. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_datachain.py +0 -0
  192. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_datachain_merge.py +0 -0
  193. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_feature.py +0 -0
  194. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_feature_utils.py +0 -0
  195. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_file.py +0 -0
  196. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_image.py +0 -0
  197. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_parquet.py +0 -0
  198. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_reader.py +0 -0
  199. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_text.py +0 -0
  200. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_udf_signature.py +0 -0
  201. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_utils.py +0 -0
  202. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/lib/test_webdataset.py +0 -0
  203. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/sql/__init__.py +0 -0
  204. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/sql/sqlite/__init__.py +0 -0
  205. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/sql/sqlite/test_utils.py +0 -0
  206. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/sql/test_array.py +0 -0
  207. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/sql/test_conditional.py +0 -0
  208. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/sql/test_path.py +0 -0
  209. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/sql/test_random.py +0 -0
  210. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/sql/test_selectable.py +0 -0
  211. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/sql/test_string.py +0 -0
  212. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_asyn.py +0 -0
  213. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_cache.py +0 -0
  214. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_catalog.py +0 -0
  215. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_catalog_loader.py +0 -0
  216. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_cli_parsing.py +0 -0
  217. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_client.py +0 -0
  218. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_client_s3.py +0 -0
  219. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_data_storage.py +0 -0
  220. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_database_engine.py +0 -0
  221. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_dataset.py +0 -0
  222. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_dispatch.py +0 -0
  223. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_fileslice.py +0 -0
  224. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_listing.py +0 -0
  225. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_metastore.py +0 -0
  226. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_query_params.py +0 -0
  227. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_serializer.py +0 -0
  228. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_session.py +0 -0
  229. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_storage.py +0 -0
  230. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_udf.py +0 -0
  231. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_utils.py +0 -0
  232. {datachain-0.1.10 → datachain-0.1.11}/tests/unit/test_warehouse.py +0 -0
  233. {datachain-0.1.10 → datachain-0.1.11}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.1.10
3
+ Version: 0.1.11
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -36,6 +36,8 @@ Requires-Dist: multiprocess==0.70.16
36
36
  Requires-Dist: dill==0.3.8
37
37
  Requires-Dist: ujson>=5.9.0
38
38
  Requires-Dist: pydantic<3,>=2
39
+ Requires-Dist: jmespath>=1.0
40
+ Requires-Dist: datamodel-code-generator>=0.25
39
41
  Provides-Extra: cv
40
42
  Requires-Dist: Pillow<11,>=10.0.0; extra == "cv"
41
43
  Requires-Dist: torch>=2.1.0; extra == "cv"
@@ -4,26 +4,22 @@ from torch.nn.functional import cosine_similarity
4
4
  from torch.utils.data import DataLoader
5
5
 
6
6
  from datachain.lib.dc import C, DataChain
7
- from datachain.lib.image import ImageReader
8
- from datachain.lib.text import TextReader
9
- from datachain.sql.functions import path
10
7
 
11
8
  source = "gs://dvcx-50k-laion-files/000000/00000000*"
12
9
 
13
10
 
14
11
  def create_dataset():
15
12
  imgs = (
16
- DataChain(source)
13
+ DataChain.from_storage(source, type="image")
17
14
  .filter(C.name.glob("*.jpg"))
18
- .mutate(stem=path.file_stem(C.name))
15
+ .map(stem=lambda name: name.split(".")[0], output=str)
19
16
  )
20
17
  captions = (
21
- DataChain.from_storage(source, is_text=True)
18
+ DataChain.from_storage(source, type="text")
22
19
  .filter(C.name.glob("*.txt"))
23
- .mutate(stem=path.file_stem(C.name))
24
- .map(lambda file: file.get_value(), output={"caption": str})
20
+ .map(stem=lambda name: name.split(".")[0], output=str)
25
21
  )
26
- return imgs.join(captions.select("stem", "text"), "stem").save("laion-50k")
22
+ return imgs.merge(captions, on="stem")
27
23
 
28
24
 
29
25
  if __name__ == "__main__":
@@ -34,9 +30,9 @@ if __name__ == "__main__":
34
30
  )
35
31
  tokenizer = open_clip.get_tokenizer("ViT-B-32")
36
32
 
37
- ds = q.to_pytorch(
38
- ImageReader(transform=preprocess),
39
- TextReader("text", tokenizer=tokenizer),
33
+ ds = q.select("file", "right_file").to_pytorch(
34
+ transform=preprocess,
35
+ tokenizer=tokenizer,
40
36
  )
41
37
  loader = DataLoader(ds, batch_size=16)
42
38
 
@@ -0,0 +1,87 @@
1
+ #
2
+ # TODO:
3
+ # refactor lib/meta_formats/read_scema into a Datachain method
4
+ #
5
+ # ER: add support for Optional fields in read_schema()
6
+ # ER: add support for headless CSV within static schema only
7
+ # ER: fix the bug in datamodel-codegen failing to recognize csv float and int columns
8
+ #
9
+ # Open issues:
10
+ # 1. A single filename cannot be passed as schema source (#1563)
11
+ # 2. Need syntax like "file.open(encoding='utf-8')" to avoid "type=text" (#1614)
12
+ # 3. Need syntax like "datachain.collate(func -> Any)" (#1615)
13
+ # 4. "Feature" does not tolerate creating a class twice (#1617)
14
+ # 5. Unsure how to deal with 'folder' pseudo-files in cloud systems(#1618)
15
+ # 6. There should be exec() method to force-run the existing chain (#1616)
16
+ # 7. data-model-codegenerator: datamodel-codegen reports all CSV fields as 'str'.
17
+ # 8. from_json and from_csv methods do not filter empty files from AWS
18
+ # dependencies:
19
+ # pip install datamodel-code-generator
20
+ # pip install jmespath
21
+
22
+ from typing import Optional
23
+
24
+ from pydantic import BaseModel
25
+
26
+ from datachain.lib.dc import C, DataChain
27
+ from datachain.lib.feature_utils import pydantic_to_feature
28
+ from datachain.lib.meta_formats import read_schema
29
+
30
+
31
+ # Sample model for static JSON model
32
+ class LicenseModel(BaseModel):
33
+ url: str
34
+ id: int
35
+ name: str
36
+
37
+
38
+ LicenseFeature = pydantic_to_feature(LicenseModel)
39
+
40
+
41
+ # Sample model for static CSV model
42
+ class ChatDialog(BaseModel):
43
+ id: Optional[int] = None
44
+ count: Optional[int] = None
45
+ sender: Optional[str] = None
46
+ text: Optional[str] = None
47
+
48
+
49
+ ChatFeature = pydantic_to_feature(ChatDialog)
50
+
51
+
52
+ def main():
53
+ uri = "gs://datachain-demo/coco2017/annotations_captions/"
54
+
55
+ print("Reading schema from the root COCO annotation")
56
+ chain = (
57
+ DataChain.from_storage(uri)
58
+ .filter(C.name.glob("*.json"))
59
+ .limit(1)
60
+ .map( # dummy column created (#1615)
61
+ meta_schema=lambda file: read_schema(file, data_type="json"), output=str
62
+ )
63
+ )
64
+ # dummy executor (#1616)
65
+ chain.save()
66
+
67
+ print("static JSON schema test parsing 7 objects")
68
+ static_json_ds = DataChain.from_json(uri, jmespath="licenses", spec=LicenseFeature)
69
+ print(static_json_ds.to_pandas())
70
+
71
+ print("dynamic JSON schema test parsing 5K objects")
72
+ dynamic_json_ds = DataChain.from_json(uri, jmespath="images", show_schema=True)
73
+ print(dynamic_json_ds.to_pandas())
74
+
75
+ uri = "gs://datachain-demo/chatbot-csv/"
76
+ print("static CSV with header schema test parsing 3.5K objects")
77
+ static_csv_ds = DataChain.from_csv(uri, spec=ChatFeature)
78
+ print(static_csv_ds.to_pandas())
79
+
80
+ uri = "gs://datachain-demo/laion-aesthetics-csv"
81
+ print("dynamic CSV with header schema test parsing 3M objects")
82
+ dynamic_csv_ds = DataChain.from_csv(uri, show_schema=True)
83
+ print(dynamic_csv_ds.to_pandas())
84
+
85
+
86
+ if __name__ == "__main__":
87
+ main()
@@ -6,8 +6,7 @@ from torch.utils.data import DataLoader
6
6
  from torchvision.transforms import v2
7
7
 
8
8
  from datachain.lib.dc import C, DataChain
9
- from datachain.lib.image import ImageReader
10
- from datachain.lib.reader import LabelReader
9
+ from datachain.lib.pytorch import label_to_int
11
10
 
12
11
  STORAGE = "gs://dvcx-datalakes/dogs-and-cats/"
13
12
 
@@ -45,17 +44,13 @@ class CNN(nn.Module):
45
44
 
46
45
  if __name__ == "__main__":
47
46
  ds = (
48
- DataChain(STORAGE)
47
+ DataChain.from_storage(STORAGE, type="image")
49
48
  .filter(C.name.glob("*.jpg"))
50
- .map(lambda name: (name[:3],), output={"label": str})
49
+ .map(label=lambda name: label_to_int(name[:3], CLASSES), output=int)
51
50
  )
52
51
 
53
52
  train_loader = DataLoader(
54
- ds.to_pytorch(
55
- ImageReader(),
56
- LabelReader("label", classes=CLASSES),
57
- transform=transform,
58
- ),
53
+ ds.to_pytorch(transform=transform),
59
54
  batch_size=16,
60
55
  num_workers=2,
61
56
  )
@@ -39,7 +39,9 @@ dependencies = [
39
39
  "multiprocess==0.70.16",
40
40
  "dill==0.3.8",
41
41
  "ujson>=5.9.0",
42
- "pydantic>=2,<3"
42
+ "pydantic>=2,<3",
43
+ "jmespath>=1.0",
44
+ "datamodel-code-generator>=0.25"
43
45
  ]
44
46
 
45
47
  [project.optional-dependencies]
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.1.10'
16
- __version_tuple__ = version_tuple = (0, 1, 10)
15
+ __version__ = version = '0.1.11'
16
+ __version_tuple__ = version_tuple = (0, 1, 11)
@@ -1580,10 +1580,54 @@ class Catalog:
1580
1580
 
1581
1581
  return dst
1582
1582
 
1583
- def open_object(self, row: RowDict, use_cache: bool = True, **config: Any):
1583
+ def get_file_signals(
1584
+ self, dataset_name: str, dataset_version: int, row: RowDict
1585
+ ) -> Optional[dict]:
1586
+ """
1587
+ Function that returns file signals from dataset row.
1588
+ Note that signal names are without prefix, so if there was 'laion__file__source'
1589
+ in original row, result will have just 'source'
1590
+ Example output:
1591
+ {
1592
+ "source": "s3://ldb-public",
1593
+ "parent": "animals/dogs",
1594
+ "name": "dog.jpg",
1595
+ ...
1596
+ }
1597
+ """
1598
+ from datachain.lib.signal_schema import SignalSchema
1599
+
1600
+ version = self.get_dataset(dataset_name).get_version(dataset_version)
1601
+
1602
+ file_signals_values = SignalSchema.deserialize(
1603
+ version.feature_schema
1604
+ ).get_file_signals_values(row)
1605
+ if not file_signals_values:
1606
+ return None
1607
+
1608
+ # there can be multiple file signals in a schema, but taking the first
1609
+ # one for now. In future we might add ability to choose from which one
1610
+ # to open object
1611
+ return next(iter(file_signals_values.values()))
1612
+
1613
+ def open_object(
1614
+ self,
1615
+ dataset_name: str,
1616
+ dataset_version: int,
1617
+ row: RowDict,
1618
+ use_cache: bool = True,
1619
+ **config: Any,
1620
+ ):
1621
+ file_signals = self.get_file_signals(dataset_name, dataset_version, row)
1622
+ if not file_signals:
1623
+ raise RuntimeError("Cannot open object without file signals")
1624
+
1584
1625
  config = config or self.client_config
1585
- client = self.get_client(row["source"], **config)
1586
- return client.open_object(self._get_row_uid(row), use_cache=use_cache)
1626
+ client = self.get_client(file_signals["source"], **config)
1627
+ return client.open_object(
1628
+ self._get_row_uid(file_signals), # type: ignore [arg-type]
1629
+ use_cache=use_cache,
1630
+ )
1587
1631
 
1588
1632
  def _get_row_uid(self, row: RowDict) -> UniqueId:
1589
1633
  return UniqueId(
@@ -1142,6 +1142,8 @@ class AbstractDBMetastore(AbstractMetastore):
1142
1142
  if field == "schema":
1143
1143
  dataset_version.update(**{field: DatasetRecord.parse_schema(value)})
1144
1144
  values[field] = json.dumps(value) if value else None
1145
+ elif field == "feature_schema":
1146
+ values[field] = json.dumps(value) if value else None
1145
1147
  elif field == "preview" and isinstance(value, list):
1146
1148
  values[field] = json.dumps(value, cls=JSONSerialize)
1147
1149
  else:
@@ -157,7 +157,7 @@ class DatasetVersion:
157
157
  dataset_id: int
158
158
  version: int
159
159
  status: int
160
- feature_schema: Optional[str]
160
+ feature_schema: dict
161
161
  created_at: datetime
162
162
  finished_at: Optional[datetime]
163
163
  error_message: str
@@ -199,7 +199,7 @@ class DatasetVersion:
199
199
  dataset_id,
200
200
  version,
201
201
  status,
202
- feature_schema,
202
+ json.loads(feature_schema) if feature_schema else {},
203
203
  created_at,
204
204
  finished_at,
205
205
  error_message,
@@ -263,9 +263,9 @@ class DatasetRecord:
263
263
  labels: list[str]
264
264
  shadow: bool
265
265
  schema: dict[str, Union[SQLType, type[SQLType]]]
266
+ feature_schema: dict
266
267
  versions: list[DatasetVersion]
267
268
  status: int = DatasetStatus.CREATED
268
- feature_schema: Optional[dict] = None
269
269
  created_at: Optional[datetime] = None
270
270
  finished_at: Optional[datetime] = None
271
271
  error_message: str = ""
@@ -320,8 +320,6 @@ class DatasetRecord:
320
320
  version_job_id: Optional[str] = None,
321
321
  version_is_job_result: bool = False,
322
322
  ) -> "DatasetRecord":
323
- fr_schema = json.loads(feature_schema) if feature_schema else {}
324
-
325
323
  labels_lst: list[str] = json.loads(labels) if labels else []
326
324
  schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
327
325
  version_schema_dct: dict[str, str] = (
@@ -333,7 +331,7 @@ class DatasetRecord:
333
331
  version_dataset_id,
334
332
  version,
335
333
  version_status,
336
- fr_schema,
334
+ version_feature_schema,
337
335
  version_created_at,
338
336
  version_finished_at,
339
337
  version_error_message,
@@ -356,9 +354,9 @@ class DatasetRecord:
356
354
  labels_lst,
357
355
  bool(shadow),
358
356
  cls.parse_schema(schema_dct), # type: ignore[arg-type]
357
+ json.loads(feature_schema) if feature_schema else {},
359
358
  [dataset_version],
360
359
  status,
361
- fr_schema,
362
360
  created_at,
363
361
  finished_at,
364
362
  error_message,
@@ -6,6 +6,7 @@ import sqlalchemy
6
6
  from datachain.lib.feature import Feature, FeatureType
7
7
  from datachain.lib.feature_utils import features_to_tuples
8
8
  from datachain.lib.file import File, get_file
9
+ from datachain.lib.meta_formats import read_meta
9
10
  from datachain.lib.settings import Settings
10
11
  from datachain.lib.signal_schema import SignalSchema
11
12
  from datachain.lib.udf import (
@@ -219,6 +220,89 @@ class DataChain(DatasetQuery):
219
220
  """
220
221
  return DataChain(name=name, version=version)
221
222
 
223
+ @classmethod
224
+ def from_csv(
225
+ cls,
226
+ path,
227
+ type: Literal["binary", "text", "image"] = "text",
228
+ anon: bool = False,
229
+ spec: Optional[FeatureType] = None,
230
+ schema_from: Optional[str] = "auto",
231
+ show_schema: Optional[bool] = False,
232
+ ) -> "DataChain":
233
+ """Get data from CSV. It returns the chain itself.
234
+
235
+ Parameters
236
+ ----------
237
+ path : storage URI with directory. URI must start with storage prefix such
238
+ as `s3://`, `gs://`, `az://` or "file:///"
239
+ type : read file as "binary", "text", or "image" data. Default is "binary".
240
+ anon : use anonymous mode to access the storage.
241
+ spec : optional Data Model
242
+ schema_from : path to sample to infer spec from
243
+ show_schema : print auto-generated schema
244
+
245
+ Examples
246
+ --------
247
+
248
+ >>> chain = DataChain.from_csv("gs://csv")
249
+ """
250
+ if schema_from == "auto":
251
+ schema_from = path
252
+
253
+ chain = DataChain.from_storage(path=path, type=type, anon=anon)
254
+ return chain.gen(
255
+ csv=read_meta(
256
+ schema_from=schema_from,
257
+ meta_type="csv",
258
+ spec=spec,
259
+ show_schema=show_schema,
260
+ )
261
+ )
262
+
263
+ @classmethod
264
+ def from_json(
265
+ cls,
266
+ path,
267
+ type: Literal["binary", "text", "image"] = "text",
268
+ anon: bool = False,
269
+ spec: Optional[FeatureType] = None,
270
+ schema_from: Optional[str] = "auto",
271
+ jmespath: Optional[str] = None,
272
+ show_schema: Optional[bool] = False,
273
+ ) -> "DataChain":
274
+ """Get data from CSV. It returns the chain itself.
275
+
276
+ Parameters
277
+ ----------
278
+ path : storage URI with directory. URI must start with storage prefix such
279
+ as `s3://`, `gs://`, `az://` or "file:///"
280
+ type : read file as "binary", "text", or "image" data. Default is "binary".
281
+ anon : use anonymous mode to access the storage.
282
+ spec : optional Data Model
283
+ schema_from : path to sample to infer spec from
284
+ show_schema : print auto-generated schema
285
+ jmespath : JMESPATH expression to reduce JSON
286
+ name : return object name
287
+ Examples
288
+ --------
289
+
290
+ >>> chain = DataChain.from_json("gs://json")
291
+ """
292
+ if schema_from == "auto":
293
+ schema_from = path
294
+
295
+ chain = DataChain.from_storage(path=path, type=type, anon=anon)
296
+ return chain.gen(
297
+ json=read_meta(
298
+ schema_from=schema_from,
299
+ meta_type="json",
300
+ spec=spec,
301
+ show_schema=show_schema,
302
+ jmespath=jmespath,
303
+ )
304
+ )
305
+
222
306
  def save( # type: ignore[override]
223
307
  self, name: Optional[str] = None, version: Optional[int] = None
224
308
  ) -> "DataChain":
@@ -408,7 +492,7 @@ class DataChain(DatasetQuery):
408
492
  chain.signals_schema = new_schema
409
493
  return chain
410
494
 
411
- def get_values(self) -> Iterator[Sequence]:
495
+ def get_values(self) -> Iterator[list]:
412
496
  """Iterate over rows, getting feature values and applying reader calls."""
413
497
  for features in self.iterate():
414
498
  yield [fr.get_value() if isinstance(fr, Feature) else fr for fr in features]
@@ -607,3 +691,35 @@ class DataChain(DatasetQuery):
607
691
 
608
692
    def max(self, fr: FeatureType):  # type: ignore[override]
        """Compute the maximum of the given feature(s) via the aggregation machinery."""
        return self._extend_features("max", fr)
694
+
695
    @detach
    def gen_random(self) -> "DataChain":
        """Ensure every row has a persisted `random` value for ordering.

        If the schema has no `random` signal yet, map a random int onto each
        row and save the chain so the values are materialized and stable.
        NOTE(review): `select_except("random")` then hides the signal from the
        selected schema — presumably the column persists in the saved dataset
        for `shuffle`/`chunk` to order by; confirm against warehouse behavior.
        """
        from random import getrandbits

        from datachain.data_storage.warehouse import RANDOM_BITS

        if "random" not in self.signals_schema.values:
            # save() materializes the generated values before they are hidden
            chain = self.map(random=lambda: getrandbits(RANDOM_BITS), output=int).save()
            return chain.select_except("random")

        return self
706
+
707
+ @detach
708
+ def shuffle(self) -> "DataChain":
709
+ """Return results in deterministic random order."""
710
+ chain = self.gen_random()
711
+ return DatasetQuery.shuffle(chain)
712
+
713
+ @detach
714
+ def chunk(self, index: int, total: int) -> "DataChain":
715
+ """Split a query into smaller chunks for e.g. parallelization.
716
+ Example:
717
+ >>> dc = DataChain(...)
718
+ >>> chunk_1 = dc._chunk(0, 2)
719
+ >>> chunk_2 = dc._chunk(1, 2)
720
+ Note:
721
+ Bear in mind that `index` is 0-indexed but `total` isn't.
722
+ Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
723
+ """
724
+ chain = self.gen_random()
725
+ return DatasetQuery.chunk(chain, index, total)
@@ -78,16 +78,6 @@ DATACHAIN_TO_TYPE = {
78
78
  JSON: dict,
79
79
  }
80
80
 
81
- NAMES_TO_TYPES = {
82
- "int": int,
83
- "str": str,
84
- "float": float,
85
- "bool": bool,
86
- "list": list,
87
- "dict": dict,
88
- "bytes": bytes,
89
- "datetime": datetime,
90
- }
91
81
 
92
82
  NUMPY_TO_DATACHAIN = {
93
83
  np.dtype("int8"): Int,
@@ -0,0 +1,164 @@
1
+ # pip install datamodel-code-generator
2
+ # pip install jmespath
3
+ #
4
+ import csv
5
+ import io
6
+ import json
7
+ import subprocess
8
+ import sys
9
+ import uuid
10
+ from collections.abc import Iterator
11
+ from typing import Any, Callable
12
+
13
+ import jmespath as jsp
14
+
15
+ from datachain.lib.feature_utils import pydantic_to_feature # noqa: F401
16
+ from datachain.lib.file import File
17
+
18
+ # from datachain.lib.dc import C, DataChain
19
+
20
+
21
def generate_uuid():
    """Return a fresh random (version 4) UUID."""
    return uuid.uuid4()
23
+
24
+
25
# JSON decoder
def load_json_from_string(json_string):
    """Parse *json_string* and return the object, or None if it is invalid JSON.

    Both status messages go to stderr: callers (read_meta) capture stdout to
    collect generated model code, so printing the failure message to stdout
    (as before) would corrupt the captured output.
    """
    try:
        data = json.loads(json_string)
    except json.JSONDecodeError:
        print(
            "Failed to decode JSON: The string is not formatted correctly.",
            file=sys.stderr,
        )
        return None
    print("Successfully parsed JSON", file=sys.stderr)
    return data
34
+
35
+
36
# Read valid JSON and return a data object sample
def process_json(data_string, jmespath):
    """Return a JSON string sampling one object from *data_string*.

    If *jmespath* is given the expression is applied first and its result must
    be a list (ValueError otherwise). Without a jmespath expression, non-list
    JSON is allowed and returned whole so the root schema can be printed;
    the original code raised here, contradicting its own comment, and had an
    unreachable `return None` after the raise.
    """
    json_dict = load_json_from_string(data_string)
    if jmespath:
        json_dict = jsp.search(jmespath, json_dict)
        # if a jmespath expression is given, we assume the result is a list
        if not isinstance(json_dict, list):
            raise ValueError("JMESPATH expression must resolve to a list")
    if isinstance(json_dict, list):
        json_dict = json_dict[0]  # sample the first object
    return json.dumps(json_dict)
48
+
49
+
50
# Print a dynamic datamodel-codegen output from JSON or CSV on stdout
def read_schema(source_file, data_type="csv", expr=None):
    """Print a generated Pydantic model for *source_file* on stdout.

    Samples the file (two lines for CSV, the whole file otherwise), pipes the
    sample through the external `datamodel-codegen` tool, and prints the
    generated model plus a `spec=pydantic_to_feature(...)` line. Stdout is the
    transport: read_meta captures and exec's it.
    NOTE(review): on OSError the error message is printed to stdout and will
    be captured/exec'd by the caller — presumably it should go to stderr;
    confirm intended behavior before changing.
    """
    data_string = ""
    uid_str = str(generate_uuid()).replace("-", "")  # comply with Python class names
    # using uiid to get around issue #1617
    model_name = f"Model{uid_str}"
    try:
        with source_file.open() as fd:  # CSV can be larger than memory
            if data_type == "csv":
                # header line + one data row is enough for type inference
                data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
                data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
            else:
                data_string = fd.read()  # other meta must fit into RAM
    except OSError as e:
        print(f"An unexpected file error occurred: {e}")
        return
    if data_type == "json":
        # reduce JSON to a single sampled object (optionally via jmespath expr)
        data_string = process_json(data_string, expr)
    command = [
        "datamodel-codegen",
        "--input-file-type",
        data_type,
        "--class-name",
        model_name,
    ]
    try:
        result = subprocess.run(
            command,  # noqa: S603
            input=data_string,
            text=True,
            capture_output=True,
            check=True,
        )
        model_output = (
            result.stdout
        )  # This will contain the output from datamodel-codegen
    except subprocess.CalledProcessError as e:
        model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
    print(f"{model_output}")
    # emit the line read_meta's exec() uses to bind `spec` to a Feature class
    print("\n" + f"spec=pydantic_to_feature({model_name})" + "\n")
90
+
91
+
92
+ #
93
+ # UDF mapper which calls chain in the setup to infer the dynamic schema
94
+ #
95
+ def read_meta(
96
+ spec=None, schema_from=None, meta_type="json", jmespath=None, show_schema=False
97
+ ) -> Callable:
98
+ from datachain.lib.dc import DataChain
99
+
100
+ # ugly hack: datachain is run redirecting printed outputs to a variable
101
+ if schema_from:
102
+ captured_output = io.StringIO()
103
+ current_stdout = sys.stdout
104
+ sys.stdout = captured_output
105
+ try:
106
+ chain = (
107
+ DataChain.from_storage(schema_from)
108
+ .limit(1)
109
+ .map( # dummy column created (#1615)
110
+ meta_schema=lambda file: read_schema(
111
+ file, data_type=meta_type, expr=jmespath
112
+ ),
113
+ output=str,
114
+ )
115
+ )
116
+ # dummy executor (#1616)
117
+ chain.save()
118
+ finally:
119
+ sys.stdout = current_stdout
120
+ model_output = captured_output.getvalue()
121
+ captured_output.close()
122
+ if show_schema:
123
+ print(f"{model_output}")
124
+ # Below 'spec' should be a dynamically converted Feature from Pydantic datamodel
125
+ if not spec:
126
+ local_vars: dict[str, Any] = {}
127
+ exec(model_output, globals(), local_vars) # noqa: S102
128
+ spec = local_vars["spec"]
129
+
130
+ if not (spec) and not (schema_from):
131
+ raise ValueError(
132
+ "Must provide a static schema in spec: or metadata sample in schema_from:"
133
+ )
134
+
135
+ #
136
+ # UDF mapper parsing a JSON or CSV file using schema spec
137
+ #
138
+ def parse_data(
139
+ file: File, data_model=spec, meta_type=meta_type, jmespath=jmespath
140
+ ) -> Iterator[spec]:
141
+ if meta_type == "csv":
142
+ with (
143
+ file.open() as fd
144
+ ): # TODO: if schema is statically given, should allow CSV without headers
145
+ reader = csv.DictReader(fd)
146
+ for row in reader: # CSV can be larger than memory
147
+ json_string = json.dumps(row)
148
+ yield data_model.model_validate_json(json_string)
149
+ if meta_type == "json":
150
+ try:
151
+ with file.open() as fd: # JSON must fit into RAM
152
+ data_string = fd.read()
153
+ except OSError as e:
154
+ print(f"An unexpected file error occurred: {e}")
155
+ json_object = load_json_from_string(data_string)
156
+ if jmespath:
157
+ json_object = jsp.search(jmespath, json_object)
158
+ if not isinstance(json_object, list):
159
+ raise ValueError("JSON expression must resolve in a list of objects")
160
+ for json_dict in json_object:
161
+ json_string = json.dumps(json_dict)
162
+ yield data_model.model_validate_json(json_string)
163
+
164
+ return parse_data