datachain-0.3.14.tar.gz → datachain-0.3.15.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain has been flagged as potentially problematic; see the package's registry page for details.

Files changed (246)
  1. {datachain-0.3.14/src/datachain.egg-info → datachain-0.3.15}/PKG-INFO +1 -1
  2. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/catalog/catalog.py +18 -29
  3. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/client/fsspec.py +9 -8
  4. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/dataset.py +1 -1
  5. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/arrow.py +51 -16
  6. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/dc.py +7 -2
  7. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/file.py +76 -2
  8. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/hf.py +23 -6
  9. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/listing.py +7 -5
  10. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/listing_info.py +2 -2
  11. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/signal_schema.py +11 -2
  12. datachain-0.3.15/src/datachain/lib/tar.py +33 -0
  13. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/webdataset.py +3 -59
  14. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/query/dataset.py +2 -6
  15. {datachain-0.3.14 → datachain-0.3.15/src/datachain.egg-info}/PKG-INFO +1 -1
  16. {datachain-0.3.14 → datachain-0.3.15}/src/datachain.egg-info/SOURCES.txt +1 -0
  17. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_datachain.py +34 -2
  18. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_dataset_query.py +0 -19
  19. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_datasets.py +4 -3
  20. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_listing.py +2 -1
  21. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_arrow.py +24 -5
  22. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_datachain.py +2 -2
  23. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_datachain_bootstrap.py +38 -19
  24. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_file.py +84 -1
  25. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_hf.py +8 -8
  26. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_signal_schema.py +16 -0
  27. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_client.py +32 -24
  28. {datachain-0.3.14 → datachain-0.3.15}/.cruft.json +0 -0
  29. {datachain-0.3.14 → datachain-0.3.15}/.gitattributes +0 -0
  30. {datachain-0.3.14 → datachain-0.3.15}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  31. {datachain-0.3.14 → datachain-0.3.15}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  32. {datachain-0.3.14 → datachain-0.3.15}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  33. {datachain-0.3.14 → datachain-0.3.15}/.github/codecov.yaml +0 -0
  34. {datachain-0.3.14 → datachain-0.3.15}/.github/dependabot.yml +0 -0
  35. {datachain-0.3.14 → datachain-0.3.15}/.github/workflows/benchmarks.yml +0 -0
  36. {datachain-0.3.14 → datachain-0.3.15}/.github/workflows/release.yml +0 -0
  37. {datachain-0.3.14 → datachain-0.3.15}/.github/workflows/tests-studio.yml +0 -0
  38. {datachain-0.3.14 → datachain-0.3.15}/.github/workflows/tests.yml +0 -0
  39. {datachain-0.3.14 → datachain-0.3.15}/.github/workflows/update-template.yaml +0 -0
  40. {datachain-0.3.14 → datachain-0.3.15}/.gitignore +0 -0
  41. {datachain-0.3.14 → datachain-0.3.15}/.pre-commit-config.yaml +0 -0
  42. {datachain-0.3.14 → datachain-0.3.15}/CODE_OF_CONDUCT.rst +0 -0
  43. {datachain-0.3.14 → datachain-0.3.15}/CONTRIBUTING.rst +0 -0
  44. {datachain-0.3.14 → datachain-0.3.15}/LICENSE +0 -0
  45. {datachain-0.3.14 → datachain-0.3.15}/README.rst +0 -0
  46. {datachain-0.3.14 → datachain-0.3.15}/docs/assets/captioned_cartoons.png +0 -0
  47. {datachain-0.3.14 → datachain-0.3.15}/docs/assets/datachain-white.svg +0 -0
  48. {datachain-0.3.14 → datachain-0.3.15}/docs/assets/datachain.svg +0 -0
  49. {datachain-0.3.14 → datachain-0.3.15}/docs/assets/flowchart.png +0 -0
  50. {datachain-0.3.14 → datachain-0.3.15}/docs/index.md +0 -0
  51. {datachain-0.3.14 → datachain-0.3.15}/docs/references/datachain.md +0 -0
  52. {datachain-0.3.14 → datachain-0.3.15}/docs/references/datatype.md +0 -0
  53. {datachain-0.3.14 → datachain-0.3.15}/docs/references/file.md +0 -0
  54. {datachain-0.3.14 → datachain-0.3.15}/docs/references/index.md +0 -0
  55. {datachain-0.3.14 → datachain-0.3.15}/docs/references/sql.md +0 -0
  56. {datachain-0.3.14 → datachain-0.3.15}/docs/references/torch.md +0 -0
  57. {datachain-0.3.14 → datachain-0.3.15}/docs/references/udf.md +0 -0
  58. {datachain-0.3.14 → datachain-0.3.15}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  59. {datachain-0.3.14 → datachain-0.3.15}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  60. {datachain-0.3.14 → datachain-0.3.15}/examples/computer_vision/openimage-detect.py +0 -0
  61. {datachain-0.3.14 → datachain-0.3.15}/examples/get_started/common_sql_functions.py +0 -0
  62. {datachain-0.3.14 → datachain-0.3.15}/examples/get_started/json-csv-reader.py +0 -0
  63. {datachain-0.3.14 → datachain-0.3.15}/examples/get_started/torch-loader.py +0 -0
  64. {datachain-0.3.14 → datachain-0.3.15}/examples/get_started/udfs/parallel.py +0 -0
  65. {datachain-0.3.14 → datachain-0.3.15}/examples/get_started/udfs/simple.py +0 -0
  66. {datachain-0.3.14 → datachain-0.3.15}/examples/get_started/udfs/stateful.py +0 -0
  67. {datachain-0.3.14 → datachain-0.3.15}/examples/llm_and_nlp/claude-query.py +0 -0
  68. {datachain-0.3.14 → datachain-0.3.15}/examples/llm_and_nlp/unstructured-text.py +0 -0
  69. {datachain-0.3.14 → datachain-0.3.15}/examples/multimodal/clip_inference.py +0 -0
  70. {datachain-0.3.14 → datachain-0.3.15}/examples/multimodal/hf_pipeline.py +0 -0
  71. {datachain-0.3.14 → datachain-0.3.15}/examples/multimodal/openai_image_desc_lib.py +0 -0
  72. {datachain-0.3.14 → datachain-0.3.15}/examples/multimodal/wds.py +0 -0
  73. {datachain-0.3.14 → datachain-0.3.15}/examples/multimodal/wds_filtered.py +0 -0
  74. {datachain-0.3.14 → datachain-0.3.15}/mkdocs.yml +0 -0
  75. {datachain-0.3.14 → datachain-0.3.15}/noxfile.py +0 -0
  76. {datachain-0.3.14 → datachain-0.3.15}/pyproject.toml +0 -0
  77. {datachain-0.3.14 → datachain-0.3.15}/setup.cfg +0 -0
  78. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/__init__.py +0 -0
  79. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/__main__.py +0 -0
  80. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/asyn.py +0 -0
  81. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/cache.py +0 -0
  82. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/catalog/__init__.py +0 -0
  83. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/catalog/datasource.py +0 -0
  84. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/catalog/loader.py +0 -0
  85. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/cli.py +0 -0
  86. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/cli_utils.py +0 -0
  87. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/client/__init__.py +0 -0
  88. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/client/azure.py +0 -0
  89. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/client/fileslice.py +0 -0
  90. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/client/gcs.py +0 -0
  91. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/client/hf.py +0 -0
  92. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/client/local.py +0 -0
  93. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/client/s3.py +0 -0
  94. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/config.py +0 -0
  95. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/data_storage/__init__.py +0 -0
  96. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/data_storage/db_engine.py +0 -0
  97. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/data_storage/id_generator.py +0 -0
  98. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/data_storage/job.py +0 -0
  99. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/data_storage/metastore.py +0 -0
  100. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/data_storage/schema.py +0 -0
  101. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/data_storage/serializer.py +0 -0
  102. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/data_storage/sqlite.py +0 -0
  103. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/data_storage/warehouse.py +0 -0
  104. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/error.py +0 -0
  105. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/job.py +0 -0
  106. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/__init__.py +0 -0
  107. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/clip.py +0 -0
  108. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/convert/__init__.py +0 -0
  109. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/convert/flatten.py +0 -0
  110. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/convert/python_to_sql.py +0 -0
  111. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/convert/sql_to_python.py +0 -0
  112. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/convert/unflatten.py +0 -0
  113. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  114. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/data_model.py +0 -0
  115. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/dataset_info.py +0 -0
  116. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/image.py +0 -0
  117. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/meta_formats.py +0 -0
  118. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/model_store.py +0 -0
  119. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/pytorch.py +0 -0
  120. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/settings.py +0 -0
  121. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/text.py +0 -0
  122. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/udf.py +0 -0
  123. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/udf_signature.py +0 -0
  124. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/utils.py +0 -0
  125. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/vfile.py +0 -0
  126. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/lib/webdataset_laion.py +0 -0
  127. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/listing.py +0 -0
  128. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/node.py +0 -0
  129. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/nodes_fetcher.py +0 -0
  130. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/nodes_thread_pool.py +0 -0
  131. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/progress.py +0 -0
  132. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/py.typed +0 -0
  133. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/query/__init__.py +0 -0
  134. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/query/batch.py +0 -0
  135. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/query/builtins.py +0 -0
  136. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/query/dispatch.py +0 -0
  137. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/query/metrics.py +0 -0
  138. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/query/params.py +0 -0
  139. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/query/queue.py +0 -0
  140. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/query/schema.py +0 -0
  141. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/query/session.py +0 -0
  142. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/query/udf.py +0 -0
  143. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/remote/__init__.py +0 -0
  144. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/remote/studio.py +0 -0
  145. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/__init__.py +0 -0
  146. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/default/__init__.py +0 -0
  147. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/default/base.py +0 -0
  148. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/functions/__init__.py +0 -0
  149. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/functions/array.py +0 -0
  150. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/functions/conditional.py +0 -0
  151. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/functions/path.py +0 -0
  152. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/functions/random.py +0 -0
  153. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/functions/string.py +0 -0
  154. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/selectable.py +0 -0
  155. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/sqlite/__init__.py +0 -0
  156. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/sqlite/base.py +0 -0
  157. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/sqlite/types.py +0 -0
  158. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/sqlite/vector.py +0 -0
  159. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/types.py +0 -0
  160. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/sql/utils.py +0 -0
  161. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/storage.py +0 -0
  162. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/torch/__init__.py +0 -0
  163. {datachain-0.3.14 → datachain-0.3.15}/src/datachain/utils.py +0 -0
  164. {datachain-0.3.14 → datachain-0.3.15}/src/datachain.egg-info/dependency_links.txt +0 -0
  165. {datachain-0.3.14 → datachain-0.3.15}/src/datachain.egg-info/entry_points.txt +0 -0
  166. {datachain-0.3.14 → datachain-0.3.15}/src/datachain.egg-info/requires.txt +0 -0
  167. {datachain-0.3.14 → datachain-0.3.15}/src/datachain.egg-info/top_level.txt +0 -0
  168. {datachain-0.3.14 → datachain-0.3.15}/tests/__init__.py +0 -0
  169. {datachain-0.3.14 → datachain-0.3.15}/tests/benchmarks/__init__.py +0 -0
  170. {datachain-0.3.14 → datachain-0.3.15}/tests/benchmarks/conftest.py +0 -0
  171. {datachain-0.3.14 → datachain-0.3.15}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  172. {datachain-0.3.14 → datachain-0.3.15}/tests/benchmarks/datasets/.dvc/config +0 -0
  173. {datachain-0.3.14 → datachain-0.3.15}/tests/benchmarks/datasets/.gitignore +0 -0
  174. {datachain-0.3.14 → datachain-0.3.15}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  175. {datachain-0.3.14 → datachain-0.3.15}/tests/benchmarks/test_datachain.py +0 -0
  176. {datachain-0.3.14 → datachain-0.3.15}/tests/benchmarks/test_ls.py +0 -0
  177. {datachain-0.3.14 → datachain-0.3.15}/tests/benchmarks/test_version.py +0 -0
  178. {datachain-0.3.14 → datachain-0.3.15}/tests/conftest.py +0 -0
  179. {datachain-0.3.14 → datachain-0.3.15}/tests/data.py +0 -0
  180. {datachain-0.3.14 → datachain-0.3.15}/tests/examples/__init__.py +0 -0
  181. {datachain-0.3.14 → datachain-0.3.15}/tests/examples/test_examples.py +0 -0
  182. {datachain-0.3.14 → datachain-0.3.15}/tests/examples/test_wds_e2e.py +0 -0
  183. {datachain-0.3.14 → datachain-0.3.15}/tests/examples/wds_data.py +0 -0
  184. {datachain-0.3.14 → datachain-0.3.15}/tests/func/__init__.py +0 -0
  185. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_catalog.py +0 -0
  186. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_client.py +0 -0
  187. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_feature_pickling.py +0 -0
  188. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_ls.py +0 -0
  189. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_meta_formats.py +0 -0
  190. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_metrics.py +0 -0
  191. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_pull.py +0 -0
  192. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_pytorch.py +0 -0
  193. {datachain-0.3.14 → datachain-0.3.15}/tests/func/test_query.py +0 -0
  194. {datachain-0.3.14 → datachain-0.3.15}/tests/scripts/feature_class.py +0 -0
  195. {datachain-0.3.14 → datachain-0.3.15}/tests/scripts/feature_class_parallel.py +0 -0
  196. {datachain-0.3.14 → datachain-0.3.15}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  197. {datachain-0.3.14 → datachain-0.3.15}/tests/scripts/name_len_slow.py +0 -0
  198. {datachain-0.3.14 → datachain-0.3.15}/tests/test_cli_e2e.py +0 -0
  199. {datachain-0.3.14 → datachain-0.3.15}/tests/test_query_e2e.py +0 -0
  200. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/__init__.py +0 -0
  201. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/__init__.py +0 -0
  202. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/conftest.py +0 -0
  203. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_clip.py +0 -0
  204. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_datachain_merge.py +0 -0
  205. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_feature.py +0 -0
  206. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_feature_utils.py +0 -0
  207. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_image.py +0 -0
  208. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_schema.py +0 -0
  209. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_sql_to_python.py +0 -0
  210. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_text.py +0 -0
  211. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_udf_signature.py +0 -0
  212. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_utils.py +0 -0
  213. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/lib/test_webdataset.py +0 -0
  214. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/sql/__init__.py +0 -0
  215. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/sql/sqlite/__init__.py +0 -0
  216. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/sql/sqlite/test_utils.py +0 -0
  217. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/sql/test_array.py +0 -0
  218. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/sql/test_conditional.py +0 -0
  219. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/sql/test_path.py +0 -0
  220. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/sql/test_random.py +0 -0
  221. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/sql/test_selectable.py +0 -0
  222. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/sql/test_string.py +0 -0
  223. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_asyn.py +0 -0
  224. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_cache.py +0 -0
  225. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_catalog.py +0 -0
  226. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_catalog_loader.py +0 -0
  227. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_cli_parsing.py +0 -0
  228. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_client_s3.py +0 -0
  229. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_data_storage.py +0 -0
  230. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_database_engine.py +0 -0
  231. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_dataset.py +0 -0
  232. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_dispatch.py +0 -0
  233. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_fileslice.py +0 -0
  234. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_id_generator.py +0 -0
  235. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_listing.py +0 -0
  236. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_metastore.py +0 -0
  237. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_module_exports.py +0 -0
  238. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_query_metrics.py +0 -0
  239. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_query_params.py +0 -0
  240. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_serializer.py +0 -0
  241. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_session.py +0 -0
  242. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_storage.py +0 -0
  243. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_udf.py +0 -0
  244. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_utils.py +0 -0
  245. {datachain-0.3.14 → datachain-0.3.15}/tests/unit/test_warehouse.py +0 -0
  246. {datachain-0.3.14 → datachain-0.3.15}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.14
3
+ Version: 0.3.15
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -621,10 +621,6 @@ class Catalog:
621
621
  code_ast.body[-1:] = new_expressions
622
622
  return code_ast
623
623
 
624
- def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
625
- config = config or self.client_config
626
- return Client.parse_url(uri, self.cache, **config)
627
-
628
624
  def get_client(self, uri: StorageURI, **config: Any) -> Client:
629
625
  """
630
626
  Return the client corresponding to the given source `uri`.
@@ -651,17 +647,16 @@ class Catalog:
651
647
  partial_path: Optional[str]
652
648
 
653
649
  client_config = client_config or self.client_config
654
- client, path = self.parse_url(source, **client_config)
650
+ uri, path = Client.parse_url(source)
651
+ client = Client.get_client(source, self.cache, **client_config)
655
652
  stem = os.path.basename(os.path.normpath(path))
656
653
  prefix = (
657
654
  posixpath.dirname(path)
658
655
  if glob.has_magic(stem) or client.fs.isfile(source)
659
656
  else path
660
657
  )
661
- storage_dataset_name = Storage.dataset_name(
662
- client.uri, posixpath.join(prefix, "")
663
- )
664
- source_metastore = self.metastore.clone(client.uri)
658
+ storage_dataset_name = Storage.dataset_name(uri, posixpath.join(prefix, ""))
659
+ source_metastore = self.metastore.clone(uri)
665
660
 
666
661
  columns = [
667
662
  Column("path", String),
@@ -675,15 +670,13 @@ class Catalog:
675
670
  ]
676
671
 
677
672
  if skip_indexing:
678
- source_metastore.create_storage_if_not_registered(client.uri)
679
- storage = source_metastore.get_storage(client.uri)
680
- source_metastore.init_partial_id(client.uri)
681
- partial_id = source_metastore.get_next_partial_id(client.uri)
673
+ source_metastore.create_storage_if_not_registered(uri)
674
+ storage = source_metastore.get_storage(uri)
675
+ source_metastore.init_partial_id(uri)
676
+ partial_id = source_metastore.get_next_partial_id(uri)
682
677
 
683
- source_metastore = self.metastore.clone(
684
- uri=client.uri, partial_id=partial_id
685
- )
686
- source_metastore.init(client.uri)
678
+ source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
679
+ source_metastore.init(uri)
687
680
 
688
681
  source_warehouse = self.warehouse.clone()
689
682
  dataset = self.create_dataset(
@@ -701,20 +694,16 @@ class Catalog:
701
694
  in_progress,
702
695
  partial_id,
703
696
  partial_path,
704
- ) = source_metastore.register_storage_for_indexing(
705
- client.uri, force_update, prefix
706
- )
697
+ ) = source_metastore.register_storage_for_indexing(uri, force_update, prefix)
707
698
  if in_progress:
708
699
  raise PendingIndexingError(f"Pending indexing operation: uri={storage.uri}")
709
700
 
710
701
  if not need_index:
711
702
  assert partial_id is not None
712
703
  assert partial_path is not None
713
- source_metastore = self.metastore.clone(
714
- uri=client.uri, partial_id=partial_id
715
- )
704
+ source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
716
705
  source_warehouse = self.warehouse.clone()
717
- dataset = self.get_dataset(Storage.dataset_name(client.uri, partial_path))
706
+ dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
718
707
  lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
719
708
  logger.debug(
720
709
  "Using cached listing %s. Valid till: %s",
@@ -731,11 +720,11 @@ class Catalog:
731
720
 
732
721
  return lst, path
733
722
 
734
- source_metastore.init_partial_id(client.uri)
735
- partial_id = source_metastore.get_next_partial_id(client.uri)
723
+ source_metastore.init_partial_id(uri)
724
+ partial_id = source_metastore.get_next_partial_id(uri)
736
725
 
737
- source_metastore.init(client.uri)
738
- source_metastore = self.metastore.clone(uri=client.uri, partial_id=partial_id)
726
+ source_metastore.init(uri)
727
+ source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
739
728
 
740
729
  source_warehouse = self.warehouse.clone()
741
730
 
@@ -1370,7 +1359,7 @@ class Catalog:
1370
1359
 
1371
1360
  def signed_url(self, source: str, path: str, client_config=None) -> str:
1372
1361
  client_config = client_config or self.client_config
1373
- client, _ = self.parse_url(source, **client_config)
1362
+ client = Client.get_client(source, self.cache, **client_config)
1374
1363
  return client.url(path)
1375
1364
 
1376
1365
  def export_dataset_table(
@@ -116,15 +116,16 @@ class Client(ABC):
116
116
  return DATA_SOURCE_URI_PATTERN.match(name) is not None
117
117
 
118
118
  @staticmethod
119
- def parse_url(
120
- source: str,
121
- cache: DataChainCache,
122
- **kwargs,
123
- ) -> tuple["Client", str]:
119
+ def parse_url(source: str) -> tuple[StorageURI, str]:
120
+ cls = Client.get_implementation(source)
121
+ storage_name, rel_path = cls.split_url(source)
122
+ return cls.get_uri(storage_name), rel_path
123
+
124
+ @staticmethod
125
+ def get_client(source: str, cache: DataChainCache, **kwargs) -> "Client":
124
126
  cls = Client.get_implementation(source)
125
- storage_url, rel_path = cls.split_url(source)
126
- client = cls.from_name(storage_url, cache, kwargs)
127
- return client, rel_path
127
+ storage_url, _ = cls.split_url(source)
128
+ return cls.from_name(storage_url, cache, kwargs)
128
129
 
129
130
  @classmethod
130
131
  def create_fs(cls, **kwargs) -> "AbstractFileSystem":
@@ -112,7 +112,7 @@ class DatasetDependency:
112
112
 
113
113
  if is_listing_dataset(dataset_name):
114
114
  dependency_type = DatasetDependencyType.STORAGE # type: ignore[arg-type]
115
- dependency_name = listing_uri_from_name(dataset_name)
115
+ dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))
116
116
 
117
117
  return cls(
118
118
  id,
@@ -13,8 +13,10 @@ from datachain.lib.model_store import ModelStore
13
13
  from datachain.lib.udf import Generator
14
14
 
15
15
  if TYPE_CHECKING:
16
+ from datasets.features.features import Features
16
17
  from pydantic import BaseModel
17
18
 
19
+ from datachain.lib.data_model import DataType
18
20
  from datachain.lib.dc import DataChain
19
21
 
20
22
 
@@ -46,7 +48,10 @@ class ArrowGenerator(Generator):
46
48
  self.kwargs = kwargs
47
49
 
48
50
  def process(self, file: File):
49
- if self.nrows:
51
+ if file._caching_enabled:
52
+ path = file.get_local_path(download=True)
53
+ ds = dataset(path, schema=self.input_schema, **self.kwargs)
54
+ elif self.nrows:
50
55
  path = _nrows_file(file, self.nrows)
51
56
  ds = dataset(path, schema=self.input_schema, **self.kwargs)
52
57
  else:
@@ -54,6 +59,7 @@ class ArrowGenerator(Generator):
54
59
  ds = dataset(
55
60
  path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
56
61
  )
62
+ hf_schema = _get_hf_schema(ds.schema)
57
63
  index = 0
58
64
  with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
59
65
  for record_batch in ds.to_batches():
@@ -62,9 +68,17 @@ class ArrowGenerator(Generator):
62
68
  if self.output_schema:
63
69
  fields = self.output_schema.model_fields
64
70
  vals_dict = {}
65
- for (field, field_info), val in zip(fields.items(), vals):
66
- if ModelStore.is_pydantic(field_info.annotation):
67
- vals_dict[field] = field_info.annotation(**val) # type: ignore[misc]
71
+ for i, ((field, field_info), val) in enumerate(
72
+ zip(fields.items(), vals)
73
+ ):
74
+ anno = field_info.annotation
75
+ if hf_schema:
76
+ from datachain.lib.hf import convert_feature
77
+
78
+ feat = list(hf_schema[0].values())[i]
79
+ vals_dict[field] = convert_feature(val, feat, anno)
80
+ elif ModelStore.is_pydantic(anno):
81
+ vals_dict[field] = anno(**val) # type: ignore[misc]
68
82
  else:
69
83
  vals_dict[field] = val
70
84
  vals = [self.output_schema(**vals_dict)]
@@ -91,26 +105,36 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
91
105
  "Error generating output from Arrow schema - "
92
106
  f"Schema has {len(schema)} columns but got {len(col_names)} column names."
93
107
  )
94
- default_column = 0
108
+ if not col_names:
109
+ col_names = schema.names
110
+ columns = _convert_col_names(col_names) # type: ignore[arg-type]
111
+ hf_schema = _get_hf_schema(schema)
112
+ if hf_schema:
113
+ return {
114
+ column: hf_type for hf_type, column in zip(hf_schema[1].values(), columns)
115
+ }
95
116
  output = {}
96
- for i, field in enumerate(schema):
97
- if col_names:
98
- column = col_names[i]
99
- else:
100
- column = field.name
101
- column = column.lower()
102
- column = re.sub("[^0-9a-z_]+", "", column)
103
- if not column:
104
- column = f"c{default_column}"
105
- default_column += 1
117
+ for field, column in zip(schema, columns):
106
118
  dtype = arrow_type_mapper(field.type, column) # type: ignore[assignment]
107
119
  if field.nullable and not ModelStore.is_pydantic(dtype):
108
120
  dtype = Optional[dtype] # type: ignore[assignment]
109
121
  output[column] = dtype
110
-
111
122
  return output
112
123
 
113
124
 
125
+ def _convert_col_names(col_names: Sequence[str]) -> list[str]:
126
+ default_column = 0
127
+ converted_col_names = []
128
+ for column in col_names:
129
+ column = column.lower()
130
+ column = re.sub("[^0-9a-z_]+", "", column)
131
+ if not column:
132
+ column = f"c{default_column}"
133
+ default_column += 1
134
+ converted_col_names.append(column)
135
+ return converted_col_names
136
+
137
+
114
138
  def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa: PLR0911
115
139
  """Convert pyarrow types to basic types."""
116
140
  from datetime import datetime
@@ -156,3 +180,14 @@ def _nrows_file(file: File, nrows: int) -> str:
156
180
  writer.write(line)
157
181
  writer.write("\n")
158
182
  return tf.name
183
+
184
+
185
+ def _get_hf_schema(
186
+ schema: "pa.Schema",
187
+ ) -> Optional[tuple["Features", dict[str, "DataType"]]]:
188
+ if schema.metadata and b"huggingface" in schema.metadata:
189
+ from datachain.lib.hf import get_output_schema, schema_from_arrow
190
+
191
+ features = schema_from_arrow(schema)
192
+ return features, get_output_schema(features)
193
+ return None
@@ -408,7 +408,11 @@ class DataChain(DatasetQuery):
408
408
  in_memory=in_memory,
409
409
  )
410
410
  .gen(
411
- list_bucket(list_uri, client_config=session.catalog.client_config),
411
+ list_bucket(
412
+ list_uri,
413
+ session.catalog.cache,
414
+ client_config=session.catalog.client_config,
415
+ ),
412
416
  output={f"{object_name}": File},
413
417
  )
414
418
  .save(list_dataset_name, listing=True)
@@ -1523,7 +1527,8 @@ class DataChain(DatasetQuery):
1523
1527
  output = {"split": str}
1524
1528
 
1525
1529
  model_name = model_name or object_name or ""
1526
- output = output | get_output_schema(next(iter(ds_dict.values())), model_name)
1530
+ hf_features = next(iter(ds_dict.values())).features
1531
+ output = output | get_output_schema(hf_features, model_name)
1527
1532
  model = dict_to_data_model(model_name, output)
1528
1533
  if object_name:
1529
1534
  output = {object_name: model}
@@ -1,5 +1,6 @@
1
1
  import io
2
2
  import json
3
+ import logging
3
4
  import os
4
5
  import posixpath
5
6
  from abc import ABC, abstractmethod
@@ -15,6 +16,9 @@ from fsspec.callbacks import DEFAULT_CALLBACK, Callback
15
16
  from PIL import Image
16
17
  from pydantic import Field, field_validator
17
18
 
19
+ if TYPE_CHECKING:
20
+ from typing_extensions import Self
21
+
18
22
  from datachain.cache import UniqueId
19
23
  from datachain.client.fileslice import FileSlice
20
24
  from datachain.lib.data_model import DataModel
@@ -25,6 +29,8 @@ from datachain.utils import TIME_ZERO
25
29
  if TYPE_CHECKING:
26
30
  from datachain.catalog import Catalog
27
31
 
32
+ logger = logging.getLogger("datachain")
33
+
28
34
  # how to create file path when exporting
29
35
  ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
30
36
 
@@ -251,14 +257,18 @@ class File(DataModel):
251
257
  dump = self.model_dump()
252
258
  return UniqueId(*(dump[k] for k in self._unique_id_keys))
253
259
 
254
- def get_local_path(self) -> Optional[str]:
260
+ def get_local_path(self, download: bool = False) -> Optional[str]:
255
261
  """Returns path to a file in a local cache.
256
262
  Return None if file is not cached. Throws an exception if cache is not setup."""
257
263
  if self._catalog is None:
258
264
  raise RuntimeError(
259
265
  "cannot resolve local file path because catalog is not setup"
260
266
  )
261
- return self._catalog.cache.get_path(self.get_uid())
267
+ uid = self.get_uid()
268
+ if download:
269
+ client = self._catalog.get_client(self.source)
270
+ client.download(uid, callback=self._download_cb)
271
+ return self._catalog.cache.get_path(uid)
262
272
 
263
273
  def get_file_suffix(self):
264
274
  """Returns last part of file name with `.`."""
@@ -313,6 +323,70 @@ class File(DataModel):
313
323
  """Returns `fsspec` filesystem for the file."""
314
324
  return self._catalog.get_client(self.source).fs
315
325
 
326
+ def resolve(self) -> "Self":
327
+ """
328
+ Resolve a File object by checking its existence and updating its metadata.
329
+
330
+ Returns:
331
+ File: The resolved File object with updated metadata.
332
+ """
333
+ if self._catalog is None:
334
+ raise RuntimeError("Cannot resolve file: catalog is not set")
335
+
336
+ try:
337
+ client = self._catalog.get_client(self.source)
338
+ except NotImplementedError as e:
339
+ raise RuntimeError(
340
+ f"Unsupported protocol for file source: {self.source}"
341
+ ) from e
342
+
343
+ try:
344
+ info = client.fs.info(client.get_full_path(self.path))
345
+ converted_info = client.info_to_file(info, self.source)
346
+ return type(self)(
347
+ path=self.path,
348
+ source=self.source,
349
+ size=converted_info.size,
350
+ etag=converted_info.etag,
351
+ version=converted_info.version,
352
+ is_latest=converted_info.is_latest,
353
+ last_modified=converted_info.last_modified,
354
+ location=self.location,
355
+ )
356
+ except (FileNotFoundError, PermissionError, OSError) as e:
357
+ logger.warning("File system error when resolving %s: %s", self.path, str(e))
358
+
359
+ return type(self)(
360
+ path=self.path,
361
+ source=self.source,
362
+ size=0,
363
+ etag="",
364
+ version="",
365
+ is_latest=True,
366
+ last_modified=TIME_ZERO,
367
+ location=self.location,
368
+ )
369
+
370
+
371
+ def resolve(file: File) -> File:
372
+ """
373
+ Resolve a File object by checking its existence and updating its metadata.
374
+
375
+ This function is a wrapper around the File.resolve() method, designed to be
376
+ used as a mapper in DataChain operations.
377
+
378
+ Args:
379
+ file (File): The File object to resolve.
380
+
381
+ Returns:
382
+ File: The resolved File object with updated metadata.
383
+
384
+ Raises:
385
+ RuntimeError: If the file's catalog is not set or if
386
+ the file source protocol is unsupported.
387
+ """
388
+ return file.resolve()
389
+
316
390
 
317
391
  class TextFile(File):
318
392
  """`DataModel` for reading text files."""
@@ -15,7 +15,7 @@ try:
15
15
  Value,
16
16
  load_dataset,
17
17
  )
18
- from datasets.features.features import string_to_arrow
18
+ from datasets.features.features import Features, string_to_arrow
19
19
  from datasets.features.image import image_to_bytes
20
20
 
21
21
  except ImportError as exc:
@@ -36,6 +36,7 @@ from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
36
36
  from datachain.lib.udf import Generator
37
37
 
38
38
  if TYPE_CHECKING:
39
+ import pyarrow as pa
39
40
  from pydantic import BaseModel
40
41
 
41
42
 
@@ -71,6 +72,15 @@ class HFGenerator(Generator):
71
72
  *args,
72
73
  **kwargs,
73
74
  ):
75
+ """
76
+ Generator for chain from huggingface datasets.
77
+
78
+ Parameters:
79
+
80
+ ds : Path or name of the dataset to read from Hugging Face Hub,
81
+ or an instance of `datasets.Dataset`-like object.
82
+ output_schema : Pydantic model for validation.
83
+ """
74
84
  super().__init__()
75
85
  self.ds = ds
76
86
  self.output_schema = output_schema
@@ -92,7 +102,7 @@ class HFGenerator(Generator):
92
102
  output_dict["split"] = split
93
103
  for name, feat in ds.features.items():
94
104
  anno = self.output_schema.model_fields[name].annotation
95
- output_dict[name] = _convert_feature(row[name], feat, anno)
105
+ output_dict[name] = convert_feature(row[name], feat, anno)
96
106
  yield self.output_schema(**output_dict)
97
107
  pbar.update(1)
98
108
 
@@ -106,7 +116,7 @@ def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
106
116
  return {"": ds}
107
117
 
108
118
 
109
- def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
119
+ def convert_feature(val: Any, feat: Any, anno: Any) -> Any: # noqa: PLR0911
110
120
  if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D)):
111
121
  return val
112
122
  if isinstance(feat, ClassLabel):
@@ -117,20 +127,23 @@ def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
117
127
  for sname in val:
118
128
  sfeat = feat.feature[sname]
119
129
  sanno = anno.model_fields[sname].annotation
120
- sdict[sname] = [_convert_feature(v, sfeat, sanno) for v in val[sname]]
130
+ sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
121
131
  return anno(**sdict)
122
132
  return val
123
133
  if isinstance(feat, Image):
134
+ if isinstance(val, dict):
135
+ return HFImage(img=val["bytes"])
124
136
  return HFImage(img=image_to_bytes(val))
125
137
  if isinstance(feat, Audio):
126
138
  return HFAudio(**val)
127
139
 
128
140
 
129
141
  def get_output_schema(
130
- ds: Union[Dataset, IterableDataset], model_name: str = ""
142
+ features: Features, model_name: str = "", stream: bool = True
131
143
  ) -> dict[str, DataType]:
144
+ """Generate UDF output schema from huggingface datasets features."""
132
145
  fields_dict = {}
133
- for name, val in ds.features.items():
146
+ for name, val in features.items():
134
147
  fields_dict[name] = _feature_to_chain_type(name, val) # type: ignore[assignment]
135
148
  return fields_dict # type: ignore[return-value]
136
149
 
@@ -165,3 +178,7 @@ def _feature_to_chain_type(name: str, val: Any) -> type: # noqa: PLR0911
165
178
  if isinstance(val, Audio):
166
179
  return HFAudio
167
180
  raise TypeError(f"Unknown huggingface datasets type {type(val)}")
181
+
182
+
183
+ def schema_from_arrow(schema: "pa.Schema"):
184
+ return Features.from_arrow_schema(schema)
@@ -20,7 +20,7 @@ LISTING_TTL = 4 * 60 * 60 # cached listing lasts 4 hours
20
20
  LISTING_PREFIX = "lst__" # listing datasets start with this name
21
21
 
22
22
 
23
- def list_bucket(uri: str, client_config=None) -> Callable:
23
+ def list_bucket(uri: str, cache, client_config=None) -> Callable:
24
24
  """
25
25
  Function that returns another generator function that yields File objects
26
26
  from bucket where each File represents one bucket entry.
@@ -28,7 +28,8 @@ def list_bucket(uri: str, client_config=None) -> Callable:
28
28
 
29
29
  def list_func() -> Iterator[File]:
30
30
  config = client_config or {}
31
- client, path = Client.parse_url(uri, None, **config) # type: ignore[arg-type]
31
+ client = Client.get_client(uri, cache, **config) # type: ignore[arg-type]
32
+ _, path = Client.parse_url(uri)
32
33
  for entries in iter_over_async(client.scandir(path.rstrip("/")), get_loop()):
33
34
  yield from entries
34
35
 
@@ -76,16 +77,17 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
76
77
  """
77
78
  Parsing uri and returns listing dataset name, listing uri and listing path
78
79
  """
79
- client, path = Client.parse_url(uri, cache, **client_config)
80
+ client = Client.get_client(uri, cache, **client_config)
81
+ storage_uri, path = Client.parse_url(uri)
80
82
 
81
83
  # clean path without globs
82
84
  lst_uri_path = (
83
85
  posixpath.dirname(path) if uses_glob(path) or client.fs.isfile(uri) else path
84
86
  )
85
87
 
86
- lst_uri = f"{client.uri}/{lst_uri_path.lstrip('/')}"
88
+ lst_uri = f"{storage_uri}/{lst_uri_path.lstrip('/')}"
87
89
  ds_name = (
88
- f"{LISTING_PREFIX}{client.uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
90
+ f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
89
91
  )
90
92
 
91
93
  return ds_name, lst_uri, path
@@ -13,8 +13,8 @@ class ListingInfo(DatasetInfo):
13
13
 
14
14
  @property
15
15
  def storage_uri(self) -> str:
16
- client, _ = Client.parse_url(self.uri, None) # type: ignore[arg-type]
17
- return client.uri
16
+ uri, _ = Client.parse_url(self.uri)
17
+ return uri
18
18
 
19
19
  @property
20
20
  def expires(self) -> Optional[datetime]:
@@ -386,11 +386,20 @@ class SignalSchema:
386
386
  else:
387
387
  json, pos = unflatten_to_json_pos(fr, row, pos) # type: ignore[union-attr]
388
388
  obj = fr(**json)
389
- if isinstance(obj, File):
390
- obj._set_stream(catalog, caching_enabled=cache)
389
+ SignalSchema._set_file_stream(obj, catalog, cache)
391
390
  res.append(obj)
392
391
  return res
393
392
 
393
+ @staticmethod
394
+ def _set_file_stream(
395
+ obj: BaseModel, catalog: "Catalog", cache: bool = False
396
+ ) -> None:
397
+ if isinstance(obj, File):
398
+ obj._set_stream(catalog, caching_enabled=cache)
399
+ for field, finfo in obj.model_fields.items():
400
+ if ModelStore.is_pydantic(finfo.annotation):
401
+ SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
402
+
394
403
  def db_signals(
395
404
  self, name: Optional[str] = None, as_columns=False
396
405
  ) -> Union[list[str], list[Column]]:
@@ -0,0 +1,33 @@
1
+ import hashlib
2
+ import tarfile
3
+ from collections.abc import Iterator
4
+
5
+ from datachain.lib.file import File, TarVFile
6
+
7
+
8
+ def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
9
+ new_parent = parent.get_full_name()
10
+ etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
11
+ etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
12
+ return File(
13
+ source=parent.source,
14
+ path=f"{new_parent}/{info.name}",
15
+ version=parent.version,
16
+ size=info.size,
17
+ etag=etag,
18
+ location=[
19
+ {
20
+ "vtype": TarVFile.get_vtype(),
21
+ "parent": parent.model_dump_custom(),
22
+ "size": info.size,
23
+ "offset": info.offset_data,
24
+ }
25
+ ],
26
+ )
27
+
28
+
29
+ def process_tar(file: File) -> Iterator[File]:
30
+ with file.open() as fd:
31
+ with tarfile.open(fileobj=fd) as tar:
32
+ for entry in tar.getmembers():
33
+ yield build_tar_member(file, entry)