datachain 0.6.1__tar.gz → 0.6.2__tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

Files changed (262)
  1. {datachain-0.6.1 → datachain-0.6.2}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.6.1/src/datachain.egg-info → datachain-0.6.2}/PKG-INFO +7 -6
  3. {datachain-0.6.1 → datachain-0.6.2}/pyproject.toml +7 -6
  4. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/catalog/catalog.py +61 -219
  5. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/cli.py +136 -22
  6. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/client/fsspec.py +9 -0
  7. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/client/local.py +11 -32
  8. datachain-0.6.2/src/datachain/config.py +137 -0
  9. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/data_storage/schema.py +66 -33
  10. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/data_storage/sqlite.py +4 -4
  11. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/data_storage/warehouse.py +101 -125
  12. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/dc.py +211 -52
  13. datachain-0.6.2/src/datachain/lib/func/__init__.py +32 -0
  14. datachain-0.6.2/src/datachain/lib/func/aggregate.py +353 -0
  15. datachain-0.6.2/src/datachain/lib/func/func.py +152 -0
  16. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/listing.py +6 -21
  17. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/listing_info.py +4 -0
  18. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/signal_schema.py +8 -5
  19. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/udf.py +3 -3
  20. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/listing.py +22 -48
  21. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/query/dataset.py +11 -3
  22. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/remote/studio.py +63 -14
  23. datachain-0.6.2/src/datachain/studio.py +129 -0
  24. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/utils.py +58 -0
  25. {datachain-0.6.1 → datachain-0.6.2/src/datachain.egg-info}/PKG-INFO +7 -6
  26. {datachain-0.6.1 → datachain-0.6.2}/src/datachain.egg-info/SOURCES.txt +4 -0
  27. {datachain-0.6.1 → datachain-0.6.2}/src/datachain.egg-info/requires.txt +6 -7
  28. {datachain-0.6.1 → datachain-0.6.2}/tests/conftest.py +39 -15
  29. {datachain-0.6.1 → datachain-0.6.2}/tests/examples/test_wds_e2e.py +1 -2
  30. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_catalog.py +71 -57
  31. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_datachain.py +191 -26
  32. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_dataset_query.py +5 -5
  33. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_datasets.py +3 -4
  34. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_listing.py +2 -10
  35. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_ls.py +47 -72
  36. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_pull.py +32 -41
  37. {datachain-0.6.1 → datachain-0.6.2}/tests/test_cli_e2e.py +0 -1
  38. datachain-0.6.2/tests/test_cli_studio.py +120 -0
  39. {datachain-0.6.1 → datachain-0.6.2}/tests/test_query_e2e.py +0 -1
  40. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_datachain.py +287 -10
  41. datachain-0.6.2/tests/unit/lib/test_listing_info.py +34 -0
  42. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_cli_parsing.py +1 -2
  43. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_client.py +16 -34
  44. datachain-0.6.2/tests/unit/test_config.py +174 -0
  45. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_data_storage.py +30 -40
  46. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_listing.py +22 -42
  47. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_utils.py +47 -0
  48. {datachain-0.6.1 → datachain-0.6.2}/tests/utils.py +0 -15
  49. datachain-0.6.1/src/datachain/config.py +0 -62
  50. datachain-0.6.1/src/datachain/lib/func/__init__.py +0 -14
  51. datachain-0.6.1/src/datachain/lib/func/aggregate.py +0 -42
  52. datachain-0.6.1/src/datachain/lib/func/func.py +0 -64
  53. {datachain-0.6.1 → datachain-0.6.2}/.cruft.json +0 -0
  54. {datachain-0.6.1 → datachain-0.6.2}/.gitattributes +0 -0
  55. {datachain-0.6.1 → datachain-0.6.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  56. {datachain-0.6.1 → datachain-0.6.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  57. {datachain-0.6.1 → datachain-0.6.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  58. {datachain-0.6.1 → datachain-0.6.2}/.github/codecov.yaml +0 -0
  59. {datachain-0.6.1 → datachain-0.6.2}/.github/dependabot.yml +0 -0
  60. {datachain-0.6.1 → datachain-0.6.2}/.github/workflows/benchmarks.yml +0 -0
  61. {datachain-0.6.1 → datachain-0.6.2}/.github/workflows/release.yml +0 -0
  62. {datachain-0.6.1 → datachain-0.6.2}/.github/workflows/tests-studio.yml +0 -0
  63. {datachain-0.6.1 → datachain-0.6.2}/.github/workflows/tests.yml +0 -0
  64. {datachain-0.6.1 → datachain-0.6.2}/.github/workflows/update-template.yaml +0 -0
  65. {datachain-0.6.1 → datachain-0.6.2}/.gitignore +0 -0
  66. {datachain-0.6.1 → datachain-0.6.2}/CODE_OF_CONDUCT.rst +0 -0
  67. {datachain-0.6.1 → datachain-0.6.2}/CONTRIBUTING.rst +0 -0
  68. {datachain-0.6.1 → datachain-0.6.2}/LICENSE +0 -0
  69. {datachain-0.6.1 → datachain-0.6.2}/README.rst +0 -0
  70. {datachain-0.6.1 → datachain-0.6.2}/docs/assets/captioned_cartoons.png +0 -0
  71. {datachain-0.6.1 → datachain-0.6.2}/docs/assets/datachain-white.svg +0 -0
  72. {datachain-0.6.1 → datachain-0.6.2}/docs/assets/datachain.svg +0 -0
  73. {datachain-0.6.1 → datachain-0.6.2}/docs/assets/flowchart.png +0 -0
  74. {datachain-0.6.1 → datachain-0.6.2}/docs/index.md +0 -0
  75. {datachain-0.6.1 → datachain-0.6.2}/docs/references/datachain.md +0 -0
  76. {datachain-0.6.1 → datachain-0.6.2}/docs/references/datatype.md +0 -0
  77. {datachain-0.6.1 → datachain-0.6.2}/docs/references/file.md +0 -0
  78. {datachain-0.6.1 → datachain-0.6.2}/docs/references/index.md +0 -0
  79. {datachain-0.6.1 → datachain-0.6.2}/docs/references/sql.md +0 -0
  80. {datachain-0.6.1 → datachain-0.6.2}/docs/references/torch.md +0 -0
  81. {datachain-0.6.1 → datachain-0.6.2}/docs/references/udf.md +0 -0
  82. {datachain-0.6.1 → datachain-0.6.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  83. {datachain-0.6.1 → datachain-0.6.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  84. {datachain-0.6.1 → datachain-0.6.2}/examples/computer_vision/openimage-detect.py +0 -0
  85. {datachain-0.6.1 → datachain-0.6.2}/examples/get_started/common_sql_functions.py +0 -0
  86. {datachain-0.6.1 → datachain-0.6.2}/examples/get_started/json-csv-reader.py +0 -0
  87. {datachain-0.6.1 → datachain-0.6.2}/examples/get_started/torch-loader.py +0 -0
  88. {datachain-0.6.1 → datachain-0.6.2}/examples/get_started/udfs/parallel.py +0 -0
  89. {datachain-0.6.1 → datachain-0.6.2}/examples/get_started/udfs/simple.py +0 -0
  90. {datachain-0.6.1 → datachain-0.6.2}/examples/get_started/udfs/stateful.py +0 -0
  91. {datachain-0.6.1 → datachain-0.6.2}/examples/llm_and_nlp/claude-query.py +0 -0
  92. {datachain-0.6.1 → datachain-0.6.2}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  93. {datachain-0.6.1 → datachain-0.6.2}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  94. {datachain-0.6.1 → datachain-0.6.2}/examples/multimodal/clip_inference.py +0 -0
  95. {datachain-0.6.1 → datachain-0.6.2}/examples/multimodal/hf_pipeline.py +0 -0
  96. {datachain-0.6.1 → datachain-0.6.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
  97. {datachain-0.6.1 → datachain-0.6.2}/examples/multimodal/wds.py +0 -0
  98. {datachain-0.6.1 → datachain-0.6.2}/examples/multimodal/wds_filtered.py +0 -0
  99. {datachain-0.6.1 → datachain-0.6.2}/mkdocs.yml +0 -0
  100. {datachain-0.6.1 → datachain-0.6.2}/noxfile.py +0 -0
  101. {datachain-0.6.1 → datachain-0.6.2}/overrides/main.html +0 -0
  102. {datachain-0.6.1 → datachain-0.6.2}/setup.cfg +0 -0
  103. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/__init__.py +0 -0
  104. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/__main__.py +0 -0
  105. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/asyn.py +0 -0
  106. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/cache.py +0 -0
  107. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/catalog/__init__.py +0 -0
  108. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/catalog/datasource.py +0 -0
  109. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/catalog/loader.py +0 -0
  110. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/cli_utils.py +0 -0
  111. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/client/__init__.py +0 -0
  112. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/client/azure.py +0 -0
  113. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/client/fileslice.py +0 -0
  114. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/client/gcs.py +0 -0
  115. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/client/hf.py +0 -0
  116. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/client/s3.py +0 -0
  117. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/data_storage/__init__.py +0 -0
  118. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/data_storage/db_engine.py +0 -0
  119. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/data_storage/id_generator.py +0 -0
  120. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/data_storage/job.py +0 -0
  121. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/data_storage/metastore.py +0 -0
  122. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/data_storage/serializer.py +0 -0
  123. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/dataset.py +0 -0
  124. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/error.py +0 -0
  125. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/job.py +0 -0
  126. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/__init__.py +0 -0
  127. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/arrow.py +0 -0
  128. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/clip.py +0 -0
  129. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/convert/__init__.py +0 -0
  130. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/convert/flatten.py +0 -0
  131. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
  132. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
  133. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/convert/unflatten.py +0 -0
  134. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  135. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/data_model.py +0 -0
  136. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/dataset_info.py +0 -0
  137. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/file.py +0 -0
  138. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/hf.py +0 -0
  139. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/image.py +0 -0
  140. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/meta_formats.py +0 -0
  141. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/model_store.py +0 -0
  142. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/pytorch.py +0 -0
  143. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/settings.py +0 -0
  144. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/tar.py +0 -0
  145. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/text.py +0 -0
  146. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/udf_signature.py +0 -0
  147. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/utils.py +0 -0
  148. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/vfile.py +0 -0
  149. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/webdataset.py +0 -0
  150. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/lib/webdataset_laion.py +0 -0
  151. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/node.py +0 -0
  152. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/nodes_fetcher.py +0 -0
  153. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/nodes_thread_pool.py +0 -0
  154. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/progress.py +0 -0
  155. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/py.typed +0 -0
  156. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/query/__init__.py +0 -0
  157. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/query/batch.py +0 -0
  158. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/query/dispatch.py +0 -0
  159. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/query/metrics.py +0 -0
  160. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/query/params.py +0 -0
  161. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/query/queue.py +0 -0
  162. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/query/schema.py +0 -0
  163. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/query/session.py +0 -0
  164. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/remote/__init__.py +0 -0
  165. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/__init__.py +0 -0
  166. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/default/__init__.py +0 -0
  167. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/default/base.py +0 -0
  168. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/functions/__init__.py +0 -0
  169. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/functions/aggregate.py +0 -0
  170. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/functions/array.py +0 -0
  171. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/functions/conditional.py +0 -0
  172. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/functions/path.py +0 -0
  173. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/functions/random.py +0 -0
  174. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/functions/string.py +0 -0
  175. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/selectable.py +0 -0
  176. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/sqlite/__init__.py +0 -0
  177. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/sqlite/base.py +0 -0
  178. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/sqlite/types.py +0 -0
  179. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/sqlite/vector.py +0 -0
  180. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/types.py +0 -0
  181. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/sql/utils.py +0 -0
  182. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/storage.py +0 -0
  183. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/telemetry.py +0 -0
  184. {datachain-0.6.1 → datachain-0.6.2}/src/datachain/torch/__init__.py +0 -0
  185. {datachain-0.6.1 → datachain-0.6.2}/src/datachain.egg-info/dependency_links.txt +0 -0
  186. {datachain-0.6.1 → datachain-0.6.2}/src/datachain.egg-info/entry_points.txt +0 -0
  187. {datachain-0.6.1 → datachain-0.6.2}/src/datachain.egg-info/top_level.txt +0 -0
  188. {datachain-0.6.1 → datachain-0.6.2}/tests/__init__.py +0 -0
  189. {datachain-0.6.1 → datachain-0.6.2}/tests/benchmarks/__init__.py +0 -0
  190. {datachain-0.6.1 → datachain-0.6.2}/tests/benchmarks/conftest.py +0 -0
  191. {datachain-0.6.1 → datachain-0.6.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  192. {datachain-0.6.1 → datachain-0.6.2}/tests/benchmarks/datasets/.dvc/config +0 -0
  193. {datachain-0.6.1 → datachain-0.6.2}/tests/benchmarks/datasets/.gitignore +0 -0
  194. {datachain-0.6.1 → datachain-0.6.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  195. {datachain-0.6.1 → datachain-0.6.2}/tests/benchmarks/test_datachain.py +0 -0
  196. {datachain-0.6.1 → datachain-0.6.2}/tests/benchmarks/test_ls.py +0 -0
  197. {datachain-0.6.1 → datachain-0.6.2}/tests/benchmarks/test_version.py +0 -0
  198. {datachain-0.6.1 → datachain-0.6.2}/tests/data.py +0 -0
  199. {datachain-0.6.1 → datachain-0.6.2}/tests/examples/__init__.py +0 -0
  200. {datachain-0.6.1 → datachain-0.6.2}/tests/examples/test_examples.py +0 -0
  201. {datachain-0.6.1 → datachain-0.6.2}/tests/examples/wds_data.py +0 -0
  202. {datachain-0.6.1 → datachain-0.6.2}/tests/func/__init__.py +0 -0
  203. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_client.py +0 -0
  204. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_feature_pickling.py +0 -0
  205. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_meta_formats.py +0 -0
  206. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_metrics.py +0 -0
  207. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_pytorch.py +0 -0
  208. {datachain-0.6.1 → datachain-0.6.2}/tests/func/test_query.py +0 -0
  209. {datachain-0.6.1 → datachain-0.6.2}/tests/scripts/feature_class.py +0 -0
  210. {datachain-0.6.1 → datachain-0.6.2}/tests/scripts/feature_class_exception.py +0 -0
  211. {datachain-0.6.1 → datachain-0.6.2}/tests/scripts/feature_class_parallel.py +0 -0
  212. {datachain-0.6.1 → datachain-0.6.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  213. {datachain-0.6.1 → datachain-0.6.2}/tests/scripts/name_len_slow.py +0 -0
  214. {datachain-0.6.1 → datachain-0.6.2}/tests/test_atomicity.py +0 -0
  215. {datachain-0.6.1 → datachain-0.6.2}/tests/test_telemetry.py +0 -0
  216. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/__init__.py +0 -0
  217. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/__init__.py +0 -0
  218. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/conftest.py +0 -0
  219. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_arrow.py +0 -0
  220. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_clip.py +0 -0
  221. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  222. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_datachain_merge.py +0 -0
  223. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_feature.py +0 -0
  224. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_feature_utils.py +0 -0
  225. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_file.py +0 -0
  226. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_hf.py +0 -0
  227. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_image.py +0 -0
  228. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_schema.py +0 -0
  229. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_signal_schema.py +0 -0
  230. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_sql_to_python.py +0 -0
  231. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_text.py +0 -0
  232. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_udf_signature.py +0 -0
  233. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_utils.py +0 -0
  234. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/lib/test_webdataset.py +0 -0
  235. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/sql/__init__.py +0 -0
  236. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/sql/sqlite/__init__.py +0 -0
  237. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
  238. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/sql/test_array.py +0 -0
  239. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/sql/test_conditional.py +0 -0
  240. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/sql/test_path.py +0 -0
  241. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/sql/test_random.py +0 -0
  242. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/sql/test_selectable.py +0 -0
  243. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/sql/test_string.py +0 -0
  244. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_asyn.py +0 -0
  245. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_cache.py +0 -0
  246. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_catalog.py +0 -0
  247. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_catalog_loader.py +0 -0
  248. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_client_s3.py +0 -0
  249. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_database_engine.py +0 -0
  250. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_dataset.py +0 -0
  251. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_dispatch.py +0 -0
  252. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_fileslice.py +0 -0
  253. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_id_generator.py +0 -0
  254. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_metastore.py +0 -0
  255. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_module_exports.py +0 -0
  256. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_query.py +0 -0
  257. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_query_metrics.py +0 -0
  258. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_query_params.py +0 -0
  259. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_serializer.py +0 -0
  260. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_session.py +0 -0
  261. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_storage.py +0 -0
  262. {datachain-0.6.1 → datachain-0.6.2}/tests/unit/test_warehouse.py +0 -0
@@ -24,7 +24,7 @@ repos:
       - id: trailing-whitespace
         exclude: '^LICENSES/'
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.6.9'
+    rev: 'v0.7.0'
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.6.1
+Version: 0.6.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -19,8 +19,7 @@ License-File: LICENSE
 Requires-Dist: pyyaml
 Requires-Dist: tomlkit
 Requires-Dist: tqdm
-Requires-Dist: numpy
-Requires-Dist: numpy<2,>=1; sys_platform == "win32"
+Requires-Dist: numpy<3,>=1
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyarrow
 Requires-Dist: typing-extensions
@@ -39,11 +38,13 @@ Requires-Dist: orjson>=3.10.5
 Requires-Dist: pydantic<3,>=2
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
-Requires-Dist: Pillow<11,>=10.0.0
+Requires-Dist: Pillow<12,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
 Requires-Dist: psutil
 Requires-Dist: huggingface_hub
 Requires-Dist: iterative-telemetry>=0.0.9
+Requires-Dist: platformdirs
+Requires-Dist: dvc-studio-client<1,>=0.21
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -81,7 +82,7 @@ Requires-Dist: requests-mock; extra == "tests"
 Requires-Dist: scipy; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.12.0; extra == "dev"
+Requires-Dist: mypy==1.12.1; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
@@ -91,7 +92,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
 Requires-Dist: numpy<2,>=1; extra == "examples"
 Requires-Dist: defusedxml; extra == "examples"
 Requires-Dist: accelerate; extra == "examples"
-Requires-Dist: unstructured[embed-huggingface,pdf]; extra == "examples"
+Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
@@ -22,8 +22,7 @@ dependencies = [
   "pyyaml",
   "tomlkit",
   "tqdm",
-  "numpy",
-  'numpy>=1,<2; sys_platform == "win32"',
+  "numpy>=1,<3",
   "pandas>=2.0.0",
   "pyarrow",
   "typing-extensions",
@@ -42,11 +41,13 @@ dependencies = [
   "pydantic>=2,<3",
   "jmespath>=1.0",
   "datamodel-code-generator>=0.25",
-  "Pillow>=10.0.0,<11",
+  "Pillow>=10.0.0,<12",
   "msgpack>=1.0.4,<2",
   "psutil",
   "huggingface_hub",
-  "iterative-telemetry>=0.0.9"
+  "iterative-telemetry>=0.0.9",
+  "platformdirs",
+  "dvc-studio-client>=0.21,<1"
 ]

 [project.optional-dependencies]
@@ -93,7 +94,7 @@ tests = [
 ]
 dev = [
   "datachain[docs,tests]",
-  "mypy==1.12.0",
+  "mypy==1.12.1",
   "types-python-dateutil",
   "types-pytz",
   "types-PyYAML",
@@ -104,7 +105,7 @@ examples = [
   "numpy>=1,<2",
   "defusedxml",
   "accelerate",
-  "unstructured[pdf, embed-huggingface]",
+  "unstructured[pdf,embed-huggingface]<0.16.0",
   "pdfplumber==0.11.4",
   "huggingface_hub[hf_transfer]",
   "onnx==1.16.1"
@@ -1,4 +1,3 @@
-import glob
 import io
 import json
 import logging
@@ -35,7 +34,6 @@ from tqdm import tqdm

 from datachain.cache import DataChainCache
 from datachain.client import Client
-from datachain.config import get_remote_config, read_config
 from datachain.dataset import (
     DATASET_PREFIX,
     QUERY_DATASET_PREFIX,
@@ -48,12 +46,10 @@ from datachain.dataset import (
     parse_dataset_uri,
 )
 from datachain.error import (
-    ClientError,
     DataChainError,
     DatasetInvalidVersionError,
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
-    PendingIndexingError,
     QueryScriptCancelError,
     QueryScriptRunError,
 )
@@ -61,8 +57,8 @@ from datachain.listing import Listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
-from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String
-from datachain.storage import Storage, StorageStatus, StorageURI
+from datachain.sql.types import DateTime, SQLType, String
+from datachain.storage import StorageURI
 from datachain.utils import (
     DataChainDir,
     batched,
@@ -102,7 +98,7 @@ PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be av
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio


-def _raise_remote_error(error_message: str) -> NoReturn:
+def raise_remote_error(error_message: str) -> NoReturn:
     raise DataChainError(f"Error from server: {error_message}")


@@ -130,7 +126,6 @@ class DatasetRowsFetcher(NodesThreadPool):
         self,
         metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
-        remote_config: dict[str, Any],
         dataset_name: str,
         dataset_version: int,
         schema: dict[str, Union[SQLType, type[SQLType]]],
@@ -144,10 +139,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         self.dataset_version = dataset_version
         self.schema = schema
         self.last_status_check: Optional[float] = None
-
-        self.studio_client = StudioClient(
-            remote_config["url"], remote_config["username"], remote_config["token"]
-        )
+        self.studio_client = StudioClient()

     def done_task(self, done):
         for task in done:
@@ -181,14 +173,14 @@ class DatasetRowsFetcher(NodesThreadPool):
             self.dataset_name, self.dataset_version
         )
         if not export_status_response.ok:
-            _raise_remote_error(export_status_response.message)
+            raise_remote_error(export_status_response.message)

         export_status = export_status_response.data["status"]  # type: ignore [index]

         if export_status == "failed":
-            _raise_remote_error("Dataset export failed in Studio")
+            raise_remote_error("Dataset export failed in Studio")
         if export_status == "removed":
-            _raise_remote_error("Dataset export removed in Studio")
+            raise_remote_error("Dataset export removed in Studio")

         self.last_status_check = time.time()

@@ -483,17 +475,12 @@ def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
         if not node_group.sources:
             continue
         listing: Listing = node_group.listing
-        source_path: str = node_group.source_path
-        if not node_group.is_dataset:
-            assert listing.storage
-            data_source = listing.storage.to_dict(source_path)
-        else:
-            data_source = {"uri": listing.metastore.uri}
-
-        metafile_group = {"data-source": data_source, "files": []}
+        metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
         for node in node_group.instantiated_nodes:
             if not node.n.is_dir:
-                metafile_group["files"].append(node.get_metafile_data())
+                metafile_group["files"].append(  # type: ignore [attr-defined]
+                    node.get_metafile_data()
+                )
         if metafile_group["files"]:
             metafile_data.append(metafile_group)

@@ -569,6 +556,12 @@ class Catalog:

         return self._warehouse

+    @cached_property
+    def session(self):
+        from datachain.query.session import Session
+
+        return Session.get(catalog=self)
+
     def get_init_params(self) -> dict[str, Any]:
         return {
             **self._init_params,
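For context on the addition above: `Catalog.session` is a `functools.cached_property`, so the `Session` is created lazily on first access and then reused for the lifetime of the catalog object. A minimal, self-contained sketch of that caching behaviour (the class below is illustrative only, not datachain code):

    from functools import cached_property

    class Example:
        @cached_property
        def session(self):
            # runs only once; the result is stored on the instance
            print("creating session")
            return object()

    e = Example()
    assert e.session is e.session  # the second access returns the cached object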
@@ -599,162 +592,29 @@ class Catalog:
     def enlist_source(
         self,
         source: str,
-        ttl: int,
-        force_update=False,
-        skip_indexing=False,
+        update=False,
         client_config=None,
+        object_name="file",
+        skip_indexing=False,
     ) -> tuple[Listing, str]:
-        if force_update and skip_indexing:
-            raise ValueError(
-                "Both force_update and skip_indexing flags"
-                " cannot be True at the same time"
-            )
-
-        partial_id: Optional[int]
-        partial_path: Optional[str]
+        from datachain.lib.dc import DataChain

-        client_config = client_config or self.client_config
-        uri, path = Client.parse_url(source)
-        client = Client.get_client(source, self.cache, **client_config)
-        stem = os.path.basename(os.path.normpath(path))
-        prefix = (
-            posixpath.dirname(path)
-            if glob.has_magic(stem) or client.fs.isfile(source)
-            else path
+        DataChain.from_storage(
+            source, session=self.session, update=update, object_name=object_name
         )
-        storage_dataset_name = Storage.dataset_name(uri, posixpath.join(prefix, ""))
-        source_metastore = self.metastore.clone(uri)
-
-        columns = [
-            Column("path", String),
-            Column("etag", String),
-            Column("version", String),
-            Column("is_latest", Boolean),
-            Column("last_modified", DateTime(timezone=True)),
-            Column("size", Int64),
-            Column("location", JSON),
-            Column("source", String),
-        ]
-
-        if skip_indexing:
-            source_metastore.create_storage_if_not_registered(uri)
-            storage = source_metastore.get_storage(uri)
-            source_metastore.init_partial_id(uri)
-            partial_id = source_metastore.get_next_partial_id(uri)
-
-            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
-            source_metastore.init(uri)
-
-            source_warehouse = self.warehouse.clone()
-            dataset = self.create_dataset(
-                storage_dataset_name, columns=columns, listing=True
-            )
-
-            return (
-                Listing(storage, source_metastore, source_warehouse, client, dataset),
-                path,
-            )
-
-        (
-            storage,
-            need_index,
-            in_progress,
-            partial_id,
-            partial_path,
-        ) = source_metastore.register_storage_for_indexing(uri, force_update, prefix)
-        if in_progress:
-            raise PendingIndexingError(f"Pending indexing operation: uri={storage.uri}")
-
-        if not need_index:
-            assert partial_id is not None
-            assert partial_path is not None
-            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
-            source_warehouse = self.warehouse.clone()
-            dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
-            lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
-            logger.debug(
-                "Using cached listing %s. Valid till: %s",
-                storage.uri,
-                storage.expires_to_local,
-            )
-            # Listing has to have correct version of data storage
-            # initialized with correct Storage
-
-            self.update_dataset_version_with_warehouse_info(
-                dataset,
-                dataset.latest_version,
-            )
-
-            return lst, path

-        source_metastore.init_partial_id(uri)
-        partial_id = source_metastore.get_next_partial_id(uri)
-
-        source_metastore.init(uri)
-        source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
-
-        source_warehouse = self.warehouse.clone()
-
-        dataset = self.create_dataset(
-            storage_dataset_name, columns=columns, listing=True
+        list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(
+            source, self.session, update=update
         )

-        lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
-
-        try:
-            lst.fetch(prefix)
-
-            source_metastore.mark_storage_indexed(
-                storage.uri,
-                StorageStatus.PARTIAL if prefix else StorageStatus.COMPLETE,
-                ttl,
-                prefix=prefix,
-                partial_id=partial_id,
-                dataset=dataset,
-            )
-
-            self.update_dataset_version_with_warehouse_info(
-                dataset,
-                dataset.latest_version,
-            )
-
-        except ClientError as e:
-            # for handling cloud errors
-            error_message = INDEX_INTERNAL_ERROR_MESSAGE
-            if e.error_code in ["InvalidAccessKeyId", "SignatureDoesNotMatch"]:
-                error_message = "Invalid cloud credentials"
-
-            source_metastore.mark_storage_indexed(
-                storage.uri,
-                StorageStatus.FAILED,
-                ttl,
-                prefix=prefix,
-                error_message=error_message,
-                error_stack=traceback.format_exc(),
-                dataset=dataset,
-            )
-            self._remove_dataset_rows_and_warehouse_info(
-                dataset, dataset.latest_version
-            )
-            raise
-        except:
-            source_metastore.mark_storage_indexed(
-                storage.uri,
-                StorageStatus.FAILED,
-                ttl,
-                prefix=prefix,
-                error_message=INDEX_INTERNAL_ERROR_MESSAGE,
-                error_stack=traceback.format_exc(),
-                dataset=dataset,
-            )
-            self._remove_dataset_rows_and_warehouse_info(
-                dataset, dataset.latest_version
-            )
-            raise
-
-        lst.storage = storage
+        lst = Listing(
+            self.warehouse.clone(),
+            Client.get_client(list_uri, self.cache, **self.client_config),
+            self.get_dataset(list_ds_name),
+            object_name=object_name,
+        )

-        return lst, path
+        return lst, list_path

     def _remove_dataset_rows_and_warehouse_info(
         self, dataset: DatasetRecord, version: int, **kwargs
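To summarize the rewrite above: `enlist_source` no longer performs Storage registration, partial-id bookkeeping, or its own indexing error handling; it delegates listing to the DataChain machinery and then wraps the resulting listing dataset. A condensed sketch of that flow, using only the calls visible in the hunk (the `catalog` argument is assumed to be an initialized `Catalog`):

    from datachain.client import Client
    from datachain.lib.dc import DataChain
    from datachain.listing import Listing

    def enlist(catalog, source: str, update: bool = False, object_name: str = "file"):
        # Create or refresh the listing dataset for this storage URI.
        DataChain.from_storage(
            source, session=catalog.session, update=update, object_name=object_name
        )
        # Resolve the listing dataset name, the storage URI and the relative path.
        list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(
            source, catalog.session, update=update
        )
        # Wrap the cached listing dataset in a Listing object.
        lst = Listing(
            catalog.warehouse.clone(),
            Client.get_client(list_uri, catalog.cache, **catalog.client_config),
            catalog.get_dataset(list_ds_name),
            object_name=object_name,
        )
        return lst, list_path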
@@ -770,7 +630,6 @@ class Catalog:
     def enlist_sources(
         self,
         sources: list[str],
-        ttl: int,
         update: bool,
         skip_indexing=False,
         client_config=None,
@@ -780,10 +639,9 @@
         for src in sources:  # Opt: parallel
             listing, file_path = self.enlist_source(
                 src,
-                ttl,
                 update,
-                skip_indexing=skip_indexing,
                 client_config=client_config or self.client_config,
+                skip_indexing=skip_indexing,
             )
             enlisted_sources.append((listing, file_path))

@@ -802,7 +660,6 @@ class Catalog:
     def enlist_sources_grouped(
         self,
         sources: list[str],
-        ttl: int,
         update: bool,
         no_glob: bool = False,
         client_config=None,
@@ -823,7 +680,6 @@
         for ds in edatachain_data:
             listing, source_path = self.enlist_source(
                 ds["data-source"]["uri"],
-                ttl,
                 update,
                 client_config=client_config,
             )
@@ -843,11 +699,13 @@
                 )
                 indexed_sources = []
                 for source in dataset_sources:
+                    from datachain.lib.dc import DataChain
+
                     client = self.get_client(source, **client_config)
                     uri = client.uri
-                    ms = self.metastore.clone(uri, None)
                     st = self.warehouse.clone()
-                    listing = Listing(None, ms, st, client, None)
+                    dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
+                    listing = Listing(st, client, self.get_dataset(dataset_name))
                     rows = DatasetQuery(
                         name=dataset.name, version=ds_version, catalog=self
                     ).to_db_records()
@@ -864,7 +722,7 @@
                 enlisted_sources.append((False, True, indexed_sources))
             else:
                 listing, source_path = self.enlist_source(
-                    src, ttl, update, client_config=client_config
+                    src, update, client_config=client_config
                 )
                 enlisted_sources.append((False, False, (listing, source_path)))

@@ -1115,19 +973,16 @@ class Catalog:
             raise ValueError("Sources needs to be non empty list")

         from datachain.lib.dc import DataChain
-        from datachain.query.session import Session
-
-        session = Session.get(catalog=self, client_config=client_config)

         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
                 dc = DataChain.from_dataset(
-                    source[len(DATASET_PREFIX) :], session=session
+                    source[len(DATASET_PREFIX) :], session=self.session
                 )
             else:
                 dc = DataChain.from_storage(
-                    source, session=session, recursive=recursive
+                    source, session=self.session, recursive=recursive
                 )

             chains.append(dc)
@@ -1239,17 +1094,12 @@ class Catalog:
     def get_dataset(self, name: str) -> DatasetRecord:
         return self.metastore.get_dataset(name)

-    def get_remote_dataset(self, name: str, *, remote_config=None) -> DatasetRecord:
-        remote_config = remote_config or get_remote_config(
-            read_config(DataChainDir.find().root), remote=""
-        )
-        studio_client = StudioClient(
-            remote_config["url"], remote_config["username"], remote_config["token"]
-        )
+    def get_remote_dataset(self, name: str) -> DatasetRecord:
+        studio_client = StudioClient()

         info_response = studio_client.dataset_info(name)
         if not info_response.ok:
-            _raise_remote_error(info_response.message)
+            raise_remote_error(info_response.message)

         dataset_info = info_response.data
         assert isinstance(dataset_info, dict)
@@ -1306,6 +1156,20 @@ class Catalog:
             for v in d.versions
         )

+    def listings(self):
+        """
+        Returns list of ListingInfo objects which are representing specific
+        storage listing datasets
+        """
+        from datachain.lib.listing import is_listing_dataset
+        from datachain.lib.listing_info import ListingInfo
+
+        return [
+            ListingInfo.from_models(d, v, j)
+            for d, v, j in self.list_datasets_versions(include_listing=True)
+            if is_listing_dataset(d.name)
+        ]
+
     def ls_dataset_rows(
         self, name: str, version: int, offset=None, limit=None
     ) -> list[dict]:
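A brief usage sketch for the new `listings()` helper above: it filters the catalog's dataset/version/job triples down to listing datasets and wraps each in a `ListingInfo`. The helper function below is hypothetical; the attributes available on `ListingInfo` are not shown in this diff, so only the objects themselves are printed:

    from datachain.catalog import Catalog

    def print_cached_listings(catalog: Catalog) -> None:
        # Each entry is a ListingInfo built from a (dataset, version, job) triple.
        for info in catalog.listings():
            print(info)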
@@ -1430,7 +1294,6 @@ class Catalog:
         self,
         sources: list[str],
         fields: Iterable[str],
-        ttl=TTL_INT,
         update=False,
         skip_indexing=False,
         *,
@@ -1438,7 +1301,6 @@
     ) -> Iterator[tuple[DataSource, Iterable[tuple]]]:
         data_sources = self.enlist_sources(
             sources,
-            ttl,
             update,
             skip_indexing=skip_indexing,
             client_config=client_config or self.client_config,
@@ -1457,7 +1319,6 @@
         edatachain_file: Optional[str] = None,
         *,
         client_config=None,
-        remote_config=None,
     ) -> None:
         # TODO add progress bar https://github.com/iterative/dvcx/issues/750
         # TODO copy correct remote dates https://github.com/iterative/dvcx/issues/new
@@ -1479,13 +1340,8 @@
             raise ValueError("Please provide output directory for instantiation")

         client_config = client_config or self.client_config
-        remote_config = remote_config or get_remote_config(
-            read_config(DataChainDir.find().root), remote=""
-        )

-        studio_client = StudioClient(
-            remote_config["url"], remote_config["username"], remote_config["token"]
-        )
+        studio_client = StudioClient()

         try:
             remote_dataset_name, version = parse_dataset_uri(dataset_uri)
@@ -1499,9 +1355,7 @@
             # we will create new one if it doesn't exist
             pass

-        remote_dataset = self.get_remote_dataset(
-            remote_dataset_name, remote_config=remote_config
-        )
+        remote_dataset = self.get_remote_dataset(remote_dataset_name)
         # if version is not specified in uri, take the latest one
         if not version:
             version = remote_dataset.latest_version
@@ -1526,7 +1380,7 @@

         stats_response = studio_client.dataset_stats(remote_dataset_name, version)
         if not stats_response.ok:
-            _raise_remote_error(stats_response.message)
+            raise_remote_error(stats_response.message)
         dataset_stats = stats_response.data

         dataset_save_progress_bar = tqdm(
@@ -1558,7 +1412,7 @@
             remote_dataset_name, version
         )
         if not export_response.ok:
-            _raise_remote_error(export_response.message)
+            raise_remote_error(export_response.message)

         signed_urls = export_response.data

@@ -1572,7 +1426,6 @@
             rows_fetcher = DatasetRowsFetcher(
                 metastore,
                 warehouse,
-                remote_config,
                 dataset.name,
                 version,
                 schema,
@@ -1615,7 +1468,6 @@
         no_cp: bool = False,
         edatachain: bool = False,
         edatachain_file: Optional[str] = None,
-        ttl: int = TTL_INT,
         *,
         client_config=None,
     ) -> None:
@@ -1637,7 +1489,6 @@
                 edatachain_only=no_cp,
                 no_edatachain_file=not edatachain,
                 edatachain_file=edatachain_file,
-                ttl=ttl,
                 client_config=client_config,
             )
         else:
@@ -1645,7 +1496,6 @@
             # it needs to be done here
             self.enlist_sources(
                 sources,
-                ttl,
                 update,
                 client_config=client_config or self.client_config,
             )
@@ -1705,7 +1555,6 @@ class Catalog:
         edatachain_only: bool = False,
         no_edatachain_file: bool = False,
         no_glob: bool = False,
-        ttl: int = TTL_INT,
         *,
         client_config=None,
     ) -> list[dict[str, Any]]:
@@ -1717,7 +1566,6 @@
         client_config = client_config or self.client_config
         node_groups = self.enlist_sources_grouped(
             sources,
-            ttl,
             update,
             no_glob,
             client_config=client_config,
@@ -1776,14 +1624,12 @@
         self,
         sources,
         depth=0,
-        ttl=TTL_INT,
         update=False,
         *,
         client_config=None,
     ) -> Iterable[tuple[str, float]]:
         sources = self.enlist_sources(
             sources,
-            ttl,
             update,
             client_config=client_config or self.client_config,
         )
@@ -1804,7 +1650,6 @@ class Catalog:
     def find(
         self,
         sources,
-        ttl=TTL_INT,
         update=False,
         names=None,
         inames=None,
@@ -1818,7 +1663,6 @@
     ) -> Iterator[str]:
         sources = self.enlist_sources(
             sources,
-            ttl,
             update,
             client_config=client_config or self.client_config,
         )
@@ -1854,7 +1698,6 @@ class Catalog:
     def index(
         self,
         sources,
-        ttl=TTL_INT,
         update=False,
         *,
         client_config=None,
@@ -1880,7 +1723,6 @@ class Catalog:

         self.enlist_sources(
             non_root_sources,
-            ttl,
             update,
             client_config=client_config,
             only_index=True,