datachain 0.3.17__tar.gz → 0.3.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (247) hide show
  1. {datachain-0.3.17/src/datachain.egg-info → datachain-0.3.18}/PKG-INFO +5 -3
  2. datachain-0.3.18/examples/llm_and_nlp/unstructured-embeddings-gen.py +76 -0
  3. datachain-0.3.17/examples/llm_and_nlp/unstructured-text.py → datachain-0.3.18/examples/llm_and_nlp/unstructured-summary-map.py +7 -3
  4. {datachain-0.3.17 → datachain-0.3.18}/examples/multimodal/hf_pipeline.py +7 -1
  5. {datachain-0.3.17 → datachain-0.3.18}/examples/multimodal/openai_image_desc_lib.py +0 -2
  6. {datachain-0.3.17 → datachain-0.3.18}/pyproject.toml +6 -4
  7. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/cache.py +14 -55
  8. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/catalog/catalog.py +8 -18
  9. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/cli.py +7 -1
  10. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/fsspec.py +29 -63
  11. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/local.py +2 -3
  12. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/arrow.py +2 -1
  13. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/dc.py +4 -0
  14. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/file.py +41 -23
  15. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/listing.py +2 -0
  16. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/listing.py +4 -4
  17. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/node.py +6 -6
  18. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/nodes_fetcher.py +12 -5
  19. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/nodes_thread_pool.py +1 -1
  20. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/progress.py +2 -12
  21. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/dataset.py +6 -18
  22. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/dispatch.py +2 -15
  23. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/schema.py +25 -24
  24. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/udf.py +0 -106
  25. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/types.py +4 -2
  26. datachain-0.3.18/src/datachain/telemetry.py +37 -0
  27. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/utils.py +11 -0
  28. {datachain-0.3.17 → datachain-0.3.18/src/datachain.egg-info}/PKG-INFO +5 -3
  29. {datachain-0.3.17 → datachain-0.3.18}/src/datachain.egg-info/SOURCES.txt +4 -1
  30. {datachain-0.3.17 → datachain-0.3.18}/src/datachain.egg-info/requires.txt +4 -2
  31. {datachain-0.3.17 → datachain-0.3.18}/tests/conftest.py +5 -0
  32. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_datachain.py +3 -4
  33. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_datasets.py +6 -8
  34. datachain-0.3.18/tests/test_telemetry.py +20 -0
  35. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_file.py +3 -26
  36. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_cache.py +9 -4
  37. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_data_storage.py +1 -1
  38. {datachain-0.3.17 → datachain-0.3.18}/.cruft.json +0 -0
  39. {datachain-0.3.17 → datachain-0.3.18}/.gitattributes +0 -0
  40. {datachain-0.3.17 → datachain-0.3.18}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  41. {datachain-0.3.17 → datachain-0.3.18}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  42. {datachain-0.3.17 → datachain-0.3.18}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  43. {datachain-0.3.17 → datachain-0.3.18}/.github/codecov.yaml +0 -0
  44. {datachain-0.3.17 → datachain-0.3.18}/.github/dependabot.yml +0 -0
  45. {datachain-0.3.17 → datachain-0.3.18}/.github/workflows/benchmarks.yml +0 -0
  46. {datachain-0.3.17 → datachain-0.3.18}/.github/workflows/release.yml +0 -0
  47. {datachain-0.3.17 → datachain-0.3.18}/.github/workflows/tests-studio.yml +0 -0
  48. {datachain-0.3.17 → datachain-0.3.18}/.github/workflows/tests.yml +0 -0
  49. {datachain-0.3.17 → datachain-0.3.18}/.github/workflows/update-template.yaml +0 -0
  50. {datachain-0.3.17 → datachain-0.3.18}/.gitignore +0 -0
  51. {datachain-0.3.17 → datachain-0.3.18}/.pre-commit-config.yaml +0 -0
  52. {datachain-0.3.17 → datachain-0.3.18}/CODE_OF_CONDUCT.rst +0 -0
  53. {datachain-0.3.17 → datachain-0.3.18}/CONTRIBUTING.rst +0 -0
  54. {datachain-0.3.17 → datachain-0.3.18}/LICENSE +0 -0
  55. {datachain-0.3.17 → datachain-0.3.18}/README.rst +0 -0
  56. {datachain-0.3.17 → datachain-0.3.18}/docs/assets/captioned_cartoons.png +0 -0
  57. {datachain-0.3.17 → datachain-0.3.18}/docs/assets/datachain-white.svg +0 -0
  58. {datachain-0.3.17 → datachain-0.3.18}/docs/assets/datachain.svg +0 -0
  59. {datachain-0.3.17 → datachain-0.3.18}/docs/assets/flowchart.png +0 -0
  60. {datachain-0.3.17 → datachain-0.3.18}/docs/index.md +0 -0
  61. {datachain-0.3.17 → datachain-0.3.18}/docs/references/datachain.md +0 -0
  62. {datachain-0.3.17 → datachain-0.3.18}/docs/references/datatype.md +0 -0
  63. {datachain-0.3.17 → datachain-0.3.18}/docs/references/file.md +0 -0
  64. {datachain-0.3.17 → datachain-0.3.18}/docs/references/index.md +0 -0
  65. {datachain-0.3.17 → datachain-0.3.18}/docs/references/sql.md +0 -0
  66. {datachain-0.3.17 → datachain-0.3.18}/docs/references/torch.md +0 -0
  67. {datachain-0.3.17 → datachain-0.3.18}/docs/references/udf.md +0 -0
  68. {datachain-0.3.17 → datachain-0.3.18}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  69. {datachain-0.3.17 → datachain-0.3.18}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  70. {datachain-0.3.17 → datachain-0.3.18}/examples/computer_vision/openimage-detect.py +0 -0
  71. {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/common_sql_functions.py +0 -0
  72. {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/json-csv-reader.py +0 -0
  73. {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/torch-loader.py +0 -0
  74. {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/udfs/parallel.py +0 -0
  75. {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/udfs/simple.py +0 -0
  76. {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/udfs/stateful.py +0 -0
  77. {datachain-0.3.17 → datachain-0.3.18}/examples/llm_and_nlp/claude-query.py +0 -0
  78. {datachain-0.3.17 → datachain-0.3.18}/examples/multimodal/clip_inference.py +0 -0
  79. {datachain-0.3.17 → datachain-0.3.18}/examples/multimodal/wds.py +0 -0
  80. {datachain-0.3.17 → datachain-0.3.18}/examples/multimodal/wds_filtered.py +0 -0
  81. {datachain-0.3.17 → datachain-0.3.18}/mkdocs.yml +0 -0
  82. {datachain-0.3.17 → datachain-0.3.18}/noxfile.py +0 -0
  83. {datachain-0.3.17 → datachain-0.3.18}/setup.cfg +0 -0
  84. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/__init__.py +0 -0
  85. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/__main__.py +0 -0
  86. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/asyn.py +0 -0
  87. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/catalog/__init__.py +0 -0
  88. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/catalog/datasource.py +0 -0
  89. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/catalog/loader.py +0 -0
  90. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/cli_utils.py +0 -0
  91. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/__init__.py +0 -0
  92. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/azure.py +0 -0
  93. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/fileslice.py +0 -0
  94. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/gcs.py +0 -0
  95. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/hf.py +0 -0
  96. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/s3.py +0 -0
  97. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/config.py +0 -0
  98. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/__init__.py +0 -0
  99. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/db_engine.py +0 -0
  100. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/id_generator.py +0 -0
  101. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/job.py +0 -0
  102. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/metastore.py +0 -0
  103. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/schema.py +0 -0
  104. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/serializer.py +0 -0
  105. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/sqlite.py +0 -0
  106. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/warehouse.py +0 -0
  107. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/dataset.py +0 -0
  108. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/error.py +0 -0
  109. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/job.py +0 -0
  110. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/__init__.py +0 -0
  111. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/clip.py +0 -0
  112. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/__init__.py +0 -0
  113. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/flatten.py +0 -0
  114. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/python_to_sql.py +0 -0
  115. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/sql_to_python.py +0 -0
  116. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/unflatten.py +0 -0
  117. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  118. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/data_model.py +0 -0
  119. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/dataset_info.py +0 -0
  120. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/hf.py +0 -0
  121. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/image.py +0 -0
  122. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/listing_info.py +0 -0
  123. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/meta_formats.py +0 -0
  124. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/model_store.py +0 -0
  125. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/pytorch.py +0 -0
  126. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/settings.py +0 -0
  127. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/signal_schema.py +0 -0
  128. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/tar.py +0 -0
  129. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/text.py +0 -0
  130. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/udf.py +0 -0
  131. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/udf_signature.py +0 -0
  132. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/utils.py +0 -0
  133. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/vfile.py +0 -0
  134. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/webdataset.py +0 -0
  135. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/webdataset_laion.py +0 -0
  136. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/py.typed +0 -0
  137. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/__init__.py +0 -0
  138. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/batch.py +0 -0
  139. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/metrics.py +0 -0
  140. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/params.py +0 -0
  141. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/queue.py +0 -0
  142. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/session.py +0 -0
  143. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/remote/__init__.py +0 -0
  144. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/remote/studio.py +0 -0
  145. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/__init__.py +0 -0
  146. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/default/__init__.py +0 -0
  147. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/default/base.py +0 -0
  148. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/__init__.py +0 -0
  149. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/array.py +0 -0
  150. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/conditional.py +0 -0
  151. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/path.py +0 -0
  152. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/random.py +0 -0
  153. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/string.py +0 -0
  154. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/selectable.py +0 -0
  155. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/sqlite/__init__.py +0 -0
  156. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/sqlite/base.py +0 -0
  157. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/sqlite/types.py +0 -0
  158. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/sqlite/vector.py +0 -0
  159. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/utils.py +0 -0
  160. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/storage.py +0 -0
  161. {datachain-0.3.17 → datachain-0.3.18}/src/datachain/torch/__init__.py +0 -0
  162. {datachain-0.3.17 → datachain-0.3.18}/src/datachain.egg-info/dependency_links.txt +0 -0
  163. {datachain-0.3.17 → datachain-0.3.18}/src/datachain.egg-info/entry_points.txt +0 -0
  164. {datachain-0.3.17 → datachain-0.3.18}/src/datachain.egg-info/top_level.txt +0 -0
  165. {datachain-0.3.17 → datachain-0.3.18}/tests/__init__.py +0 -0
  166. {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/__init__.py +0 -0
  167. {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/conftest.py +0 -0
  168. {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  169. {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/datasets/.dvc/config +0 -0
  170. {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/datasets/.gitignore +0 -0
  171. {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  172. {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/test_datachain.py +0 -0
  173. {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/test_ls.py +0 -0
  174. {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/test_version.py +0 -0
  175. {datachain-0.3.17 → datachain-0.3.18}/tests/data.py +0 -0
  176. {datachain-0.3.17 → datachain-0.3.18}/tests/examples/__init__.py +0 -0
  177. {datachain-0.3.17 → datachain-0.3.18}/tests/examples/test_examples.py +0 -0
  178. {datachain-0.3.17 → datachain-0.3.18}/tests/examples/test_wds_e2e.py +0 -0
  179. {datachain-0.3.17 → datachain-0.3.18}/tests/examples/wds_data.py +0 -0
  180. {datachain-0.3.17 → datachain-0.3.18}/tests/func/__init__.py +0 -0
  181. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_catalog.py +0 -0
  182. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_client.py +0 -0
  183. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_dataset_query.py +0 -0
  184. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_feature_pickling.py +0 -0
  185. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_listing.py +0 -0
  186. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_ls.py +0 -0
  187. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_meta_formats.py +0 -0
  188. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_metrics.py +0 -0
  189. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_pull.py +0 -0
  190. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_pytorch.py +0 -0
  191. {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_query.py +0 -0
  192. {datachain-0.3.17 → datachain-0.3.18}/tests/scripts/feature_class.py +0 -0
  193. {datachain-0.3.17 → datachain-0.3.18}/tests/scripts/feature_class_parallel.py +0 -0
  194. {datachain-0.3.17 → datachain-0.3.18}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  195. {datachain-0.3.17 → datachain-0.3.18}/tests/scripts/name_len_slow.py +0 -0
  196. {datachain-0.3.17 → datachain-0.3.18}/tests/test_cli_e2e.py +0 -0
  197. {datachain-0.3.17 → datachain-0.3.18}/tests/test_query_e2e.py +0 -0
  198. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/__init__.py +0 -0
  199. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/__init__.py +0 -0
  200. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/conftest.py +0 -0
  201. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_arrow.py +0 -0
  202. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_clip.py +0 -0
  203. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_datachain.py +0 -0
  204. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  205. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_datachain_merge.py +0 -0
  206. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_feature.py +0 -0
  207. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_feature_utils.py +0 -0
  208. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_hf.py +0 -0
  209. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_image.py +0 -0
  210. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_schema.py +0 -0
  211. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_signal_schema.py +0 -0
  212. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_sql_to_python.py +0 -0
  213. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_text.py +0 -0
  214. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_udf_signature.py +0 -0
  215. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_utils.py +0 -0
  216. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_webdataset.py +0 -0
  217. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/__init__.py +0 -0
  218. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/sqlite/__init__.py +0 -0
  219. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/sqlite/test_utils.py +0 -0
  220. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_array.py +0 -0
  221. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_conditional.py +0 -0
  222. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_path.py +0 -0
  223. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_random.py +0 -0
  224. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_selectable.py +0 -0
  225. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_string.py +0 -0
  226. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_asyn.py +0 -0
  227. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_catalog.py +0 -0
  228. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_catalog_loader.py +0 -0
  229. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_cli_parsing.py +0 -0
  230. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_client.py +0 -0
  231. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_client_s3.py +0 -0
  232. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_database_engine.py +0 -0
  233. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_dataset.py +0 -0
  234. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_dispatch.py +0 -0
  235. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_fileslice.py +0 -0
  236. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_id_generator.py +0 -0
  237. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_listing.py +0 -0
  238. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_metastore.py +0 -0
  239. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_module_exports.py +0 -0
  240. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_query_metrics.py +0 -0
  241. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_query_params.py +0 -0
  242. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_serializer.py +0 -0
  243. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_session.py +0 -0
  244. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_storage.py +0 -0
  245. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_utils.py +0 -0
  246. {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_warehouse.py +0 -0
  247. {datachain-0.3.17 → datachain-0.3.18}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.17
3
+ Version: 0.3.18
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -44,6 +44,7 @@ Requires-Dist: Pillow<11,>=10.0.0
44
44
  Requires-Dist: msgpack<2,>=1.0.4
45
45
  Requires-Dist: psutil
46
46
  Requires-Dist: huggingface_hub
47
+ Requires-Dist: iterative-telemetry>=0.0.9
47
48
  Provides-Extra: docs
48
49
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
49
50
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -69,7 +70,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
69
70
  Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
70
71
  Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
71
72
  Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
72
- Requires-Dist: pytest-servers[all]>=0.5.5; extra == "tests"
73
+ Requires-Dist: pytest-servers[all]>=0.5.7; extra == "tests"
73
74
  Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
74
75
  Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
75
76
  Requires-Dist: virtualenv; extra == "tests"
@@ -91,9 +92,10 @@ Requires-Dist: datachain[tests]; extra == "examples"
91
92
  Requires-Dist: numpy<2,>=1; extra == "examples"
92
93
  Requires-Dist: defusedxml; extra == "examples"
93
94
  Requires-Dist: accelerate; extra == "examples"
94
- Requires-Dist: unstructured[pdf]; extra == "examples"
95
+ Requires-Dist: unstructured[embed-huggingface,pdf]; extra == "examples"
95
96
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
96
97
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
98
+ Requires-Dist: onnx==1.16.1; extra == "examples"
97
99
 
98
100
  ================
99
101
  |logo| DataChain
@@ -0,0 +1,76 @@
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
8
+ from collections.abc import Iterator
9
+
10
+ from unstructured.cleaners.core import (
11
+ clean,
12
+ group_broken_paragraphs,
13
+ replace_unicode_quotes,
14
+ )
15
+ from unstructured.embed.huggingface import (
16
+ HuggingFaceEmbeddingConfig,
17
+ HuggingFaceEmbeddingEncoder,
18
+ )
19
+ from unstructured.partition.pdf import partition_pdf
20
+
21
+ from datachain import C, DataChain, DataModel, File
22
+
23
+ source = "gs://datachain-demo/neurips/1987/"
24
+
25
+
26
+ # Define the output as a DataModel class
27
+ class Chunk(DataModel):
28
+ key: str
29
+ text: str
30
+ embeddings: list[float]
31
+
32
+
33
+ # Define embedding encoder
34
+
35
+ embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
36
+
37
+
38
+ # Use signatures to define UDF input/output
39
+ # these can be pydantic model or regular Python types
40
+ def process_pdf(file: File) -> Iterator[Chunk]:
41
+ # Ingest the file
42
+ with file.open() as f:
43
+ chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
44
+
45
+ # Clean the chunks and add new columns
46
+ for chunk in chunks:
47
+ chunk.apply(
48
+ lambda text: clean(
49
+ text, bullets=True, extra_whitespace=True, trailing_punctuation=True
50
+ )
51
+ )
52
+ chunk.apply(replace_unicode_quotes)
53
+ chunk.apply(group_broken_paragraphs)
54
+
55
+ # create embeddings
56
+ chunks_embedded = embedding_encoder.embed_documents(chunks)
57
+
58
+ # Add new rows to DataChain
59
+ for chunk in chunks_embedded:
60
+ yield Chunk(
61
+ key=file.path,
62
+ text=chunk.text,
63
+ embeddings=chunk.embeddings,
64
+ )
65
+
66
+
67
+ dc = (
68
+ DataChain.from_storage(source)
69
+ .settings(parallel=-1)
70
+ .filter(C.file.path.glob("*.pdf"))
71
+ .gen(document=process_pdf)
72
+ )
73
+
74
+ dc.save("embedded-documents")
75
+
76
+ DataChain.from_dataset("embedded-documents").show()
@@ -1,6 +1,10 @@
1
- #
2
- # pip install unstructured[pdf] huggingface_hub[hf_transfer]
3
- #
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
4
8
  import os
5
9
 
6
10
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
@@ -1,4 +1,10 @@
1
- # pip install scipy torch transformers huggingface_hub[hf_transfer]
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
2
8
  # NOTE: also need to install ffmpeg binary
3
9
  import json
4
10
  import os
@@ -1,5 +1,3 @@
1
- # pip install Pillow
2
-
3
1
  import base64
4
2
  import os
5
3
 
@@ -46,7 +46,8 @@ dependencies = [
46
46
  "Pillow>=10.0.0,<11",
47
47
  "msgpack>=1.0.4,<2",
48
48
  "psutil",
49
- "huggingface_hub"
49
+ "huggingface_hub",
50
+ "iterative-telemetry>=0.0.9"
50
51
  ]
51
52
 
52
53
  [project.optional-dependencies]
@@ -80,7 +81,7 @@ tests = [
80
81
  "pytest-sugar>=0.9.6",
81
82
  "pytest-cov>=4.1.0",
82
83
  "pytest-mock>=3.12.0",
83
- "pytest-servers[all]>=0.5.5",
84
+ "pytest-servers[all]>=0.5.7",
84
85
  "pytest-benchmark[histogram]",
85
86
  "pytest-xdist>=3.3.1",
86
87
  "virtualenv",
@@ -104,9 +105,10 @@ examples = [
104
105
  "numpy>=1,<2",
105
106
  "defusedxml",
106
107
  "accelerate",
107
- "unstructured[pdf]",
108
+ "unstructured[pdf, embed-huggingface]",
108
109
  "pdfplumber==0.11.4",
109
- "huggingface_hub[hf_transfer]"
110
+ "huggingface_hub[hf_transfer]",
111
+ "onnx==1.16.1"
110
112
  ]
111
113
 
112
114
  [project.urls]
@@ -1,56 +1,15 @@
1
- import hashlib
2
- import json
3
1
  import os
4
- from datetime import datetime
5
- from functools import partial
6
2
  from typing import TYPE_CHECKING, Optional
7
3
 
8
- import attrs
9
4
  from dvc_data.hashfile.db.local import LocalHashFileDB
10
5
  from dvc_objects.fs.local import LocalFileSystem
11
6
  from fsspec.callbacks import Callback, TqdmCallback
12
7
 
13
- from datachain.utils import TIME_ZERO
14
-
15
8
  from .progress import Tqdm
16
9
 
17
10
  if TYPE_CHECKING:
18
11
  from datachain.client import Client
19
- from datachain.storage import StorageURI
20
-
21
- sha256 = partial(hashlib.sha256, usedforsecurity=False)
22
-
23
-
24
- @attrs.frozen
25
- class UniqueId:
26
- storage: "StorageURI"
27
- path: str
28
- size: int
29
- etag: str
30
- version: str = ""
31
- is_latest: bool = True
32
- location: Optional[str] = None
33
- last_modified: datetime = TIME_ZERO
34
-
35
- def get_parsed_location(self) -> Optional[dict]:
36
- if not self.location:
37
- return None
38
-
39
- loc_stack = (
40
- json.loads(self.location)
41
- if isinstance(self.location, str)
42
- else self.location
43
- )
44
- if len(loc_stack) > 1:
45
- raise NotImplementedError("Nested v-objects are not supported yet.")
46
-
47
- return loc_stack[0]
48
-
49
- def get_hash(self) -> str:
50
- fingerprint = f"{self.storage}/{self.path}/{self.version}/{self.etag}"
51
- if self.location:
52
- fingerprint += f"/{self.location}"
53
- return sha256(fingerprint.encode()).hexdigest()
12
+ from datachain.lib.file import File
54
13
 
55
14
 
56
15
  def try_scandir(path):
@@ -77,30 +36,30 @@ class DataChainCache:
77
36
  def tmp_dir(self):
78
37
  return self.odb.tmp_dir
79
38
 
80
- def get_path(self, uid: UniqueId) -> Optional[str]:
81
- if self.contains(uid):
82
- return self.path_from_checksum(uid.get_hash())
39
+ def get_path(self, file: "File") -> Optional[str]:
40
+ if self.contains(file):
41
+ return self.path_from_checksum(file.get_hash())
83
42
  return None
84
43
 
85
- def contains(self, uid: UniqueId) -> bool:
86
- return self.odb.exists(uid.get_hash())
44
+ def contains(self, file: "File") -> bool:
45
+ return self.odb.exists(file.get_hash())
87
46
 
88
47
  def path_from_checksum(self, checksum: str) -> str:
89
48
  assert checksum
90
49
  return self.odb.oid_to_path(checksum)
91
50
 
92
- def remove(self, uid: UniqueId) -> None:
93
- self.odb.delete(uid.get_hash())
51
+ def remove(self, file: "File") -> None:
52
+ self.odb.delete(file.get_hash())
94
53
 
95
54
  async def download(
96
- self, uid: UniqueId, client: "Client", callback: Optional[Callback] = None
55
+ self, file: "File", client: "Client", callback: Optional[Callback] = None
97
56
  ) -> None:
98
- from_path = f"{uid.storage}/{uid.path}"
57
+ from_path = f"{file.source}/{file.path}"
99
58
  from dvc_objects.fs.utils import tmp_fname
100
59
 
101
60
  odb_fs = self.odb.fs
102
61
  tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
103
- size = uid.size
62
+ size = file.size
104
63
  if size < 0:
105
64
  size = await client.get_size(from_path)
106
65
  cb = callback or TqdmCallback(
@@ -115,13 +74,13 @@ class DataChainCache:
115
74
  cb.close()
116
75
 
117
76
  try:
118
- oid = uid.get_hash()
77
+ oid = file.get_hash()
119
78
  self.odb.add(tmp_info, self.odb.fs, oid)
120
79
  finally:
121
80
  os.unlink(tmp_info)
122
81
 
123
- def store_data(self, uid: UniqueId, contents: bytes) -> None:
124
- checksum = uid.get_hash()
82
+ def store_data(self, file: "File", contents: bytes) -> None:
83
+ checksum = file.get_hash()
125
84
  dst = self.path_from_checksum(checksum)
126
85
  if not os.path.exists(dst):
127
86
  # Create the file only if it's not already in cache
@@ -34,7 +34,7 @@ import yaml
34
34
  from sqlalchemy import Column
35
35
  from tqdm import tqdm
36
36
 
37
- from datachain.cache import DataChainCache, UniqueId
37
+ from datachain.cache import DataChainCache
38
38
  from datachain.client import Client
39
39
  from datachain.config import get_remote_config, read_config
40
40
  from datachain.dataset import (
@@ -619,13 +619,13 @@ class Catalog:
619
619
  code_ast.body[-1:] = new_expressions
620
620
  return code_ast
621
621
 
622
- def get_client(self, uri: StorageURI, **config: Any) -> Client:
622
+ def get_client(self, uri: str, **config: Any) -> Client:
623
623
  """
624
624
  Return the client corresponding to the given source `uri`.
625
625
  """
626
626
  config = config or self.client_config
627
627
  cls = Client.get_implementation(uri)
628
- return cls.from_source(uri, self.cache, **config)
628
+ return cls.from_source(StorageURI(uri), self.cache, **config)
629
629
 
630
630
  def enlist_source(
631
631
  self,
@@ -1431,7 +1431,7 @@ class Catalog:
1431
1431
 
1432
1432
  def get_file_signals(
1433
1433
  self, dataset_name: str, dataset_version: int, row: RowDict
1434
- ) -> Optional[dict]:
1434
+ ) -> Optional[RowDict]:
1435
1435
  """
1436
1436
  Function that returns file signals from dataset row.
1437
1437
  Note that signal names are without prefix, so if there was 'laion__file__source'
@@ -1448,7 +1448,7 @@ class Catalog:
1448
1448
 
1449
1449
  version = self.get_dataset(dataset_name).get_version(dataset_version)
1450
1450
 
1451
- file_signals_values = {}
1451
+ file_signals_values = RowDict()
1452
1452
 
1453
1453
  schema = SignalSchema.deserialize(version.feature_schema)
1454
1454
  for file_signals in schema.get_signals(File):
@@ -1476,6 +1476,8 @@ class Catalog:
1476
1476
  use_cache: bool = True,
1477
1477
  **config: Any,
1478
1478
  ):
1479
+ from datachain.lib.file import File
1480
+
1479
1481
  file_signals = self.get_file_signals(dataset_name, dataset_version, row)
1480
1482
  if not file_signals:
1481
1483
  raise RuntimeError("Cannot open object without file signals")
@@ -1483,22 +1485,10 @@ class Catalog:
1483
1485
  config = config or self.client_config
1484
1486
  client = self.get_client(file_signals["source"], **config)
1485
1487
  return client.open_object(
1486
- self._get_row_uid(file_signals), # type: ignore [arg-type]
1488
+ File._from_row(file_signals),
1487
1489
  use_cache=use_cache,
1488
1490
  )
1489
1491
 
1490
- def _get_row_uid(self, row: RowDict) -> UniqueId:
1491
- return UniqueId(
1492
- row["source"],
1493
- row["path"],
1494
- row["size"],
1495
- row["etag"],
1496
- row["version"],
1497
- row["is_latest"],
1498
- row["location"],
1499
- row["last_modified"],
1500
- )
1501
-
1502
1492
  def ls(
1503
1493
  self,
1504
1494
  sources: list[str],
@@ -15,6 +15,7 @@ import shtab
15
15
  from datachain import utils
16
16
  from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
17
17
  from datachain.lib.dc import DataChain
18
+ from datachain.telemetry import telemetry
18
19
  from datachain.utils import DataChainDir
19
20
 
20
21
  if TYPE_CHECKING:
@@ -872,6 +873,7 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
872
873
  # This also sets this environment variable for any subprocesses
873
874
  os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"
874
875
 
876
+ error = None
875
877
  try:
876
878
  catalog = get_catalog(client_config=client_config)
877
879
  if args.command == "cp":
@@ -1003,14 +1005,16 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
1003
1005
  print(f"invalid command: {args.command}", file=sys.stderr)
1004
1006
  return 1
1005
1007
  return 0
1006
- except BrokenPipeError:
1008
+ except BrokenPipeError as exc:
1007
1009
  # Python flushes standard streams on exit; redirect remaining output
1008
1010
  # to devnull to avoid another BrokenPipeError at shutdown
1009
1011
  # See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
1012
+ error = str(exc)
1010
1013
  devnull = os.open(os.devnull, os.O_WRONLY)
1011
1014
  os.dup2(devnull, sys.stdout.fileno())
1012
1015
  return 141 # 128 + 13 (SIGPIPE)
1013
1016
  except (KeyboardInterrupt, Exception) as exc:
1017
+ error = str(exc)
1014
1018
  if isinstance(exc, KeyboardInterrupt):
1015
1019
  msg = "Operation cancelled by the user"
1016
1020
  else:
@@ -1028,3 +1032,5 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
1028
1032
 
1029
1033
  pdb.post_mortem()
1030
1034
  return 1
1035
+ finally:
1036
+ telemetry.send_cli_call(args.command, error=error)
@@ -3,7 +3,6 @@ import functools
3
3
  import logging
4
4
  import multiprocessing
5
5
  import os
6
- import posixpath
7
6
  import re
8
7
  import sys
9
8
  from abc import ABC, abstractmethod
@@ -26,8 +25,8 @@ from fsspec.asyn import get_loop, sync
26
25
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
27
26
  from tqdm import tqdm
28
27
 
29
- from datachain.cache import DataChainCache, UniqueId
30
- from datachain.client.fileslice import FileSlice, FileWrapper
28
+ from datachain.cache import DataChainCache
29
+ from datachain.client.fileslice import FileWrapper
31
30
  from datachain.error import ClientError as DataChainClientError
32
31
  from datachain.lib.file import File
33
32
  from datachain.nodes_fetcher import NodesFetcher
@@ -187,8 +186,8 @@ class Client(ABC):
187
186
  def url(self, path: str, expires: int = 3600, **kwargs) -> str:
188
187
  return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
189
188
 
190
- async def get_current_etag(self, uid: UniqueId) -> str:
191
- info = await self.fs._info(self.get_full_path(uid.path))
189
+ async def get_current_etag(self, file: "File") -> str:
190
+ info = await self.fs._info(self.get_full_path(file.path))
192
191
  return self.info_to_file(info, "").etag
193
192
 
194
193
  async def get_size(self, path: str) -> int:
@@ -317,7 +316,7 @@ class Client(ABC):
317
316
 
318
317
  def instantiate_object(
319
318
  self,
320
- uid: UniqueId,
319
+ file: "File",
321
320
  dst: str,
322
321
  progress_bar: tqdm,
323
322
  force: bool = False,
@@ -328,10 +327,10 @@ class Client(ABC):
328
327
  else:
329
328
  progress_bar.close()
330
329
  raise FileExistsError(f"Path {dst} already exists")
331
- self.do_instantiate_object(uid, dst)
330
+ self.do_instantiate_object(file, dst)
332
331
 
333
- def do_instantiate_object(self, uid: "UniqueId", dst: str) -> None:
334
- src = self.cache.get_path(uid)
332
+ def do_instantiate_object(self, file: "File", dst: str) -> None:
333
+ src = self.cache.get_path(file)
335
334
  assert src is not None
336
335
 
337
336
  try:
@@ -341,66 +340,33 @@ class Client(ABC):
341
340
  copy2(src, dst)
342
341
 
343
342
  def open_object(
344
- self, uid: UniqueId, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
343
+ self, file: File, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
345
344
  ) -> BinaryIO:
346
345
  """Open a file, including files in tar archives."""
347
- location = uid.get_parsed_location()
348
- if use_cache and (cache_path := self.cache.get_path(uid)):
346
+ if use_cache and (cache_path := self.cache.get_path(file)):
349
347
  return open(cache_path, mode="rb") # noqa: SIM115
350
- if location and location["vtype"] == "tar":
351
- return self._open_tar(uid, use_cache=True)
352
- return FileWrapper(self.fs.open(self.get_full_path(uid.path)), cb) # type: ignore[return-value]
353
-
354
- def _open_tar(self, uid: UniqueId, use_cache: bool = True):
355
- location = uid.get_parsed_location()
356
- assert location
357
-
358
- offset = location["offset"]
359
- size = location["size"]
360
- parent = location["parent"]
361
-
362
- parent_uid = UniqueId(
363
- parent["source"],
364
- parent["path"],
365
- parent["size"],
366
- parent["etag"],
367
- location=parent["location"],
368
- )
369
- f = self.open_object(parent_uid, use_cache=use_cache)
370
- return FileSlice(f, offset, size, posixpath.basename(uid.path))
371
-
372
- def download(self, uid: UniqueId, *, callback: Callback = DEFAULT_CALLBACK) -> None:
373
- sync(get_loop(), functools.partial(self._download, uid, callback=callback))
374
-
375
- async def _download(self, uid: UniqueId, *, callback: "Callback" = None) -> None:
376
- if self.cache.contains(uid):
348
+ assert not file.location
349
+ return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb) # type: ignore[return-value]
350
+
351
+ def download(self, file: File, *, callback: Callback = DEFAULT_CALLBACK) -> None:
352
+ sync(get_loop(), functools.partial(self._download, file, callback=callback))
353
+
354
+ async def _download(self, file: File, *, callback: "Callback" = None) -> None:
355
+ if self.cache.contains(file):
377
356
  # Already in cache, so there's nothing to do.
378
357
  return
379
- await self._put_in_cache(uid, callback=callback)
358
+ await self._put_in_cache(file, callback=callback)
380
359
 
381
- def put_in_cache(self, uid: UniqueId, *, callback: "Callback" = None) -> None:
382
- sync(get_loop(), functools.partial(self._put_in_cache, uid, callback=callback))
360
+ def put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
361
+ sync(get_loop(), functools.partial(self._put_in_cache, file, callback=callback))
383
362
 
384
- async def _put_in_cache(
385
- self, uid: UniqueId, *, callback: "Callback" = None
386
- ) -> None:
387
- location = uid.get_parsed_location()
388
- if location and location["vtype"] == "tar":
389
- loop = asyncio.get_running_loop()
390
- await loop.run_in_executor(
391
- None, functools.partial(self._download_from_tar, uid, callback=callback)
392
- )
393
- return
394
- if uid.etag:
395
- etag = await self.get_current_etag(uid)
396
- if uid.etag != etag:
363
+ async def _put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
364
+ assert not file.location
365
+ if file.etag:
366
+ etag = await self.get_current_etag(file)
367
+ if file.etag != etag:
397
368
  raise FileNotFoundError(
398
- f"Invalid etag for {uid.storage}/{uid.path}: "
399
- f"expected {uid.etag}, got {etag}"
369
+ f"Invalid etag for {file.source}/{file.path}: "
370
+ f"expected {file.etag}, got {etag}"
400
371
  )
401
- await self.cache.download(uid, self, callback=callback)
402
-
403
- def _download_from_tar(self, uid, *, callback: "Callback" = None):
404
- with self._open_tar(uid, use_cache=False) as f:
405
- contents = f.read()
406
- self.cache.store_data(uid, contents)
372
+ await self.cache.download(file, self, callback=callback)
@@ -7,7 +7,6 @@ from urllib.parse import urlparse
7
7
 
8
8
  from fsspec.implementations.local import LocalFileSystem
9
9
 
10
- from datachain.cache import UniqueId
11
10
  from datachain.lib.file import File
12
11
  from datachain.storage import StorageURI
13
12
 
@@ -114,8 +113,8 @@ class FileClient(Client):
114
113
  use_symlinks=use_symlinks,
115
114
  )
116
115
 
117
- async def get_current_etag(self, uid: UniqueId) -> str:
118
- info = self.fs.info(self.get_full_path(uid.path))
116
+ async def get_current_etag(self, file: "File") -> str:
117
+ info = self.fs.info(self.get_full_path(file.path))
119
118
  return self.info_to_file(info, "").etag
120
119
 
121
120
  async def get_size(self, path: str) -> int:
@@ -49,7 +49,8 @@ class ArrowGenerator(Generator):
49
49
 
50
50
  def process(self, file: File):
51
51
  if file._caching_enabled:
52
- path = file.get_local_path(download=True)
52
+ file.ensure_cached()
53
+ path = file.get_local_path()
53
54
  ds = dataset(path, schema=self.input_schema, **self.kwargs)
54
55
  elif self.nrows:
55
56
  path = _nrows_file(file, self.nrows)
@@ -58,6 +58,7 @@ from datachain.query.dataset import (
58
58
  )
59
59
  from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
60
60
  from datachain.sql.functions import path as pathfunc
61
+ from datachain.telemetry import telemetry
61
62
  from datachain.utils import inside_notebook
62
63
 
63
64
  if TYPE_CHECKING:
@@ -246,6 +247,9 @@ class DataChain(DatasetQuery):
246
247
  **kwargs,
247
248
  indexing_column_types=File._datachain_column_types,
248
249
  )
250
+
251
+ telemetry.send_event_once("class", "datachain_init", **kwargs)
252
+
249
253
  if settings:
250
254
  self._settings = Settings(**settings)
251
255
  else: