datachain 0.3.17__tar.gz → 0.3.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (249) hide show
  1. {datachain-0.3.17 → datachain-0.3.19}/.github/workflows/benchmarks.yml +11 -8
  2. {datachain-0.3.17 → datachain-0.3.19}/.github/workflows/release.yml +6 -7
  3. {datachain-0.3.17 → datachain-0.3.19}/.github/workflows/tests-studio.yml +8 -5
  4. {datachain-0.3.17 → datachain-0.3.19}/.github/workflows/tests.yml +27 -18
  5. {datachain-0.3.17/src/datachain.egg-info → datachain-0.3.19}/PKG-INFO +5 -4
  6. {datachain-0.3.17 → datachain-0.3.19}/docs/references/file.md +2 -2
  7. datachain-0.3.19/examples/llm_and_nlp/unstructured-embeddings-gen.py +76 -0
  8. datachain-0.3.17/examples/llm_and_nlp/unstructured-text.py → datachain-0.3.19/examples/llm_and_nlp/unstructured-summary-map.py +7 -3
  9. {datachain-0.3.17 → datachain-0.3.19}/examples/multimodal/hf_pipeline.py +7 -1
  10. {datachain-0.3.17 → datachain-0.3.19}/examples/multimodal/openai_image_desc_lib.py +0 -2
  11. {datachain-0.3.17 → datachain-0.3.19}/noxfile.py +2 -2
  12. {datachain-0.3.17 → datachain-0.3.19}/pyproject.toml +6 -5
  13. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/__init__.py +5 -2
  14. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/cache.py +14 -55
  15. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/catalog/catalog.py +17 -97
  16. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/cli.py +7 -2
  17. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/fsspec.py +29 -63
  18. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/local.py +2 -3
  19. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/dataset.py +7 -2
  20. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/error.py +6 -4
  21. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/arrow.py +10 -4
  22. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/dc.py +6 -2
  23. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/file.py +64 -28
  24. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/listing.py +2 -0
  25. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/listing.py +4 -4
  26. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/node.py +6 -6
  27. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/nodes_fetcher.py +12 -5
  28. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/nodes_thread_pool.py +1 -1
  29. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/progress.py +2 -12
  30. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/dataset.py +6 -40
  31. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/dispatch.py +2 -15
  32. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/schema.py +25 -24
  33. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/udf.py +0 -106
  34. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/types.py +4 -2
  35. datachain-0.3.19/src/datachain/telemetry.py +37 -0
  36. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/utils.py +11 -0
  37. {datachain-0.3.17 → datachain-0.3.19/src/datachain.egg-info}/PKG-INFO +5 -4
  38. {datachain-0.3.17 → datachain-0.3.19}/src/datachain.egg-info/SOURCES.txt +5 -1
  39. {datachain-0.3.17 → datachain-0.3.19}/src/datachain.egg-info/requires.txt +4 -3
  40. {datachain-0.3.17 → datachain-0.3.19}/tests/conftest.py +6 -0
  41. {datachain-0.3.17 → datachain-0.3.19}/tests/examples/test_examples.py +38 -30
  42. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_catalog.py +2 -108
  43. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_datachain.py +46 -5
  44. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_dataset_query.py +6 -2
  45. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_datasets.py +12 -10
  46. datachain-0.3.19/tests/func/test_query.py +112 -0
  47. datachain-0.3.19/tests/test_telemetry.py +20 -0
  48. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_arrow.py +8 -9
  49. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_file.py +3 -26
  50. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_cache.py +9 -4
  51. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_data_storage.py +1 -1
  52. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_module_exports.py +2 -2
  53. datachain-0.3.19/tests/unit/test_query.py +65 -0
  54. datachain-0.3.17/tests/func/test_query.py +0 -182
  55. {datachain-0.3.17 → datachain-0.3.19}/.cruft.json +0 -0
  56. {datachain-0.3.17 → datachain-0.3.19}/.gitattributes +0 -0
  57. {datachain-0.3.17 → datachain-0.3.19}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  58. {datachain-0.3.17 → datachain-0.3.19}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  59. {datachain-0.3.17 → datachain-0.3.19}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  60. {datachain-0.3.17 → datachain-0.3.19}/.github/codecov.yaml +0 -0
  61. {datachain-0.3.17 → datachain-0.3.19}/.github/dependabot.yml +0 -0
  62. {datachain-0.3.17 → datachain-0.3.19}/.github/workflows/update-template.yaml +0 -0
  63. {datachain-0.3.17 → datachain-0.3.19}/.gitignore +0 -0
  64. {datachain-0.3.17 → datachain-0.3.19}/.pre-commit-config.yaml +0 -0
  65. {datachain-0.3.17 → datachain-0.3.19}/CODE_OF_CONDUCT.rst +0 -0
  66. {datachain-0.3.17 → datachain-0.3.19}/CONTRIBUTING.rst +0 -0
  67. {datachain-0.3.17 → datachain-0.3.19}/LICENSE +0 -0
  68. {datachain-0.3.17 → datachain-0.3.19}/README.rst +0 -0
  69. {datachain-0.3.17 → datachain-0.3.19}/docs/assets/captioned_cartoons.png +0 -0
  70. {datachain-0.3.17 → datachain-0.3.19}/docs/assets/datachain-white.svg +0 -0
  71. {datachain-0.3.17 → datachain-0.3.19}/docs/assets/datachain.svg +0 -0
  72. {datachain-0.3.17 → datachain-0.3.19}/docs/assets/flowchart.png +0 -0
  73. {datachain-0.3.17 → datachain-0.3.19}/docs/index.md +0 -0
  74. {datachain-0.3.17 → datachain-0.3.19}/docs/references/datachain.md +0 -0
  75. {datachain-0.3.17 → datachain-0.3.19}/docs/references/datatype.md +0 -0
  76. {datachain-0.3.17 → datachain-0.3.19}/docs/references/index.md +0 -0
  77. {datachain-0.3.17 → datachain-0.3.19}/docs/references/sql.md +0 -0
  78. {datachain-0.3.17 → datachain-0.3.19}/docs/references/torch.md +0 -0
  79. {datachain-0.3.17 → datachain-0.3.19}/docs/references/udf.md +0 -0
  80. {datachain-0.3.17 → datachain-0.3.19}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  81. {datachain-0.3.17 → datachain-0.3.19}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  82. {datachain-0.3.17 → datachain-0.3.19}/examples/computer_vision/openimage-detect.py +0 -0
  83. {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/common_sql_functions.py +0 -0
  84. {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/json-csv-reader.py +0 -0
  85. {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/torch-loader.py +0 -0
  86. {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/udfs/parallel.py +0 -0
  87. {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/udfs/simple.py +0 -0
  88. {datachain-0.3.17 → datachain-0.3.19}/examples/get_started/udfs/stateful.py +0 -0
  89. {datachain-0.3.17 → datachain-0.3.19}/examples/llm_and_nlp/claude-query.py +0 -0
  90. {datachain-0.3.17 → datachain-0.3.19}/examples/multimodal/clip_inference.py +0 -0
  91. {datachain-0.3.17 → datachain-0.3.19}/examples/multimodal/wds.py +0 -0
  92. {datachain-0.3.17 → datachain-0.3.19}/examples/multimodal/wds_filtered.py +0 -0
  93. {datachain-0.3.17 → datachain-0.3.19}/mkdocs.yml +0 -0
  94. {datachain-0.3.17 → datachain-0.3.19}/setup.cfg +0 -0
  95. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/__main__.py +0 -0
  96. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/asyn.py +0 -0
  97. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/catalog/__init__.py +0 -0
  98. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/catalog/datasource.py +0 -0
  99. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/catalog/loader.py +0 -0
  100. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/cli_utils.py +0 -0
  101. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/__init__.py +0 -0
  102. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/azure.py +0 -0
  103. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/fileslice.py +0 -0
  104. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/gcs.py +0 -0
  105. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/hf.py +0 -0
  106. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/client/s3.py +0 -0
  107. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/config.py +0 -0
  108. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/__init__.py +0 -0
  109. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/db_engine.py +0 -0
  110. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/id_generator.py +0 -0
  111. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/job.py +0 -0
  112. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/metastore.py +0 -0
  113. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/schema.py +0 -0
  114. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/serializer.py +0 -0
  115. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/sqlite.py +0 -0
  116. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/data_storage/warehouse.py +0 -0
  117. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/job.py +0 -0
  118. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/__init__.py +0 -0
  119. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/clip.py +0 -0
  120. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/__init__.py +0 -0
  121. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/flatten.py +0 -0
  122. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/python_to_sql.py +0 -0
  123. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/sql_to_python.py +0 -0
  124. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/unflatten.py +0 -0
  125. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  126. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/data_model.py +0 -0
  127. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/dataset_info.py +0 -0
  128. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/hf.py +0 -0
  129. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/image.py +0 -0
  130. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/listing_info.py +0 -0
  131. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/meta_formats.py +0 -0
  132. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/model_store.py +0 -0
  133. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/pytorch.py +0 -0
  134. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/settings.py +0 -0
  135. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/signal_schema.py +0 -0
  136. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/tar.py +0 -0
  137. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/text.py +0 -0
  138. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/udf.py +0 -0
  139. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/udf_signature.py +0 -0
  140. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/utils.py +0 -0
  141. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/vfile.py +0 -0
  142. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/webdataset.py +0 -0
  143. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/lib/webdataset_laion.py +0 -0
  144. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/py.typed +0 -0
  145. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/__init__.py +0 -0
  146. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/batch.py +0 -0
  147. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/metrics.py +0 -0
  148. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/params.py +0 -0
  149. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/queue.py +0 -0
  150. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/query/session.py +0 -0
  151. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/remote/__init__.py +0 -0
  152. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/remote/studio.py +0 -0
  153. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/__init__.py +0 -0
  154. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/default/__init__.py +0 -0
  155. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/default/base.py +0 -0
  156. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/__init__.py +0 -0
  157. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/array.py +0 -0
  158. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/conditional.py +0 -0
  159. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/path.py +0 -0
  160. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/random.py +0 -0
  161. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/functions/string.py +0 -0
  162. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/selectable.py +0 -0
  163. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/sqlite/__init__.py +0 -0
  164. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/sqlite/base.py +0 -0
  165. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/sqlite/types.py +0 -0
  166. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/sqlite/vector.py +0 -0
  167. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/sql/utils.py +0 -0
  168. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/storage.py +0 -0
  169. {datachain-0.3.17 → datachain-0.3.19}/src/datachain/torch/__init__.py +0 -0
  170. {datachain-0.3.17 → datachain-0.3.19}/src/datachain.egg-info/dependency_links.txt +0 -0
  171. {datachain-0.3.17 → datachain-0.3.19}/src/datachain.egg-info/entry_points.txt +0 -0
  172. {datachain-0.3.17 → datachain-0.3.19}/src/datachain.egg-info/top_level.txt +0 -0
  173. {datachain-0.3.17 → datachain-0.3.19}/tests/__init__.py +0 -0
  174. {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/__init__.py +0 -0
  175. {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/conftest.py +0 -0
  176. {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  177. {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/datasets/.dvc/config +0 -0
  178. {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/datasets/.gitignore +0 -0
  179. {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  180. {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/test_datachain.py +0 -0
  181. {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/test_ls.py +0 -0
  182. {datachain-0.3.17 → datachain-0.3.19}/tests/benchmarks/test_version.py +0 -0
  183. {datachain-0.3.17 → datachain-0.3.19}/tests/data.py +0 -0
  184. {datachain-0.3.17 → datachain-0.3.19}/tests/examples/__init__.py +0 -0
  185. {datachain-0.3.17 → datachain-0.3.19}/tests/examples/test_wds_e2e.py +0 -0
  186. {datachain-0.3.17 → datachain-0.3.19}/tests/examples/wds_data.py +0 -0
  187. {datachain-0.3.17 → datachain-0.3.19}/tests/func/__init__.py +0 -0
  188. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_client.py +0 -0
  189. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_feature_pickling.py +0 -0
  190. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_listing.py +0 -0
  191. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_ls.py +0 -0
  192. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_meta_formats.py +0 -0
  193. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_metrics.py +0 -0
  194. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_pull.py +0 -0
  195. {datachain-0.3.17 → datachain-0.3.19}/tests/func/test_pytorch.py +0 -0
  196. {datachain-0.3.17 → datachain-0.3.19}/tests/scripts/feature_class.py +0 -0
  197. {datachain-0.3.17 → datachain-0.3.19}/tests/scripts/feature_class_parallel.py +0 -0
  198. {datachain-0.3.17 → datachain-0.3.19}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  199. {datachain-0.3.17 → datachain-0.3.19}/tests/scripts/name_len_slow.py +0 -0
  200. {datachain-0.3.17 → datachain-0.3.19}/tests/test_cli_e2e.py +0 -0
  201. {datachain-0.3.17 → datachain-0.3.19}/tests/test_query_e2e.py +0 -0
  202. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/__init__.py +0 -0
  203. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/__init__.py +0 -0
  204. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/conftest.py +0 -0
  205. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_clip.py +0 -0
  206. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_datachain.py +0 -0
  207. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  208. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_datachain_merge.py +0 -0
  209. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_feature.py +0 -0
  210. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_feature_utils.py +0 -0
  211. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_hf.py +0 -0
  212. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_image.py +0 -0
  213. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_schema.py +0 -0
  214. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_signal_schema.py +0 -0
  215. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_sql_to_python.py +0 -0
  216. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_text.py +0 -0
  217. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_udf_signature.py +0 -0
  218. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_utils.py +0 -0
  219. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/lib/test_webdataset.py +0 -0
  220. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/__init__.py +0 -0
  221. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/sqlite/__init__.py +0 -0
  222. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/sqlite/test_utils.py +0 -0
  223. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_array.py +0 -0
  224. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_conditional.py +0 -0
  225. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_path.py +0 -0
  226. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_random.py +0 -0
  227. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_selectable.py +0 -0
  228. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/sql/test_string.py +0 -0
  229. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_asyn.py +0 -0
  230. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_catalog.py +0 -0
  231. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_catalog_loader.py +0 -0
  232. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_cli_parsing.py +0 -0
  233. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_client.py +0 -0
  234. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_client_s3.py +0 -0
  235. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_database_engine.py +0 -0
  236. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_dataset.py +0 -0
  237. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_dispatch.py +0 -0
  238. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_fileslice.py +0 -0
  239. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_id_generator.py +0 -0
  240. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_listing.py +0 -0
  241. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_metastore.py +0 -0
  242. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_query_metrics.py +0 -0
  243. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_query_params.py +0 -0
  244. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_serializer.py +0 -0
  245. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_session.py +0 -0
  246. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_storage.py +0 -0
  247. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_utils.py +0 -0
  248. {datachain-0.3.17 → datachain-0.3.19}/tests/unit/test_warehouse.py +0 -0
  249. {datachain-0.3.17 → datachain-0.3.19}/tests/utils.py +0 -0
@@ -23,15 +23,18 @@ jobs:
23
23
  uses: actions/setup-python@v5
24
24
  with:
25
25
  python-version: '3.12'
26
- cache: 'pip'
27
26
 
28
- - name: Upgrade nox and uv
29
- run: |
30
- python -m pip install --upgrade 'nox[uv]'
31
- nox --version
32
- uv --version
27
+ - name: Setup uv
28
+ uses: astral-sh/setup-uv@v3
29
+ with:
30
+ enable-cache: true
31
+ cache-suffix: benchmarks
32
+ cache-dependency-glob: pyproject.toml
33
+
34
+ - name: Install nox and dvc
35
+ run: uv pip install dvc[gs] nox --system
33
36
 
34
- - run: uv pip install dvc[gs] --system
35
- - run: dvc --cd tests/benchmarks/datasets pull
37
+ - name: Pull dataset
38
+ run: dvc --cd tests/benchmarks/datasets pull
36
39
  - name: Run benchmarks
37
40
  run: nox -s bench
@@ -21,17 +21,16 @@ jobs:
21
21
  with:
22
22
  fetch-depth: 0
23
23
 
24
- - name: Set up Python 3.10
24
+ - name: Set up Python 3.12
25
25
  uses: actions/setup-python@v5
26
26
  with:
27
- python-version: '3.10'
27
+ python-version: '3.12'
28
28
 
29
- - name: Upgrade nox and uv
30
- run: |
31
- python -m pip install --upgrade 'nox[uv]'
32
- nox --version
33
- uv --version
29
+ - name: Setup uv
30
+ uses: astral-sh/setup-uv@v3
34
31
 
32
+ - name: Install nox
33
+ run: uv pip install nox --system
35
34
  - name: Build package
36
35
  run: nox -s build
37
36
 
@@ -82,12 +82,15 @@ jobs:
82
82
  uses: actions/setup-python@v5
83
83
  with:
84
84
  python-version: ${{ matrix.pyv }}
85
- cache: 'pip'
86
85
 
87
- - name: Install uv
88
- run: |
89
- python -m pip install --upgrade uv
90
- uv --version
86
+ - name: Setup uv
87
+ uses: astral-sh/setup-uv@v3
88
+ with:
89
+ enable-cache: true
90
+ cache-suffix: studio
91
+ cache-dependency-glob: |
92
+ backend/datachain_server/pyproject.toml
93
+ backend/datachain/pyproject.toml
91
94
 
92
95
  - name: Install dependencies
93
96
  run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
@@ -26,13 +26,16 @@ jobs:
26
26
  uses: actions/setup-python@v5
27
27
  with:
28
28
  python-version: '3.9'
29
- cache: 'pip'
30
29
 
31
- - name: Upgrade nox and uv
32
- run: |
33
- python -m pip install --upgrade 'nox[uv]'
34
- nox --version
35
- uv --version
30
+ - name: Setup uv
31
+ uses: astral-sh/setup-uv@v3
32
+ with:
33
+ enable-cache: true
34
+ cache-suffix: lint
35
+ cache-dependency-glob: pyproject.toml
36
+
37
+ - name: Install nox
38
+ run: uv pip install nox --system
36
39
 
37
40
  - name: Cache mypy
38
41
  uses: actions/cache@v4
@@ -77,13 +80,16 @@ jobs:
77
80
  uses: actions/setup-python@v5
78
81
  with:
79
82
  python-version: ${{ matrix.pyv }}
80
- cache: 'pip'
81
83
 
82
- - name: Upgrade nox and uv
83
- run: |
84
- python -m pip install --upgrade 'nox[uv]'
85
- nox --version
86
- uv --version
84
+ - name: Setup uv
85
+ uses: astral-sh/setup-uv@v3
86
+ with:
87
+ enable-cache: true
88
+ cache-suffix: tests-${{ matrix.pyv }}
89
+ cache-dependency-glob: pyproject.toml
90
+
91
+ - name: Install nox
92
+ run: uv pip install nox --system
87
93
 
88
94
  - name: Skip flaky azure, gs remotes on macOS
89
95
  if: runner.os == 'macOS'
@@ -134,13 +140,16 @@ jobs:
134
140
  uses: actions/setup-python@v5
135
141
  with:
136
142
  python-version: ${{ matrix.pyv }}
137
- cache: 'pip'
138
143
 
139
- - name: Upgrade nox and uv
140
- run: |
141
- python -m pip install --upgrade 'nox[uv]'
142
- nox --version
143
- uv --version
144
+ - name: Setup uv
145
+ uses: astral-sh/setup-uv@v3
146
+ with:
147
+ enable-cache: true
148
+ cache-suffix: examples-${{ matrix.pyv }}
149
+ cache-dependency-glob: pyproject.toml
150
+
151
+ - name: Install nox
152
+ run: uv pip install nox --system
144
153
 
145
154
  - name: Run examples
146
155
  run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.17
3
+ Version: 0.3.19
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -34,7 +34,6 @@ Requires-Dist: dvc-objects<6,>=4
34
34
  Requires-Dist: shtab<2,>=1.3.4
35
35
  Requires-Dist: sqlalchemy>=2
36
36
  Requires-Dist: multiprocess==0.70.16
37
- Requires-Dist: dill==0.3.8
38
37
  Requires-Dist: cloudpickle
39
38
  Requires-Dist: orjson>=3.10.5
40
39
  Requires-Dist: pydantic<3,>=2
@@ -44,6 +43,7 @@ Requires-Dist: Pillow<11,>=10.0.0
44
43
  Requires-Dist: msgpack<2,>=1.0.4
45
44
  Requires-Dist: psutil
46
45
  Requires-Dist: huggingface_hub
46
+ Requires-Dist: iterative-telemetry>=0.0.9
47
47
  Provides-Extra: docs
48
48
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
49
49
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -69,7 +69,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
69
69
  Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
70
70
  Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
71
71
  Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
72
- Requires-Dist: pytest-servers[all]>=0.5.5; extra == "tests"
72
+ Requires-Dist: pytest-servers[all]>=0.5.7; extra == "tests"
73
73
  Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
74
74
  Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
75
75
  Requires-Dist: virtualenv; extra == "tests"
@@ -91,9 +91,10 @@ Requires-Dist: datachain[tests]; extra == "examples"
91
91
  Requires-Dist: numpy<2,>=1; extra == "examples"
92
92
  Requires-Dist: defusedxml; extra == "examples"
93
93
  Requires-Dist: accelerate; extra == "examples"
94
- Requires-Dist: unstructured[pdf]; extra == "examples"
94
+ Requires-Dist: unstructured[embed-huggingface,pdf]; extra == "examples"
95
95
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
96
96
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
97
+ Requires-Dist: onnx==1.16.1; extra == "examples"
97
98
 
98
99
  ================
99
100
  |logo| DataChain
@@ -7,6 +7,8 @@ automatically when creating a `DataChain` from files, like in
7
7
  classes include various metadata fields about the underlying file as well as methods to
8
8
  read from the files and otherwise work with the file contents.
9
9
 
10
+ ::: datachain.lib.file.ArrowRow
11
+
10
12
  ::: datachain.lib.file.ExportPlacement
11
13
 
12
14
  ::: datachain.lib.file.File
@@ -15,8 +17,6 @@ read from the files and otherwise work with the file contents.
15
17
 
16
18
  ::: datachain.lib.file.ImageFile
17
19
 
18
- ::: datachain.lib.file.IndexedFile
19
-
20
20
  ::: datachain.lib.file.TarVFile
21
21
 
22
22
  ::: datachain.lib.file.TextFile
@@ -0,0 +1,76 @@
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
8
+ from collections.abc import Iterator
9
+
10
+ from unstructured.cleaners.core import (
11
+ clean,
12
+ group_broken_paragraphs,
13
+ replace_unicode_quotes,
14
+ )
15
+ from unstructured.embed.huggingface import (
16
+ HuggingFaceEmbeddingConfig,
17
+ HuggingFaceEmbeddingEncoder,
18
+ )
19
+ from unstructured.partition.pdf import partition_pdf
20
+
21
+ from datachain import C, DataChain, DataModel, File
22
+
23
+ source = "gs://datachain-demo/neurips/1987/"
24
+
25
+
26
+ # Define the output as a DataModel class
27
+ class Chunk(DataModel):
28
+ key: str
29
+ text: str
30
+ embeddings: list[float]
31
+
32
+
33
+ # Define embedding encoder
34
+
35
+ embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
36
+
37
+
38
+ # Use signatures to define UDF input/output
39
+ # these can be pydantic model or regular Python types
40
+ def process_pdf(file: File) -> Iterator[Chunk]:
41
+ # Ingest the file
42
+ with file.open() as f:
43
+ chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
44
+
45
+ # Clean the chunks and add new columns
46
+ for chunk in chunks:
47
+ chunk.apply(
48
+ lambda text: clean(
49
+ text, bullets=True, extra_whitespace=True, trailing_punctuation=True
50
+ )
51
+ )
52
+ chunk.apply(replace_unicode_quotes)
53
+ chunk.apply(group_broken_paragraphs)
54
+
55
+ # create embeddings
56
+ chunks_embedded = embedding_encoder.embed_documents(chunks)
57
+
58
+ # Add new rows to DataChain
59
+ for chunk in chunks_embedded:
60
+ yield Chunk(
61
+ key=file.path,
62
+ text=chunk.text,
63
+ embeddings=chunk.embeddings,
64
+ )
65
+
66
+
67
+ dc = (
68
+ DataChain.from_storage(source)
69
+ .settings(parallel=-1)
70
+ .filter(C.file.path.glob("*.pdf"))
71
+ .gen(document=process_pdf)
72
+ )
73
+
74
+ dc.save("embedded-documents")
75
+
76
+ DataChain.from_dataset("embedded-documents").show()
@@ -1,6 +1,10 @@
1
- #
2
- # pip install unstructured[pdf] huggingface_hub[hf_transfer]
3
- #
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
4
8
  import os
5
9
 
6
10
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
@@ -1,4 +1,10 @@
1
- # pip install scipy torch transformers huggingface_hub[hf_transfer]
1
+ """
2
+ To install the required dependencies:
3
+
4
+ pip install datachain[examples]
5
+
6
+ """
7
+
2
8
  # NOTE: also need to install ffmpeg binary
3
9
  import json
4
10
  import os
@@ -1,5 +1,3 @@
1
- # pip install Pillow
2
-
3
1
  import base64
4
2
  import os
5
3
 
@@ -57,8 +57,8 @@ def lint(session: nox.Session) -> None:
57
57
 
58
58
  @nox.session
59
59
  def build(session: nox.Session) -> None:
60
- session.install("build", "twine", "uv")
61
- session.run("python", "-m", "build", "--installer", "uv")
60
+ session.install("twine", "uv")
61
+ session.run("uv", "build")
62
62
  dists = glob.glob("dist/*")
63
63
  session.run("twine", "check", *dists, silent=True)
64
64
 
@@ -37,7 +37,6 @@ dependencies = [
37
37
  "shtab>=1.3.4,<2",
38
38
  "sqlalchemy>=2",
39
39
  "multiprocess==0.70.16",
40
- "dill==0.3.8",
41
40
  "cloudpickle",
42
41
  "orjson>=3.10.5",
43
42
  "pydantic>=2,<3",
@@ -46,7 +45,8 @@ dependencies = [
46
45
  "Pillow>=10.0.0,<11",
47
46
  "msgpack>=1.0.4,<2",
48
47
  "psutil",
49
- "huggingface_hub"
48
+ "huggingface_hub",
49
+ "iterative-telemetry>=0.0.9"
50
50
  ]
51
51
 
52
52
  [project.optional-dependencies]
@@ -80,7 +80,7 @@ tests = [
80
80
  "pytest-sugar>=0.9.6",
81
81
  "pytest-cov>=4.1.0",
82
82
  "pytest-mock>=3.12.0",
83
- "pytest-servers[all]>=0.5.5",
83
+ "pytest-servers[all]>=0.5.7",
84
84
  "pytest-benchmark[histogram]",
85
85
  "pytest-xdist>=3.3.1",
86
86
  "virtualenv",
@@ -104,9 +104,10 @@ examples = [
104
104
  "numpy>=1,<2",
105
105
  "defusedxml",
106
106
  "accelerate",
107
- "unstructured[pdf]",
107
+ "unstructured[pdf, embed-huggingface]",
108
108
  "pdfplumber==0.11.4",
109
- "huggingface_hub[hf_transfer]"
109
+ "huggingface_hub[hf_transfer]",
110
+ "onnx==1.16.1"
110
111
  ]
111
112
 
112
113
  [project.urls]
@@ -1,21 +1,23 @@
1
1
  from datachain.lib.data_model import DataModel, DataType, is_chain_type
2
2
  from datachain.lib.dc import C, Column, DataChain, Sys
3
3
  from datachain.lib.file import (
4
+ ArrowRow,
4
5
  File,
5
6
  FileError,
6
7
  ImageFile,
7
- IndexedFile,
8
8
  TarVFile,
9
9
  TextFile,
10
10
  )
11
11
  from datachain.lib.model_store import ModelStore
12
12
  from datachain.lib.udf import Aggregator, Generator, Mapper
13
13
  from datachain.lib.utils import AbstractUDF, DataChainError
14
+ from datachain.query import metrics, param
14
15
  from datachain.query.session import Session
15
16
 
16
17
  __all__ = [
17
18
  "AbstractUDF",
18
19
  "Aggregator",
20
+ "ArrowRow",
19
21
  "C",
20
22
  "Column",
21
23
  "DataChain",
@@ -26,7 +28,6 @@ __all__ = [
26
28
  "FileError",
27
29
  "Generator",
28
30
  "ImageFile",
29
- "IndexedFile",
30
31
  "Mapper",
31
32
  "ModelStore",
32
33
  "Session",
@@ -34,4 +35,6 @@ __all__ = [
34
35
  "TarVFile",
35
36
  "TextFile",
36
37
  "is_chain_type",
38
+ "metrics",
39
+ "param",
37
40
  ]
@@ -1,56 +1,15 @@
1
- import hashlib
2
- import json
3
1
  import os
4
- from datetime import datetime
5
- from functools import partial
6
2
  from typing import TYPE_CHECKING, Optional
7
3
 
8
- import attrs
9
4
  from dvc_data.hashfile.db.local import LocalHashFileDB
10
5
  from dvc_objects.fs.local import LocalFileSystem
11
6
  from fsspec.callbacks import Callback, TqdmCallback
12
7
 
13
- from datachain.utils import TIME_ZERO
14
-
15
8
  from .progress import Tqdm
16
9
 
17
10
  if TYPE_CHECKING:
18
11
  from datachain.client import Client
19
- from datachain.storage import StorageURI
20
-
21
- sha256 = partial(hashlib.sha256, usedforsecurity=False)
22
-
23
-
24
- @attrs.frozen
25
- class UniqueId:
26
- storage: "StorageURI"
27
- path: str
28
- size: int
29
- etag: str
30
- version: str = ""
31
- is_latest: bool = True
32
- location: Optional[str] = None
33
- last_modified: datetime = TIME_ZERO
34
-
35
- def get_parsed_location(self) -> Optional[dict]:
36
- if not self.location:
37
- return None
38
-
39
- loc_stack = (
40
- json.loads(self.location)
41
- if isinstance(self.location, str)
42
- else self.location
43
- )
44
- if len(loc_stack) > 1:
45
- raise NotImplementedError("Nested v-objects are not supported yet.")
46
-
47
- return loc_stack[0]
48
-
49
- def get_hash(self) -> str:
50
- fingerprint = f"{self.storage}/{self.path}/{self.version}/{self.etag}"
51
- if self.location:
52
- fingerprint += f"/{self.location}"
53
- return sha256(fingerprint.encode()).hexdigest()
12
+ from datachain.lib.file import File
54
13
 
55
14
 
56
15
  def try_scandir(path):
@@ -77,30 +36,30 @@ class DataChainCache:
77
36
  def tmp_dir(self):
78
37
  return self.odb.tmp_dir
79
38
 
80
- def get_path(self, uid: UniqueId) -> Optional[str]:
81
- if self.contains(uid):
82
- return self.path_from_checksum(uid.get_hash())
39
+ def get_path(self, file: "File") -> Optional[str]:
40
+ if self.contains(file):
41
+ return self.path_from_checksum(file.get_hash())
83
42
  return None
84
43
 
85
- def contains(self, uid: UniqueId) -> bool:
86
- return self.odb.exists(uid.get_hash())
44
+ def contains(self, file: "File") -> bool:
45
+ return self.odb.exists(file.get_hash())
87
46
 
88
47
  def path_from_checksum(self, checksum: str) -> str:
89
48
  assert checksum
90
49
  return self.odb.oid_to_path(checksum)
91
50
 
92
- def remove(self, uid: UniqueId) -> None:
93
- self.odb.delete(uid.get_hash())
51
+ def remove(self, file: "File") -> None:
52
+ self.odb.delete(file.get_hash())
94
53
 
95
54
  async def download(
96
- self, uid: UniqueId, client: "Client", callback: Optional[Callback] = None
55
+ self, file: "File", client: "Client", callback: Optional[Callback] = None
97
56
  ) -> None:
98
- from_path = f"{uid.storage}/{uid.path}"
57
+ from_path = f"{file.source}/{file.path}"
99
58
  from dvc_objects.fs.utils import tmp_fname
100
59
 
101
60
  odb_fs = self.odb.fs
102
61
  tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
103
- size = uid.size
62
+ size = file.size
104
63
  if size < 0:
105
64
  size = await client.get_size(from_path)
106
65
  cb = callback or TqdmCallback(
@@ -115,13 +74,13 @@ class DataChainCache:
115
74
  cb.close()
116
75
 
117
76
  try:
118
- oid = uid.get_hash()
77
+ oid = file.get_hash()
119
78
  self.odb.add(tmp_info, self.odb.fs, oid)
120
79
  finally:
121
80
  os.unlink(tmp_info)
122
81
 
123
- def store_data(self, uid: UniqueId, contents: bytes) -> None:
124
- checksum = uid.get_hash()
82
+ def store_data(self, file: "File", contents: bytes) -> None:
83
+ checksum = file.get_hash()
125
84
  dst = self.path_from_checksum(checksum)
126
85
  if not os.path.exists(dst):
127
86
  # Create the file only if it's not already in cache