datachain 0.3.3__tar.gz → 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (243) hide show
  1. {datachain-0.3.3 → datachain-0.3.5}/.github/workflows/tests-studio.yml +1 -1
  2. {datachain-0.3.3 → datachain-0.3.5}/.github/workflows/tests.yml +11 -3
  3. {datachain-0.3.3 → datachain-0.3.5}/.pre-commit-config.yaml +1 -1
  4. {datachain-0.3.3/src/datachain.egg-info → datachain-0.3.5}/PKG-INFO +3 -4
  5. {datachain-0.3.3 → datachain-0.3.5}/examples/get_started/torch-loader.py +1 -1
  6. {datachain-0.3.3 → datachain-0.3.5}/pyproject.toml +4 -5
  7. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/convert/flatten.py +0 -28
  8. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/dc.py +26 -12
  9. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/signal_schema.py +10 -5
  10. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/query/dataset.py +42 -22
  11. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/functions/string.py +12 -0
  12. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/sqlite/base.py +12 -0
  13. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/types.py +14 -8
  14. {datachain-0.3.3 → datachain-0.3.5/src/datachain.egg-info}/PKG-INFO +3 -4
  15. {datachain-0.3.3 → datachain-0.3.5}/src/datachain.egg-info/requires.txt +2 -3
  16. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_datachain.py +97 -0
  17. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_signal_schema.py +43 -2
  18. datachain-0.3.5/tests/unit/sql/test_string.py +48 -0
  19. datachain-0.3.3/tests/unit/sql/test_string.py +0 -23
  20. {datachain-0.3.3 → datachain-0.3.5}/.cruft.json +0 -0
  21. {datachain-0.3.3 → datachain-0.3.5}/.gitattributes +0 -0
  22. {datachain-0.3.3 → datachain-0.3.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  23. {datachain-0.3.3 → datachain-0.3.5}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  24. {datachain-0.3.3 → datachain-0.3.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  25. {datachain-0.3.3 → datachain-0.3.5}/.github/codecov.yaml +0 -0
  26. {datachain-0.3.3 → datachain-0.3.5}/.github/dependabot.yml +0 -0
  27. {datachain-0.3.3 → datachain-0.3.5}/.github/workflows/benchmarks.yml +0 -0
  28. {datachain-0.3.3 → datachain-0.3.5}/.github/workflows/release.yml +0 -0
  29. {datachain-0.3.3 → datachain-0.3.5}/.github/workflows/update-template.yaml +0 -0
  30. {datachain-0.3.3 → datachain-0.3.5}/.gitignore +0 -0
  31. {datachain-0.3.3 → datachain-0.3.5}/CODE_OF_CONDUCT.rst +0 -0
  32. {datachain-0.3.3 → datachain-0.3.5}/CONTRIBUTING.rst +0 -0
  33. {datachain-0.3.3 → datachain-0.3.5}/LICENSE +0 -0
  34. {datachain-0.3.3 → datachain-0.3.5}/README.rst +0 -0
  35. {datachain-0.3.3 → datachain-0.3.5}/docs/assets/captioned_cartoons.png +0 -0
  36. {datachain-0.3.3 → datachain-0.3.5}/docs/assets/datachain.png +0 -0
  37. {datachain-0.3.3 → datachain-0.3.5}/docs/assets/flowchart.png +0 -0
  38. {datachain-0.3.3 → datachain-0.3.5}/docs/index.md +0 -0
  39. {datachain-0.3.3 → datachain-0.3.5}/docs/references/datachain.md +0 -0
  40. {datachain-0.3.3 → datachain-0.3.5}/docs/references/datatype.md +0 -0
  41. {datachain-0.3.3 → datachain-0.3.5}/docs/references/file.md +0 -0
  42. {datachain-0.3.3 → datachain-0.3.5}/docs/references/index.md +0 -0
  43. {datachain-0.3.3 → datachain-0.3.5}/docs/references/sql.md +0 -0
  44. {datachain-0.3.3 → datachain-0.3.5}/docs/references/torch.md +0 -0
  45. {datachain-0.3.3 → datachain-0.3.5}/docs/references/udf.md +0 -0
  46. {datachain-0.3.3 → datachain-0.3.5}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
  47. {datachain-0.3.3 → datachain-0.3.5}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  48. {datachain-0.3.3 → datachain-0.3.5}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  49. {datachain-0.3.3 → datachain-0.3.5}/examples/computer_vision/openimage-detect.py +0 -0
  50. {datachain-0.3.3 → datachain-0.3.5}/examples/get_started/common_sql_functions.py +0 -0
  51. {datachain-0.3.3 → datachain-0.3.5}/examples/get_started/json-csv-reader.py +0 -0
  52. {datachain-0.3.3 → datachain-0.3.5}/examples/get_started/udfs/parallel.py +0 -0
  53. {datachain-0.3.3 → datachain-0.3.5}/examples/get_started/udfs/simple.py +0 -0
  54. {datachain-0.3.3 → datachain-0.3.5}/examples/get_started/udfs/stateful.py +0 -0
  55. {datachain-0.3.3 → datachain-0.3.5}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
  56. {datachain-0.3.3 → datachain-0.3.5}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
  57. {datachain-0.3.3 → datachain-0.3.5}/examples/llm_and_nlp/llm-claude.py +0 -0
  58. {datachain-0.3.3 → datachain-0.3.5}/examples/llm_and_nlp/unstructured-text.py +0 -0
  59. {datachain-0.3.3 → datachain-0.3.5}/examples/multimodal/clip_inference.py +0 -0
  60. {datachain-0.3.3 → datachain-0.3.5}/examples/multimodal/hf_pipeline.py +0 -0
  61. {datachain-0.3.3 → datachain-0.3.5}/examples/multimodal/openai_image_desc_lib.py +0 -0
  62. {datachain-0.3.3 → datachain-0.3.5}/examples/multimodal/wds.py +0 -0
  63. {datachain-0.3.3 → datachain-0.3.5}/examples/multimodal/wds_filtered.py +0 -0
  64. {datachain-0.3.3 → datachain-0.3.5}/mkdocs.yml +0 -0
  65. {datachain-0.3.3 → datachain-0.3.5}/noxfile.py +0 -0
  66. {datachain-0.3.3 → datachain-0.3.5}/setup.cfg +0 -0
  67. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/__init__.py +0 -0
  68. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/__main__.py +0 -0
  69. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/asyn.py +0 -0
  70. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/cache.py +0 -0
  71. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/catalog/__init__.py +0 -0
  72. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/catalog/catalog.py +0 -0
  73. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/catalog/datasource.py +0 -0
  74. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/catalog/loader.py +0 -0
  75. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/catalog/subclass.py +0 -0
  76. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/cli.py +0 -0
  77. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/cli_utils.py +0 -0
  78. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/client/__init__.py +0 -0
  79. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/client/azure.py +0 -0
  80. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/client/fileslice.py +0 -0
  81. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/client/fsspec.py +0 -0
  82. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/client/gcs.py +0 -0
  83. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/client/local.py +0 -0
  84. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/client/s3.py +0 -0
  85. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/config.py +0 -0
  86. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/data_storage/__init__.py +0 -0
  87. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/data_storage/db_engine.py +0 -0
  88. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/data_storage/id_generator.py +0 -0
  89. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/data_storage/job.py +0 -0
  90. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/data_storage/metastore.py +0 -0
  91. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/data_storage/schema.py +0 -0
  92. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/data_storage/serializer.py +0 -0
  93. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/data_storage/sqlite.py +0 -0
  94. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/data_storage/warehouse.py +0 -0
  95. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/dataset.py +0 -0
  96. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/error.py +0 -0
  97. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/job.py +0 -0
  98. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/__init__.py +0 -0
  99. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/arrow.py +0 -0
  100. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/clip.py +0 -0
  101. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/convert/__init__.py +0 -0
  102. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/convert/python_to_sql.py +0 -0
  103. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/convert/sql_to_python.py +0 -0
  104. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/convert/unflatten.py +0 -0
  105. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  106. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/data_model.py +0 -0
  107. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/dataset_info.py +0 -0
  108. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/file.py +0 -0
  109. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/image.py +0 -0
  110. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/listing.py +0 -0
  111. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/meta_formats.py +0 -0
  112. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/model_store.py +0 -0
  113. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/pytorch.py +0 -0
  114. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/settings.py +0 -0
  115. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/text.py +0 -0
  116. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/udf.py +0 -0
  117. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/udf_signature.py +0 -0
  118. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/utils.py +0 -0
  119. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/vfile.py +0 -0
  120. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/webdataset.py +0 -0
  121. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/lib/webdataset_laion.py +0 -0
  122. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/listing.py +0 -0
  123. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/node.py +0 -0
  124. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/nodes_fetcher.py +0 -0
  125. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/nodes_thread_pool.py +0 -0
  126. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/progress.py +0 -0
  127. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/py.typed +0 -0
  128. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/query/__init__.py +0 -0
  129. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/query/batch.py +0 -0
  130. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/query/builtins.py +0 -0
  131. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/query/dispatch.py +0 -0
  132. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/query/metrics.py +0 -0
  133. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/query/params.py +0 -0
  134. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/query/queue.py +0 -0
  135. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/query/schema.py +0 -0
  136. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/query/session.py +0 -0
  137. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/query/udf.py +0 -0
  138. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/remote/__init__.py +0 -0
  139. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/remote/studio.py +0 -0
  140. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/__init__.py +0 -0
  141. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/default/__init__.py +0 -0
  142. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/default/base.py +0 -0
  143. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/functions/__init__.py +0 -0
  144. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/functions/array.py +0 -0
  145. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/functions/conditional.py +0 -0
  146. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/functions/path.py +0 -0
  147. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/functions/random.py +0 -0
  148. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/selectable.py +0 -0
  149. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/sqlite/__init__.py +0 -0
  150. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/sqlite/types.py +0 -0
  151. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/sqlite/vector.py +0 -0
  152. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/sql/utils.py +0 -0
  153. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/storage.py +0 -0
  154. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/torch/__init__.py +0 -0
  155. {datachain-0.3.3 → datachain-0.3.5}/src/datachain/utils.py +0 -0
  156. {datachain-0.3.3 → datachain-0.3.5}/src/datachain.egg-info/SOURCES.txt +0 -0
  157. {datachain-0.3.3 → datachain-0.3.5}/src/datachain.egg-info/dependency_links.txt +0 -0
  158. {datachain-0.3.3 → datachain-0.3.5}/src/datachain.egg-info/entry_points.txt +0 -0
  159. {datachain-0.3.3 → datachain-0.3.5}/src/datachain.egg-info/top_level.txt +0 -0
  160. {datachain-0.3.3 → datachain-0.3.5}/tests/__init__.py +0 -0
  161. {datachain-0.3.3 → datachain-0.3.5}/tests/benchmarks/__init__.py +0 -0
  162. {datachain-0.3.3 → datachain-0.3.5}/tests/benchmarks/conftest.py +0 -0
  163. {datachain-0.3.3 → datachain-0.3.5}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  164. {datachain-0.3.3 → datachain-0.3.5}/tests/benchmarks/datasets/.dvc/config +0 -0
  165. {datachain-0.3.3 → datachain-0.3.5}/tests/benchmarks/datasets/.gitignore +0 -0
  166. {datachain-0.3.3 → datachain-0.3.5}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  167. {datachain-0.3.3 → datachain-0.3.5}/tests/benchmarks/test_datachain.py +0 -0
  168. {datachain-0.3.3 → datachain-0.3.5}/tests/benchmarks/test_ls.py +0 -0
  169. {datachain-0.3.3 → datachain-0.3.5}/tests/benchmarks/test_version.py +0 -0
  170. {datachain-0.3.3 → datachain-0.3.5}/tests/conftest.py +0 -0
  171. {datachain-0.3.3 → datachain-0.3.5}/tests/data.py +0 -0
  172. {datachain-0.3.3 → datachain-0.3.5}/tests/examples/__init__.py +0 -0
  173. {datachain-0.3.3 → datachain-0.3.5}/tests/examples/test_examples.py +0 -0
  174. {datachain-0.3.3 → datachain-0.3.5}/tests/examples/test_wds_e2e.py +0 -0
  175. {datachain-0.3.3 → datachain-0.3.5}/tests/examples/wds_data.py +0 -0
  176. {datachain-0.3.3 → datachain-0.3.5}/tests/func/__init__.py +0 -0
  177. {datachain-0.3.3 → datachain-0.3.5}/tests/func/test_catalog.py +0 -0
  178. {datachain-0.3.3 → datachain-0.3.5}/tests/func/test_client.py +0 -0
  179. {datachain-0.3.3 → datachain-0.3.5}/tests/func/test_datachain.py +0 -0
  180. {datachain-0.3.3 → datachain-0.3.5}/tests/func/test_dataset_query.py +0 -0
  181. {datachain-0.3.3 → datachain-0.3.5}/tests/func/test_datasets.py +0 -0
  182. {datachain-0.3.3 → datachain-0.3.5}/tests/func/test_feature_pickling.py +0 -0
  183. {datachain-0.3.3 → datachain-0.3.5}/tests/func/test_listing.py +0 -0
  184. {datachain-0.3.3 → datachain-0.3.5}/tests/func/test_ls.py +0 -0
  185. {datachain-0.3.3 → datachain-0.3.5}/tests/func/test_pull.py +0 -0
  186. {datachain-0.3.3 → datachain-0.3.5}/tests/func/test_pytorch.py +0 -0
  187. {datachain-0.3.3 → datachain-0.3.5}/tests/func/test_query.py +0 -0
  188. {datachain-0.3.3 → datachain-0.3.5}/tests/scripts/feature_class.py +0 -0
  189. {datachain-0.3.3 → datachain-0.3.5}/tests/scripts/feature_class_parallel.py +0 -0
  190. {datachain-0.3.3 → datachain-0.3.5}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  191. {datachain-0.3.3 → datachain-0.3.5}/tests/scripts/name_len_slow.py +0 -0
  192. {datachain-0.3.3 → datachain-0.3.5}/tests/test_cli_e2e.py +0 -0
  193. {datachain-0.3.3 → datachain-0.3.5}/tests/test_query_e2e.py +0 -0
  194. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/__init__.py +0 -0
  195. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/__init__.py +0 -0
  196. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/conftest.py +0 -0
  197. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_arrow.py +0 -0
  198. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_clip.py +0 -0
  199. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  200. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_datachain_merge.py +0 -0
  201. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_feature.py +0 -0
  202. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_feature_utils.py +0 -0
  203. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_file.py +0 -0
  204. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_image.py +0 -0
  205. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_schema.py +0 -0
  206. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_sql_to_python.py +0 -0
  207. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_text.py +0 -0
  208. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_udf_signature.py +0 -0
  209. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_utils.py +0 -0
  210. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/lib/test_webdataset.py +0 -0
  211. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/sql/__init__.py +0 -0
  212. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/sql/sqlite/__init__.py +0 -0
  213. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/sql/sqlite/test_utils.py +0 -0
  214. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/sql/test_array.py +0 -0
  215. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/sql/test_conditional.py +0 -0
  216. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/sql/test_path.py +0 -0
  217. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/sql/test_random.py +0 -0
  218. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/sql/test_selectable.py +0 -0
  219. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_asyn.py +0 -0
  220. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_cache.py +0 -0
  221. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_catalog.py +0 -0
  222. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_catalog_loader.py +0 -0
  223. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_cli_parsing.py +0 -0
  224. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_client.py +0 -0
  225. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_client_s3.py +0 -0
  226. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_data_storage.py +0 -0
  227. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_database_engine.py +0 -0
  228. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_dataset.py +0 -0
  229. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_dispatch.py +0 -0
  230. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_fileslice.py +0 -0
  231. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_id_generator.py +0 -0
  232. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_listing.py +0 -0
  233. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_metastore.py +0 -0
  234. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_module_exports.py +0 -0
  235. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_query_metrics.py +0 -0
  236. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_query_params.py +0 -0
  237. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_serializer.py +0 -0
  238. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_session.py +0 -0
  239. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_storage.py +0 -0
  240. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_udf.py +0 -0
  241. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_utils.py +0 -0
  242. {datachain-0.3.3 → datachain-0.3.5}/tests/unit/test_warehouse.py +0 -0
  243. {datachain-0.3.3 → datachain-0.3.5}/tests/utils.py +0 -0
@@ -17,7 +17,7 @@ concurrency:
17
17
  jobs:
18
18
  studio:
19
19
  if: '!github.event.pull_request.head.repo.fork'
20
- runs-on: ubuntu-latest-16-cores
20
+ runs-on: ubuntu-latest
21
21
  strategy:
22
22
  matrix:
23
23
  pyv: ['3.12']
@@ -62,9 +62,9 @@ jobs:
62
62
  pyv: '3.9'
63
63
  - os: macos-latest
64
64
  pyv: '3.12'
65
- - os: windows-latest-8-cores
65
+ - os: windows-latest
66
66
  pyv: '3.9'
67
- - os: windows-latest-8-cores
67
+ - os: windows-latest
68
68
  pyv: '3.12'
69
69
 
70
70
  steps:
@@ -116,9 +116,17 @@ jobs:
116
116
  strategy:
117
117
  fail-fast: false
118
118
  matrix:
119
- os: [ubuntu-latest-16-cores, macos-latest, windows-latest-8-cores]
119
+ os: [ubuntu-latest, macos-latest, windows-latest]
120
120
  pyv: ['3.9', '3.12']
121
121
  group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
122
+ exclude:
123
+ - {os: ubuntu-latest, pyv: '3.9', group: 'multimodal'}
124
+ - {os: ubuntu-latest, pyv: '3.12', group: 'multimodal'}
125
+ include:
126
+ - {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
127
+ - {os: ubuntu-latest-4-cores, pyv: "3.12", group: multimodal}
128
+
129
+
122
130
  steps:
123
131
  - uses: actions/checkout@v4
124
132
 
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.5.7'
27
+ rev: 'v0.6.1'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.3
3
+ Version: 0.3.5
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -41,6 +41,7 @@ Requires-Dist: pydantic<3,>=2
41
41
  Requires-Dist: jmespath>=1.0
42
42
  Requires-Dist: datamodel-code-generator>=0.25
43
43
  Requires-Dist: Pillow<11,>=10.0.0
44
+ Requires-Dist: msgpack<2,>=1.0.4
44
45
  Provides-Extra: docs
45
46
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
46
47
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -54,7 +55,6 @@ Requires-Dist: torchvision; extra == "torch"
54
55
  Requires-Dist: transformers>=4.36.0; extra == "torch"
55
56
  Provides-Extra: remote
56
57
  Requires-Dist: lz4; extra == "remote"
57
- Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
58
58
  Requires-Dist: requests>=2.22.0; extra == "remote"
59
59
  Provides-Extra: vector
60
60
  Requires-Dist: usearch; extra == "vector"
@@ -87,9 +87,8 @@ Requires-Dist: numpy<2,>=1; extra == "examples"
87
87
  Requires-Dist: defusedxml; extra == "examples"
88
88
  Requires-Dist: accelerate; extra == "examples"
89
89
  Requires-Dist: unstructured[pdf]; extra == "examples"
90
- Requires-Dist: pdfplumber==0.11.3; extra == "examples"
90
+ Requires-Dist: pdfplumber==0.11.4; extra == "examples"
91
91
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
92
- Requires-Dist: nltk==3.8.1; extra == "examples"
93
92
 
94
93
  |PyPI| |Python Version| |Codecov| |Tests|
95
94
 
@@ -81,6 +81,6 @@ if __name__ == "__main__":
81
81
  loss.backward()
82
82
  optimizer.step()
83
83
 
84
- print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, loss.item()))
84
+ print(f"[{epoch + 1}, {i + 1:5d}] loss: {loss.item():.3f}")
85
85
 
86
86
  print("Finished Training")
@@ -43,7 +43,8 @@ dependencies = [
43
43
  "pydantic>=2,<3",
44
44
  "jmespath>=1.0",
45
45
  "datamodel-code-generator>=0.25",
46
- "Pillow>=10.0.0,<11"
46
+ "Pillow>=10.0.0,<11",
47
+ "msgpack>=1.0.4,<2"
47
48
  ]
48
49
 
49
50
  [project.optional-dependencies]
@@ -62,7 +63,6 @@ torch = [
62
63
  ]
63
64
  remote = [
64
65
  "lz4",
65
- "msgpack>=1.0.4,<2",
66
66
  "requests>=2.22.0"
67
67
  ]
68
68
  vector = [
@@ -99,9 +99,8 @@ examples = [
99
99
  "defusedxml",
100
100
  "accelerate",
101
101
  "unstructured[pdf]",
102
- "pdfplumber==0.11.3",
103
- "huggingface_hub[hf_transfer]",
104
- "nltk==3.8.1"
102
+ "pdfplumber==0.11.4",
103
+ "huggingface_hub[hf_transfer]"
105
104
  ]
106
105
 
107
106
  [project.urls]
@@ -1,34 +1,6 @@
1
- from datetime import datetime
2
-
3
1
  from pydantic import BaseModel
4
2
 
5
3
  from datachain.lib.model_store import ModelStore
6
- from datachain.sql.types import (
7
- JSON,
8
- Array,
9
- Binary,
10
- Boolean,
11
- DateTime,
12
- Float,
13
- Int,
14
- Int32,
15
- Int64,
16
- NullType,
17
- String,
18
- )
19
-
20
- DATACHAIN_TO_TYPE = {
21
- Int: int,
22
- Int32: int,
23
- Int64: int,
24
- String: str,
25
- Float: float,
26
- Boolean: bool,
27
- DateTime: datetime,
28
- Binary: bytes,
29
- Array(NullType): list,
30
- JSON: dict,
31
- }
32
4
 
33
5
 
34
6
  def flatten(obj: BaseModel):
@@ -1224,14 +1224,11 @@ class DataChain(DatasetQuery):
1224
1224
  """
1225
1225
  headers, max_length = self._effective_signals_schema.get_headers_with_length()
1226
1226
  if flatten or max_length < 2:
1227
- columns = []
1228
- if headers:
1229
- columns = [".".join(filter(None, header)) for header in headers]
1230
- return pd.DataFrame.from_records(self.to_records(), columns=columns)
1227
+ columns = [".".join(filter(None, header)) for header in headers]
1228
+ else:
1229
+ columns = pd.MultiIndex.from_tuples(map(tuple, headers))
1231
1230
 
1232
- return pd.DataFrame(
1233
- self.results(), columns=pd.MultiIndex.from_tuples(map(tuple, headers))
1234
- )
1231
+ return pd.DataFrame.from_records(self.results(), columns=columns)
1235
1232
 
1236
1233
  def show(
1237
1234
  self,
@@ -1524,6 +1521,7 @@ class DataChain(DatasetQuery):
1524
1521
  to_insert: Optional[Union[dict, list[dict]]],
1525
1522
  session: Optional[Session] = None,
1526
1523
  in_memory: bool = False,
1524
+ schema: Optional[dict[str, DataType]] = None,
1527
1525
  ) -> "DataChain":
1528
1526
  """Create a DataChain from the provided records. This method can be used for
1529
1527
  programmatically generating a chain in contrast of reading data from storages
@@ -1532,10 +1530,10 @@ class DataChain(DatasetQuery):
1532
1530
  Parameters:
1533
1531
  to_insert : records (or a single record) to insert. Each record is
1534
1532
  a dictionary of signals and theirs values.
1533
+ schema : describes chain signals and their corresponding types
1535
1534
 
1536
1535
  Example:
1537
1536
  ```py
1538
- empty = DataChain.from_records()
1539
1537
  single_record = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD)
1540
1538
  ```
1541
1539
  """
@@ -1543,11 +1541,27 @@ class DataChain(DatasetQuery):
1543
1541
  catalog = session.catalog
1544
1542
 
1545
1543
  name = session.generate_temp_dataset_name()
1546
- columns: tuple[sqlalchemy.Column[Any], ...] = tuple(
1547
- sqlalchemy.Column(name, typ)
1548
- for name, typ in File._datachain_column_types.items()
1544
+ signal_schema = None
1545
+ columns: list[sqlalchemy.Column] = []
1546
+
1547
+ if schema:
1548
+ signal_schema = SignalSchema(schema)
1549
+ columns = signal_schema.db_signals(as_columns=True) # type: ignore[assignment]
1550
+ else:
1551
+ columns = [
1552
+ sqlalchemy.Column(name, typ)
1553
+ for name, typ in File._datachain_column_types.items()
1554
+ ]
1555
+
1556
+ dsr = catalog.create_dataset(
1557
+ name,
1558
+ columns=columns,
1559
+ feature_schema=(
1560
+ signal_schema.clone_without_sys_signals().serialize()
1561
+ if signal_schema
1562
+ else None
1563
+ ),
1549
1564
  )
1550
- dsr = catalog.create_dataset(name, columns=columns)
1551
1565
 
1552
1566
  if isinstance(to_insert, dict):
1553
1567
  to_insert = [to_insert]
@@ -2,6 +2,7 @@ import copy
2
2
  from collections.abc import Iterator, Sequence
3
3
  from dataclasses import dataclass
4
4
  from datetime import datetime
5
+ from inspect import isclass
5
6
  from typing import (
6
7
  TYPE_CHECKING,
7
8
  Annotated,
@@ -14,10 +15,10 @@ from typing import (
14
15
  get_origin,
15
16
  )
16
17
 
18
+ import sqlalchemy as sa
17
19
  from pydantic import BaseModel, create_model
18
20
  from typing_extensions import Literal as LiteralEx
19
21
 
20
- from datachain.lib.convert.flatten import DATACHAIN_TO_TYPE
21
22
  from datachain.lib.convert.python_to_sql import python_to_sql
22
23
  from datachain.lib.convert.sql_to_python import sql_to_python
23
24
  from datachain.lib.convert.unflatten import unflatten_to_json_pos
@@ -26,6 +27,7 @@ from datachain.lib.file import File
26
27
  from datachain.lib.model_store import ModelStore
27
28
  from datachain.lib.utils import DataChainParamsError
28
29
  from datachain.query.schema import DEFAULT_DELIMITER, Column
30
+ from datachain.sql.types import SQLType
29
31
 
30
32
  if TYPE_CHECKING:
31
33
  from datachain.catalog import Catalog
@@ -104,12 +106,15 @@ class SignalSchema:
104
106
  def from_column_types(col_types: dict[str, Any]) -> "SignalSchema":
105
107
  signals: dict[str, DataType] = {}
106
108
  for field, col_type in col_types.items():
107
- if (py_type := DATACHAIN_TO_TYPE.get(col_type, None)) is None:
109
+ if isinstance(col_type, SQLType):
110
+ signals[field] = col_type.python_type
111
+ elif isclass(col_type) and issubclass(col_type, SQLType):
112
+ signals[field] = col_type().python_type
113
+ else:
108
114
  raise SignalSchemaError(
109
115
  f"signal schema cannot be obtained for column '{field}':"
110
- f" unsupported type '{py_type}'"
116
+ f" unsupported type '{col_type}'"
111
117
  )
112
- signals[field] = py_type
113
118
  return SignalSchema(signals)
114
119
 
115
120
  def serialize(self) -> dict[str, str]:
@@ -232,7 +237,7 @@ class SignalSchema:
232
237
  signals = [
233
238
  DEFAULT_DELIMITER.join(path)
234
239
  if not as_columns
235
- else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
240
+ else sa.Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
236
241
  for path, _type, has_subtree, _ in self.get_flat_tree()
237
242
  if not has_subtree
238
243
  ]
@@ -878,17 +878,14 @@ class SQLUnion(Step):
878
878
  temp_tables.extend(self.query1.temp_table_names)
879
879
  q2 = self.query2.apply_steps().select().subquery()
880
880
  temp_tables.extend(self.query2.temp_table_names)
881
- columns1, columns2 = fill_columns(q1.columns, q2.columns)
881
+
882
+ columns1, columns2 = _order_columns(q1.columns, q2.columns)
882
883
 
883
884
  def q(*columns):
884
885
  names = {c.name for c in columns}
885
886
  col1 = [c for c in columns1 if c.name in names]
886
887
  col2 = [c for c in columns2 if c.name in names]
887
- res = (
888
- sqlalchemy.select(*col1)
889
- .select_from(q1)
890
- .union_all(sqlalchemy.select(*col2).select_from(q2))
891
- )
888
+ res = sqlalchemy.select(*col1).union_all(sqlalchemy.select(*col2))
892
889
 
893
890
  subquery = res.subquery()
894
891
  return sqlalchemy.select(*subquery.c).select_from(subquery)
@@ -1021,23 +1018,46 @@ class GroupBy(Step):
1021
1018
  return step_result(q, grouped_query.selected_columns)
1022
1019
 
1023
1020
 
1024
- def fill_columns(
1025
- *column_iterables: Iterable[ColumnElement],
1021
+ def _validate_columns(
1022
+ left_columns: Iterable[ColumnElement], right_columns: Iterable[ColumnElement]
1023
+ ) -> set[str]:
1024
+ left_names = {c.name for c in left_columns}
1025
+ right_names = {c.name for c in right_columns}
1026
+
1027
+ if left_names == right_names:
1028
+ return left_names
1029
+
1030
+ missing_right = left_names - right_names
1031
+ missing_left = right_names - left_names
1032
+
1033
+ def _prepare_msg_part(missing_columns: set[str], side: str) -> str:
1034
+ return f"{', '.join(sorted(missing_columns))} only present in {side}"
1035
+
1036
+ msg_parts = [
1037
+ _prepare_msg_part(missing_columns, found_side)
1038
+ for missing_columns, found_side in zip(
1039
+ [
1040
+ missing_right,
1041
+ missing_left,
1042
+ ],
1043
+ ["left", "right"],
1044
+ )
1045
+ if missing_columns
1046
+ ]
1047
+ msg = f"Cannot perform union. {'. '.join(msg_parts)}"
1048
+
1049
+ raise ValueError(msg)
1050
+
1051
+
1052
+ def _order_columns(
1053
+ left_columns: Iterable[ColumnElement], right_columns: Iterable[ColumnElement]
1026
1054
  ) -> list[list[ColumnElement]]:
1027
- column_dicts = [{c.name: c for c in columns} for columns in column_iterables]
1028
- combined_columns = {n: c for col_dict in column_dicts for n, c in col_dict.items()}
1029
-
1030
- result: list[list[ColumnElement]] = [[] for _ in column_dicts]
1031
- for n in combined_columns:
1032
- col = next(col_dict[n] for col_dict in column_dicts if n in col_dict)
1033
- for col_dict, out in zip(column_dicts, result):
1034
- if n in col_dict:
1035
- out.append(col_dict[n])
1036
- else:
1037
- # Cast the NULL to ensure all columns are aware of their type
1038
- # Label it to ensure it's aware of its name
1039
- out.append(sqlalchemy.cast(sqlalchemy.null(), col.type).label(n))
1040
- return result
1055
+ column_order = _validate_columns(left_columns, right_columns)
1056
+ column_dicts = [
1057
+ {c.name: c for c in columns} for columns in [left_columns, right_columns]
1058
+ ]
1059
+
1060
+ return [[d[n] for n in column_order] for d in column_dicts]
1041
1061
 
1042
1062
 
1043
1063
  @attrs.define
@@ -26,5 +26,17 @@ class split(GenericFunction): # noqa: N801
26
26
  inherit_cache = True
27
27
 
28
28
 
29
class regexp_replace(GenericFunction):  # noqa: N801
    """
    Replaces substrings that match a regular expression.

    Registered under the name ``regexp_replace`` in the ``string`` function
    package; the result of the expression is typed as a string.
    """

    type = String()
    package = "string"
    name = "regexp_replace"
    inherit_cache = True
38
+
39
+
29
40
  compiler_not_implemented(length)
30
41
  compiler_not_implemented(split)
42
+ compiler_not_implemented(regexp_replace)
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import re
2
3
  import sqlite3
3
4
  from collections.abc import Iterable
4
5
  from datetime import MAXYEAR, MINYEAR, datetime, timezone
@@ -77,6 +78,7 @@ def setup():
77
78
  compiles(array.length, "sqlite")(compile_array_length)
78
79
  compiles(string.length, "sqlite")(compile_string_length)
79
80
  compiles(string.split, "sqlite")(compile_string_split)
81
+ compiles(string.regexp_replace, "sqlite")(compile_regexp_replace)
80
82
  compiles(conditional.greatest, "sqlite")(compile_greatest)
81
83
  compiles(conditional.least, "sqlite")(compile_least)
82
84
  compiles(Values, "sqlite")(compile_values)
@@ -178,9 +180,15 @@ def register_user_defined_sql_functions() -> None:
178
180
 
179
181
  _registered_function_creators["vector_functions"] = create_vector_functions
180
182
 
183
def sqlite_regexp_replace(
    string: "str | None", pattern: "str | None", replacement: "str | None"
) -> "str | None":
    """
    Python implementation backing the SQLite ``regexp_replace`` function.

    Replaces every non-overlapping match of ``pattern`` in ``string`` with
    ``replacement`` using ``re.sub``, so ``replacement`` may contain
    backreferences such as ``\\1``.

    Args:
        string: Text to search, or None for SQL NULL.
        pattern: Regular expression in Python ``re`` syntax, or None.
        replacement: Replacement template, or None.

    Returns:
        The substituted text, or None (SQL NULL) if any argument is NULL.
    """
    # sqlite3 hands SQL NULL to the callback as None, and re.sub raises
    # TypeError on None. Propagate NULL instead of failing the whole query.
    if string is None or pattern is None or replacement is None:
        return None
    return re.sub(pattern, replacement, string)
185
+
181
186
def create_string_functions(conn):
    """
    Register the Python-backed string SQL functions on ``conn``.

    ``split`` is registered for both its 2- and 3-argument forms and
    ``regexp_replace`` for 3 arguments; every registration is flagged as
    deterministic.
    """
    registrations = (
        ("split", 2, sqlite_string_split),
        ("split", 3, sqlite_string_split),
        ("regexp_replace", 3, sqlite_regexp_replace),
    )
    for sql_name, n_args, implementation in registrations:
        conn.create_function(sql_name, n_args, implementation, deterministic=True)
184
192
 
185
193
  _registered_function_creators["string_functions"] = create_string_functions
186
194
 
@@ -265,6 +273,10 @@ def path_file_ext(path):
265
273
  return func.substr(path, func.length(path) - path_file_ext_length(path) + 1)
266
274
 
267
275
 
276
def compile_regexp_replace(element, compiler, **kwargs):
    """
    Render a ``regexp_replace`` expression as a ``regexp_replace(...)`` call.

    The argument list is whatever ``compiler.process`` produces for
    ``element.clauses``; ``kwargs`` are forwarded to the compiler unchanged.
    """
    rendered_args = compiler.process(element.clauses, **kwargs)
    return "regexp_replace(" + rendered_args + ")"
278
+
279
+
268
280
  def compile_path_parent(element, compiler, **kwargs):
269
281
  return compiler.process(path_parent(*element.clauses.clauses), **kwargs)
270
282
 
@@ -20,6 +20,8 @@ from typing import Any, Union
20
20
  import sqlalchemy as sa
21
21
  from sqlalchemy import TypeDecorator, types
22
22
 
23
+ from datachain.lib.data_model import StandardType
24
+
23
25
  _registry: dict[str, "TypeConverter"] = {}
24
26
  registry = MappingProxyType(_registry)
25
27
 
@@ -91,6 +93,10 @@ class SQLType(TypeDecorator):
91
93
  impl: type[types.TypeEngine[Any]] = types.TypeEngine
92
94
  cache_ok = True
93
95
 
96
@property
def python_type(self) -> StandardType:
    """
    Python type that values of this SQL type correspond to.

    The base class has no meaningful answer and always raises; the concrete
    types in this module (``String``, ``Boolean``, ``Int``, ``Float``, ...)
    override this property and return the matching Python type.

    Raises:
        NotImplementedError: Always, unless overridden by a subclass.
    """
    raise NotImplementedError
99
+
94
100
  def to_dict(self) -> dict[str, Any]:
95
101
  return {"type": self.__class__.__name__}
96
102
 
@@ -103,7 +109,7 @@ class String(SQLType):
103
109
  impl = types.String
104
110
 
105
111
  @property
106
- def python_type(self):
112
+ def python_type(self) -> StandardType:
107
113
  return str
108
114
 
109
115
  def load_dialect_impl(self, dialect):
@@ -125,7 +131,7 @@ class Boolean(SQLType):
125
131
  impl = types.Boolean
126
132
 
127
133
  @property
128
- def python_type(self):
134
+ def python_type(self) -> StandardType:
129
135
  return bool
130
136
 
131
137
  def load_dialect_impl(self, dialect):
@@ -147,7 +153,7 @@ class Int(SQLType):
147
153
  impl = types.INTEGER
148
154
 
149
155
  @property
150
- def python_type(self):
156
+ def python_type(self) -> StandardType:
151
157
  return int
152
158
 
153
159
  def load_dialect_impl(self, dialect):
@@ -217,7 +223,7 @@ class Float(SQLType):
217
223
  impl = types.FLOAT
218
224
 
219
225
  @property
220
- def python_type(self):
226
+ def python_type(self) -> StandardType:
221
227
  return float
222
228
 
223
229
  def load_dialect_impl(self, dialect):
@@ -271,7 +277,7 @@ class Array(SQLType):
271
277
  impl = types.ARRAY
272
278
 
273
279
  @property
274
- def python_type(self):
280
+ def python_type(self) -> StandardType:
275
281
  return list
276
282
 
277
283
  def load_dialect_impl(self, dialect):
@@ -314,7 +320,7 @@ class JSON(SQLType):
314
320
  impl = types.JSON
315
321
 
316
322
  @property
317
- def python_type(self):
323
+ def python_type(self) -> StandardType:
318
324
  return dict
319
325
 
320
326
  def load_dialect_impl(self, dialect):
@@ -336,7 +342,7 @@ class DateTime(SQLType):
336
342
  impl = types.DATETIME
337
343
 
338
344
  @property
339
- def python_type(self):
345
+ def python_type(self) -> StandardType:
340
346
  return datetime
341
347
 
342
348
  def load_dialect_impl(self, dialect):
@@ -358,7 +364,7 @@ class Binary(SQLType):
358
364
  impl = types.BINARY
359
365
 
360
366
  @property
361
- def python_type(self):
367
+ def python_type(self) -> StandardType:
362
368
  return bytes
363
369
 
364
370
  def load_dialect_impl(self, dialect):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.3
3
+ Version: 0.3.5
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -41,6 +41,7 @@ Requires-Dist: pydantic<3,>=2
41
41
  Requires-Dist: jmespath>=1.0
42
42
  Requires-Dist: datamodel-code-generator>=0.25
43
43
  Requires-Dist: Pillow<11,>=10.0.0
44
+ Requires-Dist: msgpack<2,>=1.0.4
44
45
  Provides-Extra: docs
45
46
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
46
47
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -54,7 +55,6 @@ Requires-Dist: torchvision; extra == "torch"
54
55
  Requires-Dist: transformers>=4.36.0; extra == "torch"
55
56
  Provides-Extra: remote
56
57
  Requires-Dist: lz4; extra == "remote"
57
- Requires-Dist: msgpack<2,>=1.0.4; extra == "remote"
58
58
  Requires-Dist: requests>=2.22.0; extra == "remote"
59
59
  Provides-Extra: vector
60
60
  Requires-Dist: usearch; extra == "vector"
@@ -87,9 +87,8 @@ Requires-Dist: numpy<2,>=1; extra == "examples"
87
87
  Requires-Dist: defusedxml; extra == "examples"
88
88
  Requires-Dist: accelerate; extra == "examples"
89
89
  Requires-Dist: unstructured[pdf]; extra == "examples"
90
- Requires-Dist: pdfplumber==0.11.3; extra == "examples"
90
+ Requires-Dist: pdfplumber==0.11.4; extra == "examples"
91
91
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
92
- Requires-Dist: nltk==3.8.1; extra == "examples"
93
92
 
94
93
  |PyPI| |Python Version| |Codecov| |Tests|
95
94
 
@@ -22,6 +22,7 @@ pydantic<3,>=2
22
22
  jmespath>=1.0
23
23
  datamodel-code-generator>=0.25
24
24
  Pillow<11,>=10.0.0
25
+ msgpack<2,>=1.0.4
25
26
 
26
27
  [:sys_platform == "win32"]
27
28
  numpy<2,>=1
@@ -48,13 +49,11 @@ numpy<2,>=1
48
49
  defusedxml
49
50
  accelerate
50
51
  unstructured[pdf]
51
- pdfplumber==0.11.3
52
+ pdfplumber==0.11.4
52
53
  huggingface_hub[hf_transfer]
53
- nltk==3.8.1
54
54
 
55
55
  [remote]
56
56
  lz4
57
- msgpack<2,>=1.0.4
58
57
  requests>=2.22.0
59
58
 
60
59
  [tests]