datachain 0.2.15__tar.gz → 0.2.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (256)
  1. {datachain-0.2.15/src/datachain.egg-info → datachain-0.2.16}/PKG-INFO +1 -1
  2. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/iptc_exif_xmp_lib.py +2 -1
  3. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/openimage-detect.py +1 -1
  4. {datachain-0.2.15 → datachain-0.2.16}/examples/get_started/json-csv-reader.py +4 -14
  5. {datachain-0.2.15 → datachain-0.2.16}/examples/get_started/torch-loader.py +1 -1
  6. {datachain-0.2.15 → datachain-0.2.16}/examples/multimodal/wds.py +20 -11
  7. {datachain-0.2.15 → datachain-0.2.16}/examples/multimodal/wds_filtered.py +1 -0
  8. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/catalog/catalog.py +5 -7
  9. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/cli.py +1 -1
  10. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/data_storage/metastore.py +2 -2
  11. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/data_storage/warehouse.py +4 -6
  12. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/dc.py +8 -20
  13. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/signal_schema.py +4 -1
  14. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/query/dataset.py +5 -2
  15. {datachain-0.2.15 → datachain-0.2.16/src/datachain.egg-info}/PKG-INFO +1 -1
  16. {datachain-0.2.15 → datachain-0.2.16}/tests/func/test_catalog.py +1 -1
  17. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_datachain.py +68 -3
  18. {datachain-0.2.15 → datachain-0.2.16}/.cruft.json +0 -0
  19. {datachain-0.2.15 → datachain-0.2.16}/.gitattributes +0 -0
  20. {datachain-0.2.15 → datachain-0.2.16}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  21. {datachain-0.2.15 → datachain-0.2.16}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  22. {datachain-0.2.15 → datachain-0.2.16}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  23. {datachain-0.2.15 → datachain-0.2.16}/.github/codecov.yaml +0 -0
  24. {datachain-0.2.15 → datachain-0.2.16}/.github/dependabot.yml +0 -0
  25. {datachain-0.2.15 → datachain-0.2.16}/.github/workflows/benchmarks.yml +0 -0
  26. {datachain-0.2.15 → datachain-0.2.16}/.github/workflows/release.yml +0 -0
  27. {datachain-0.2.15 → datachain-0.2.16}/.github/workflows/tests.yml +0 -0
  28. {datachain-0.2.15 → datachain-0.2.16}/.github/workflows/update-template.yaml +0 -0
  29. {datachain-0.2.15 → datachain-0.2.16}/.gitignore +0 -0
  30. {datachain-0.2.15 → datachain-0.2.16}/.pre-commit-config.yaml +0 -0
  31. {datachain-0.2.15 → datachain-0.2.16}/CODE_OF_CONDUCT.rst +0 -0
  32. {datachain-0.2.15 → datachain-0.2.16}/CONTRIBUTING.rst +0 -0
  33. {datachain-0.2.15 → datachain-0.2.16}/LICENSE +0 -0
  34. {datachain-0.2.15 → datachain-0.2.16}/README.rst +0 -0
  35. {datachain-0.2.15 → datachain-0.2.16}/docs/assets/captioned_cartoons.png +0 -0
  36. {datachain-0.2.15 → datachain-0.2.16}/docs/assets/datachain.png +0 -0
  37. {datachain-0.2.15 → datachain-0.2.16}/docs/assets/flowchart.png +0 -0
  38. {datachain-0.2.15 → datachain-0.2.16}/docs/index.md +0 -0
  39. {datachain-0.2.15 → datachain-0.2.16}/docs/references/datachain.md +0 -0
  40. {datachain-0.2.15 → datachain-0.2.16}/docs/references/datatype.md +0 -0
  41. {datachain-0.2.15 → datachain-0.2.16}/docs/references/file.md +0 -0
  42. {datachain-0.2.15 → datachain-0.2.16}/docs/references/index.md +0 -0
  43. {datachain-0.2.15 → datachain-0.2.16}/docs/references/sql.md +0 -0
  44. {datachain-0.2.15 → datachain-0.2.16}/docs/references/torch.md +0 -0
  45. {datachain-0.2.15 → datachain-0.2.16}/docs/references/udf.md +0 -0
  46. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/blip2_image_desc_lib.py +0 -0
  47. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/.gitignore +0 -0
  48. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/1-quick-start.ipynb +0 -0
  49. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/2-working-with-image-datachains.ipynb +0 -0
  50. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/3-train-model.ipynb +0 -0
  51. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/4-inference.ipynb +0 -0
  52. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/README.md +0 -0
  53. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/requirements.txt +0 -0
  54. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/scripts/1-quick-start.py +0 -0
  55. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/scripts/2-basic-operations.py +0 -0
  56. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/scripts/2-embeddings.py +0 -0
  57. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/scripts/3-split-train-test.py +0 -0
  58. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/scripts/3-train-model.py +0 -0
  59. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/src/clustering.py +0 -0
  60. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/src/train.py +0 -0
  61. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/static/images/basic-operations.png +0 -0
  62. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/static/images/core-concepts.png +0 -0
  63. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/static/images/datachain-logo.png +0 -0
  64. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/static/images/datachain-overview.png +0 -0
  65. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/static/images/dataset-1.png +0 -0
  66. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/static/images/dataset-2.png +0 -0
  67. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/static/images/dataset-3.png +0 -0
  68. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/fashion_product_images/static/images/studio.png +0 -0
  69. {datachain-0.2.15 → datachain-0.2.16}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  70. {datachain-0.2.15 → datachain-0.2.16}/examples/get_started/common_sql_functions.py +0 -0
  71. {datachain-0.2.15 → datachain-0.2.16}/examples/get_started/json-metadata-tutorial.ipynb +0 -0
  72. {datachain-0.2.15 → datachain-0.2.16}/examples/get_started/udfs/parallel.py +0 -0
  73. {datachain-0.2.15 → datachain-0.2.16}/examples/get_started/udfs/simple.py +0 -0
  74. {datachain-0.2.15 → datachain-0.2.16}/examples/get_started/udfs/stateful.py +0 -0
  75. {datachain-0.2.15 → datachain-0.2.16}/examples/llm/llm_chatbot_evaluation.ipynb +0 -0
  76. {datachain-0.2.15 → datachain-0.2.16}/examples/llm_and_nlp/llm-claude-aggregate-query.py +0 -0
  77. {datachain-0.2.15 → datachain-0.2.16}/examples/llm_and_nlp/llm-claude-simple-query.py +0 -0
  78. {datachain-0.2.15 → datachain-0.2.16}/examples/llm_and_nlp/llm-claude.py +0 -0
  79. {datachain-0.2.15 → datachain-0.2.16}/examples/llm_and_nlp/unstructured-text.py +0 -0
  80. {datachain-0.2.15 → datachain-0.2.16}/examples/multimodal/clip_fine_tuning.ipynb +0 -0
  81. {datachain-0.2.15 → datachain-0.2.16}/examples/multimodal/clip_inference.py +0 -0
  82. {datachain-0.2.15 → datachain-0.2.16}/examples/multimodal/hf_pipeline.py +0 -0
  83. {datachain-0.2.15 → datachain-0.2.16}/examples/multimodal/openai_image_desc_lib.py +0 -0
  84. {datachain-0.2.15 → datachain-0.2.16}/mkdocs.yml +0 -0
  85. {datachain-0.2.15 → datachain-0.2.16}/noxfile.py +0 -0
  86. {datachain-0.2.15 → datachain-0.2.16}/pyproject.toml +0 -0
  87. {datachain-0.2.15 → datachain-0.2.16}/setup.cfg +0 -0
  88. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/__init__.py +0 -0
  89. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/__main__.py +0 -0
  90. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/asyn.py +0 -0
  91. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/cache.py +0 -0
  92. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/catalog/__init__.py +0 -0
  93. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/catalog/datasource.py +0 -0
  94. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/catalog/loader.py +0 -0
  95. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/catalog/subclass.py +0 -0
  96. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/cli_utils.py +0 -0
  97. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/client/__init__.py +0 -0
  98. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/client/azure.py +0 -0
  99. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/client/fileslice.py +0 -0
  100. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/client/fsspec.py +0 -0
  101. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/client/gcs.py +0 -0
  102. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/client/local.py +0 -0
  103. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/client/s3.py +0 -0
  104. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/config.py +0 -0
  105. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/data_storage/__init__.py +0 -0
  106. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/data_storage/db_engine.py +0 -0
  107. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/data_storage/id_generator.py +0 -0
  108. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/data_storage/job.py +0 -0
  109. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/data_storage/schema.py +0 -0
  110. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/data_storage/serializer.py +0 -0
  111. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/data_storage/sqlite.py +0 -0
  112. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/dataset.py +0 -0
  113. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/error.py +0 -0
  114. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/job.py +0 -0
  115. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/__init__.py +0 -0
  116. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/arrow.py +0 -0
  117. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/clip.py +0 -0
  118. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/convert/__init__.py +0 -0
  119. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/convert/flatten.py +0 -0
  120. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/convert/python_to_sql.py +0 -0
  121. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/convert/sql_to_python.py +0 -0
  122. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/convert/unflatten.py +0 -0
  123. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  124. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/data_model.py +0 -0
  125. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/dataset_info.py +0 -0
  126. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/file.py +0 -0
  127. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/image.py +0 -0
  128. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/meta_formats.py +0 -0
  129. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/model_store.py +0 -0
  130. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/pytorch.py +0 -0
  131. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/settings.py +0 -0
  132. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/text.py +0 -0
  133. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/udf.py +0 -0
  134. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/udf_signature.py +0 -0
  135. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/utils.py +0 -0
  136. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/vfile.py +0 -0
  137. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/webdataset.py +0 -0
  138. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/lib/webdataset_laion.py +0 -0
  139. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/listing.py +0 -0
  140. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/node.py +0 -0
  141. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/nodes_fetcher.py +0 -0
  142. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/nodes_thread_pool.py +0 -0
  143. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/progress.py +0 -0
  144. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/py.typed +0 -0
  145. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/query/__init__.py +0 -0
  146. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/query/batch.py +0 -0
  147. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/query/builtins.py +0 -0
  148. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/query/dispatch.py +0 -0
  149. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/query/metrics.py +0 -0
  150. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/query/params.py +0 -0
  151. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/query/schema.py +0 -0
  152. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/query/session.py +0 -0
  153. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/query/udf.py +0 -0
  154. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/remote/__init__.py +0 -0
  155. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/remote/studio.py +0 -0
  156. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/__init__.py +0 -0
  157. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/default/__init__.py +0 -0
  158. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/default/base.py +0 -0
  159. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/functions/__init__.py +0 -0
  160. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/functions/array.py +0 -0
  161. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/functions/conditional.py +0 -0
  162. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/functions/path.py +0 -0
  163. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/functions/random.py +0 -0
  164. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/functions/string.py +0 -0
  165. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/selectable.py +0 -0
  166. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/sqlite/__init__.py +0 -0
  167. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/sqlite/base.py +0 -0
  168. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/sqlite/types.py +0 -0
  169. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/sqlite/vector.py +0 -0
  170. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/types.py +0 -0
  171. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/sql/utils.py +0 -0
  172. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/storage.py +0 -0
  173. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/torch/__init__.py +0 -0
  174. {datachain-0.2.15 → datachain-0.2.16}/src/datachain/utils.py +0 -0
  175. {datachain-0.2.15 → datachain-0.2.16}/src/datachain.egg-info/SOURCES.txt +0 -0
  176. {datachain-0.2.15 → datachain-0.2.16}/src/datachain.egg-info/dependency_links.txt +0 -0
  177. {datachain-0.2.15 → datachain-0.2.16}/src/datachain.egg-info/entry_points.txt +0 -0
  178. {datachain-0.2.15 → datachain-0.2.16}/src/datachain.egg-info/requires.txt +0 -0
  179. {datachain-0.2.15 → datachain-0.2.16}/src/datachain.egg-info/top_level.txt +0 -0
  180. {datachain-0.2.15 → datachain-0.2.16}/tests/__init__.py +0 -0
  181. {datachain-0.2.15 → datachain-0.2.16}/tests/benchmarks/__init__.py +0 -0
  182. {datachain-0.2.15 → datachain-0.2.16}/tests/benchmarks/conftest.py +0 -0
  183. {datachain-0.2.15 → datachain-0.2.16}/tests/benchmarks/test_ls.py +0 -0
  184. {datachain-0.2.15 → datachain-0.2.16}/tests/benchmarks/test_version.py +0 -0
  185. {datachain-0.2.15 → datachain-0.2.16}/tests/conftest.py +0 -0
  186. {datachain-0.2.15 → datachain-0.2.16}/tests/data.py +0 -0
  187. {datachain-0.2.15 → datachain-0.2.16}/tests/examples/__init__.py +0 -0
  188. {datachain-0.2.15 → datachain-0.2.16}/tests/examples/test_wds_e2e.py +0 -0
  189. {datachain-0.2.15 → datachain-0.2.16}/tests/examples/wds_data.py +0 -0
  190. {datachain-0.2.15 → datachain-0.2.16}/tests/func/__init__.py +0 -0
  191. {datachain-0.2.15 → datachain-0.2.16}/tests/func/test_client.py +0 -0
  192. {datachain-0.2.15 → datachain-0.2.16}/tests/func/test_datachain.py +0 -0
  193. {datachain-0.2.15 → datachain-0.2.16}/tests/func/test_dataset_query.py +0 -0
  194. {datachain-0.2.15 → datachain-0.2.16}/tests/func/test_datasets.py +0 -0
  195. {datachain-0.2.15 → datachain-0.2.16}/tests/func/test_feature_pickling.py +0 -0
  196. {datachain-0.2.15 → datachain-0.2.16}/tests/func/test_ls.py +0 -0
  197. {datachain-0.2.15 → datachain-0.2.16}/tests/func/test_pull.py +0 -0
  198. {datachain-0.2.15 → datachain-0.2.16}/tests/func/test_pytorch.py +0 -0
  199. {datachain-0.2.15 → datachain-0.2.16}/tests/func/test_query.py +0 -0
  200. {datachain-0.2.15 → datachain-0.2.16}/tests/scripts/feature_class.py +0 -0
  201. {datachain-0.2.15 → datachain-0.2.16}/tests/scripts/feature_class_parallel.py +0 -0
  202. {datachain-0.2.15 → datachain-0.2.16}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  203. {datachain-0.2.15 → datachain-0.2.16}/tests/scripts/name_len_normal.py +0 -0
  204. {datachain-0.2.15 → datachain-0.2.16}/tests/scripts/name_len_slow.py +0 -0
  205. {datachain-0.2.15 → datachain-0.2.16}/tests/test_cli_e2e.py +0 -0
  206. {datachain-0.2.15 → datachain-0.2.16}/tests/test_query_e2e.py +0 -0
  207. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/__init__.py +0 -0
  208. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/__init__.py +0 -0
  209. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/conftest.py +0 -0
  210. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_arrow.py +0 -0
  211. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_clip.py +0 -0
  212. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  213. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_datachain_merge.py +0 -0
  214. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_feature.py +0 -0
  215. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_feature_utils.py +0 -0
  216. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_file.py +0 -0
  217. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_image.py +0 -0
  218. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_signal_schema.py +0 -0
  219. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_text.py +0 -0
  220. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_udf_signature.py +0 -0
  221. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_utils.py +0 -0
  222. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/lib/test_webdataset.py +0 -0
  223. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/sql/__init__.py +0 -0
  224. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/sql/sqlite/__init__.py +0 -0
  225. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/sql/sqlite/test_utils.py +0 -0
  226. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/sql/test_array.py +0 -0
  227. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/sql/test_conditional.py +0 -0
  228. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/sql/test_path.py +0 -0
  229. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/sql/test_random.py +0 -0
  230. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/sql/test_selectable.py +0 -0
  231. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/sql/test_string.py +0 -0
  232. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_asyn.py +0 -0
  233. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_cache.py +0 -0
  234. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_catalog.py +0 -0
  235. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_catalog_loader.py +0 -0
  236. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_cli_parsing.py +0 -0
  237. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_client.py +0 -0
  238. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_client_s3.py +0 -0
  239. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_data_storage.py +0 -0
  240. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_database_engine.py +0 -0
  241. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_dataset.py +0 -0
  242. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_dispatch.py +0 -0
  243. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_fileslice.py +0 -0
  244. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_id_generator.py +0 -0
  245. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_listing.py +0 -0
  246. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_metastore.py +0 -0
  247. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_module_exports.py +0 -0
  248. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_query_metrics.py +0 -0
  249. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_query_params.py +0 -0
  250. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_serializer.py +0 -0
  251. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_session.py +0 -0
  252. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_storage.py +0 -0
  253. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_udf.py +0 -0
  254. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_utils.py +0 -0
  255. {datachain-0.2.15 → datachain-0.2.16}/tests/unit/test_warehouse.py +0 -0
  256. {datachain-0.2.15 → datachain-0.2.16}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.15
3
+ Version: 0.2.16
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -1,3 +1,4 @@
1
+ # pip install defusedxml
1
2
  import json
2
3
 
3
4
  from PIL import (
@@ -63,7 +64,7 @@ if __name__ == "__main__":
63
64
  DataChain.from_storage(source, type="image")
64
65
  .settings(parallel=-1)
65
66
  .filter(C("file.name").glob("*.jpg"))
66
- .limit(10000)
67
+ .limit(5000)
67
68
  .map(
68
69
  image_description,
69
70
  params=["file"],
@@ -48,7 +48,7 @@ def openimage_detect(args):
48
48
  yield fstream, bbox
49
49
 
50
50
 
51
- source = "gs://datachain-demo/openimages-v6-test-jsonpairs"
51
+ source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
52
52
 
53
53
  (
54
54
  DataChain.from_storage(source)
@@ -36,7 +36,7 @@ def main():
36
36
  print("========================================================================")
37
37
  uri = "gs://datachain-demo/jsonl/object.jsonl"
38
38
  jsonl_ds = DataChain.from_json(uri, meta_type="jsonl", print_schema=True)
39
- print(jsonl_ds.to_pandas())
39
+ jsonl_ds.show()
40
40
 
41
41
  print()
42
42
  print("========================================================================")
@@ -49,8 +49,7 @@ def main():
49
49
  json_pairs_ds = DataChain.from_json(
50
50
  uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage"
51
51
  )
52
- print(json_pairs_ds.to_pandas())
53
- # print(list(json_pairs_ds.collect())[0])
52
+ json_pairs_ds.show()
54
53
 
55
54
  uri = "gs://datachain-demo/coco2017/annotations_captions/"
56
55
 
@@ -72,7 +71,7 @@ def main():
72
71
  static_json_ds = DataChain.from_json(
73
72
  uri, jmespath="licenses", spec=LicenseFeature, nrows=3
74
73
  )
75
- print(static_json_ds.to_pandas())
74
+ static_json_ds.show()
76
75
 
77
76
  print()
78
77
  print("========================================================================")
@@ -88,16 +87,7 @@ def main():
88
87
  print("========================================================================")
89
88
  static_csv_ds = DataChain.from_csv(uri, output=ChatDialog, object_name="chat")
90
89
  static_csv_ds.print_schema()
91
- print(static_csv_ds.to_pandas())
92
-
93
- uri = "gs://datachain-demo/laion-aesthetics-csv"
94
- print()
95
- print("========================================================================")
96
- print("dynamic CSV with header schema test parsing 3/3M objects")
97
- print("========================================================================")
98
- dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3)
99
- dynamic_csv_ds.print_schema()
100
- print(dynamic_csv_ds.to_pandas())
90
+ static_csv_ds.show()
101
91
 
102
92
 
103
93
  if __name__ == "__main__":
@@ -64,7 +64,7 @@ if __name__ == "__main__":
64
64
  optimizer = optim.Adam(model.parameters(), lr=0.001)
65
65
 
66
66
  # Train the model
67
- num_epochs = 10
67
+ num_epochs = 3
68
68
  for epoch in range(num_epochs):
69
69
  for i, data in enumerate(train_loader):
70
70
  inputs, labels = data
@@ -1,5 +1,3 @@
1
- import pandas as pd
2
-
3
1
  from datachain import C, DataChain
4
2
  from datachain.lib.webdataset import process_webdataset
5
3
  from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
@@ -9,25 +7,36 @@ wds = (
9
7
  .filter(C("file.name").glob("00000000.tar"))
10
8
  .settings(cache=True)
11
9
  .gen(laion=process_webdataset(spec=WDSLaion), params="file")
10
+ .save() # materialize chain to avoid downloading data multiple times
11
+ )
12
+
13
+ meta_pq = (
14
+ DataChain.from_parquet("gs://datachain-demo/datacomp-small/metadata/0020f*.parquet")
15
+ .filter(
16
+ C("uid").in_(values[0] for values in wds.select("laion.json.uid").collect())
17
+ )
18
+ .map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
19
+ .save()
12
20
  )
13
21
 
14
22
  meta_emd = (
15
- DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata")
16
- .filter(C("file.name").glob("0020f*.npz"))
23
+ DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata/0020f*.npz")
17
24
  .gen(emd=process_laion_meta)
25
+ .filter(
26
+ C("emd.index").in_(
27
+ values[0] for values in meta_pq.select("source.index").collect()
28
+ )
29
+ )
18
30
  .map(stem=lambda file: file.get_file_stem(), params=["emd.file"], output=str)
19
31
  )
20
32
 
21
- meta_pq = DataChain.from_parquet(
22
- "gs://datachain-demo/datacomp-small/metadata/0020f*.parquet"
23
- ).map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
24
33
 
25
34
  meta = meta_emd.merge(
26
- meta_pq, on=["stem", "emd.index"], right_on=["stem", "source.index"]
35
+ meta_pq,
36
+ on=["stem", "emd.index"],
37
+ right_on=["stem", "source.index"],
27
38
  )
28
39
 
29
40
  res = wds.merge(meta, on="laion.json.uid", right_on="uid")
30
41
 
31
- df = res.limit(10).to_pandas()
32
- with pd.option_context("display.max_columns", None):
33
- print(df)
42
+ res.show(3)
@@ -31,6 +31,7 @@ filtered = (
31
31
  / least(C("laion.json.original_width"), C("laion.json.original_height"))
32
32
  < 3.0
33
33
  )
34
+ .save()
34
35
  )
35
36
  filtered.show(3)
36
37
 
@@ -1217,16 +1217,14 @@ class Catalog:
1217
1217
  def get_temp_table_names(self) -> list[str]:
1218
1218
  return self.warehouse.get_temp_table_names()
1219
1219
 
1220
- def cleanup_temp_tables(self, names: Iterable[str]) -> None:
1220
+ def cleanup_tables(self, names: Iterable[str]) -> None:
1221
1221
  """
1222
- Drop tables created temporarily when processing datasets.
1222
+ Drop tables passed.
1223
1223
 
1224
- This should be implemented even if temporary tables are used to
1225
- ensure that they are cleaned up as soon as they are no longer
1226
- needed. When running the same `DatasetQuery` multiple times we
1227
- may use the same temporary table names.
1224
+ This should be implemented to ensure that the provided tables
1225
+ are cleaned up as soon as they are no longer needed.
1228
1226
  """
1229
- self.warehouse.cleanup_temp_tables(names)
1227
+ self.warehouse.cleanup_tables(names)
1230
1228
  self.id_generator.delete_uris(names)
1231
1229
 
1232
1230
  def create_dataset_from_sources(
@@ -910,7 +910,7 @@ def garbage_collect(catalog: "Catalog"):
910
910
  print("Nothing to clean up.")
911
911
  else:
912
912
  print(f"Garbage collecting {len(temp_tables)} tables.")
913
- catalog.cleanup_temp_tables(temp_tables)
913
+ catalog.cleanup_tables(temp_tables)
914
914
 
915
915
 
916
916
  def completion(shell: str) -> str:
@@ -97,7 +97,7 @@ class AbstractMetastore(ABC, Serializable):
97
97
  def close(self) -> None:
98
98
  """Closes any active database or HTTP connections."""
99
99
 
100
- def cleanup_temp_tables(self, temp_table_names: list[str]) -> None:
100
+ def cleanup_tables(self, temp_table_names: list[str]) -> None:
101
101
  """Cleanup temp tables."""
102
102
 
103
103
  def cleanup_for_tests(self) -> None:
@@ -457,7 +457,7 @@ class AbstractDBMetastore(AbstractMetastore):
457
457
  """Closes any active database connections."""
458
458
  self.db.close()
459
459
 
460
- def cleanup_temp_tables(self, temp_table_names: list[str]) -> None:
460
+ def cleanup_tables(self, temp_table_names: list[str]) -> None:
461
461
  """Cleanup temp tables."""
462
462
  self.id_generator.delete_uris(temp_table_names)
463
463
 
@@ -915,14 +915,12 @@ class AbstractWarehouse(ABC, Serializable):
915
915
  if self.is_temp_table_name(t)
916
916
  ]
917
917
 
918
- def cleanup_temp_tables(self, names: Iterable[str]) -> None:
918
+ def cleanup_tables(self, names: Iterable[str]) -> None:
919
919
  """
920
- Drop tables created temporarily when processing datasets.
920
+ Drop tables passed.
921
921
 
922
- This should be implemented even if temporary tables are used to
923
- ensure that they are cleaned up as soon as they are no longer
924
- needed. When running the same `DatasetQuery` multiple times we
925
- may use the same temporary table names.
922
+ This should be implemented to ensure that the provided tables
923
+ are cleaned up as soon as they are no longer needed.
926
924
  """
927
925
  for name in names:
928
926
  self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
@@ -193,8 +193,6 @@ class DataChain(DatasetQuery):
193
193
  ```
194
194
  """
195
195
 
196
- max_row_count: Optional[int] = None
197
-
198
196
  DEFAULT_FILE_RECORD: ClassVar[dict] = {
199
197
  "source": "",
200
198
  "name": "",
@@ -1124,7 +1122,7 @@ class DataChain(DatasetQuery):
1124
1122
  def _func_fr() -> Iterator[tuple_type]: # type: ignore[valid-type]
1125
1123
  yield from tuples
1126
1124
 
1127
- chain = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD, session=session)
1125
+ chain = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD, session=session)
1128
1126
  if object_name:
1129
1127
  output = {object_name: DataChain._dict_to_data_model(object_name, output)} # type: ignore[arg-type]
1130
1128
  return chain.gen(_func_fr, output=output)
@@ -1441,13 +1439,14 @@ class DataChain(DatasetQuery):
1441
1439
  )
1442
1440
 
1443
1441
  @classmethod
1444
- def create_empty(
1442
+ def from_records(
1445
1443
  cls,
1446
1444
  to_insert: Optional[Union[dict, list[dict]]],
1447
1445
  session: Optional[Session] = None,
1448
1446
  ) -> "DataChain":
1449
- """Create empty chain. Returns a chain. This method is used for programmatically
1450
- generating a chains in contrast of reading data from storages or other sources.
1447
+ """Create a DataChain from the provided records. This method can be used for
1448
+ programmatically generating a chain in contrast of reading data from storages
1449
+ or other sources.
1451
1450
 
1452
1451
  Parameters:
1453
1452
  to_insert : records (or a single record) to insert. Each record is
@@ -1455,8 +1454,8 @@ class DataChain(DatasetQuery):
1455
1454
 
1456
1455
  Example:
1457
1456
  ```py
1458
- empty = DataChain.create_empty()
1459
- single_record = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
1457
+ empty = DataChain.from_records()
1458
+ single_record = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD)
1460
1459
  ```
1461
1460
  """
1462
1461
  session = Session.get(session)
@@ -1602,18 +1601,7 @@ class DataChain(DatasetQuery):
1602
1601
  @detach
1603
1602
  def limit(self, n: int) -> "Self":
1604
1603
  """Return the first n rows of the chain."""
1605
- n = max(n, 0)
1606
-
1607
- if self.max_row_count is None:
1608
- self.max_row_count = n
1609
- return super().limit(n)
1610
-
1611
- limit = min(n, self.max_row_count)
1612
- if limit == self.max_row_count:
1613
- return self
1614
-
1615
- self.max_row_count = limit
1616
- return super().limit(self.max_row_count)
1604
+ return super().limit(n)
1617
1605
 
1618
1606
  @detach
1619
1607
  def offset(self, offset: int) -> "Self":
@@ -243,8 +243,11 @@ class SignalSchema:
243
243
  curr_type = None
244
244
  i = 0
245
245
  while curr_tree is not None and i < len(path):
246
- if val := curr_tree.get(path[i], None):
246
+ if val := curr_tree.get(path[i]):
247
247
  curr_type, curr_tree = val
248
+ elif i == 0 and len(path) > 1 and (val := curr_tree.get(".".join(path))):
249
+ curr_type, curr_tree = val
250
+ break
248
251
  else:
249
252
  curr_type = None
250
253
  i += 1
@@ -1201,10 +1201,10 @@ class DatasetQuery:
1201
1201
  # implementations, as errors may close or render unusable the existing
1202
1202
  # connections.
1203
1203
  metastore = self.catalog.metastore.clone(use_new_connection=True)
1204
- metastore.cleanup_temp_tables(self.temp_table_names)
1204
+ metastore.cleanup_tables(self.temp_table_names)
1205
1205
  metastore.close()
1206
1206
  warehouse = self.catalog.warehouse.clone(use_new_connection=True)
1207
- warehouse.cleanup_temp_tables(self.temp_table_names)
1207
+ warehouse.cleanup_tables(self.temp_table_names)
1208
1208
  warehouse.close()
1209
1209
  self.temp_table_names = []
1210
1210
 
@@ -1383,6 +1383,9 @@ class DatasetQuery:
1383
1383
  @detach
1384
1384
  def limit(self, n: int) -> "Self":
1385
1385
  query = self.clone(new_table=False)
1386
+ for step in query.steps:
1387
+ if isinstance(step, SQLLimit) and step.n < n:
1388
+ return query
1386
1389
  query.steps.append(SQLLimit(n))
1387
1390
  return query
1388
1391
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.15
3
+ Version: 0.2.16
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -1120,7 +1120,7 @@ def test_garbage_collect(cloud_test_catalog, from_cli, capsys):
1120
1120
  captured = capsys.readouterr()
1121
1121
  assert captured.out == "Garbage collecting 4 tables.\n"
1122
1122
  else:
1123
- catalog.cleanup_temp_tables(temp_tables)
1123
+ catalog.cleanup_tables(temp_tables)
1124
1124
  assert catalog.get_temp_table_names() == []
1125
1125
 
1126
1126
 
@@ -95,7 +95,7 @@ def test_pandas_incorrect_column_names(catalog):
95
95
 
96
96
 
97
97
  def test_from_features_basic(catalog):
98
- ds = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
98
+ ds = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD)
99
99
  ds = ds.gen(lambda prm: [File(name="")] * 5, params="parent", output={"file": File})
100
100
 
101
101
  ds_name = "my_ds"
@@ -109,7 +109,7 @@ def test_from_features_basic(catalog):
109
109
 
110
110
 
111
111
  def test_from_features(catalog):
112
- ds = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
112
+ ds = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD)
113
113
  ds = ds.gen(
114
114
  lambda prm: list(zip([File(name="")] * len(features), features)),
115
115
  params="parent",
@@ -138,7 +138,7 @@ def test_datasets(catalog):
138
138
 
139
139
 
140
140
  def test_preserve_feature_schema(catalog):
141
- ds = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
141
+ ds = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD)
142
142
  ds = ds.gen(
143
143
  lambda prm: list(zip([File(name="")] * len(features), features, features)),
144
144
  params="parent",
@@ -612,6 +612,41 @@ def test_select_restore_from_saving(catalog):
612
612
  assert n == len(features_nested)
613
613
 
614
614
 
615
+ def test_select_distinct(catalog):
616
+ class Embedding(BaseModel):
617
+ id: int
618
+ filename: str
619
+ values: list[float]
620
+
621
+ expected = [
622
+ [0.1, 0.3],
623
+ [0.1, 0.4],
624
+ [0.1, 0.5],
625
+ [0.1, 0.6],
626
+ ]
627
+
628
+ actual = (
629
+ DataChain.from_values(
630
+ embedding=[
631
+ Embedding(id=1, filename="a.jpg", values=expected[0]),
632
+ Embedding(id=2, filename="b.jpg", values=expected[2]),
633
+ Embedding(id=3, filename="c.jpg", values=expected[1]),
634
+ Embedding(id=4, filename="d.jpg", values=expected[1]),
635
+ Embedding(id=5, filename="e.jpg", values=expected[3]),
636
+ ],
637
+ )
638
+ .select("embedding.values", "embedding.filename")
639
+ .distinct("embedding.values")
640
+ .order_by("embedding.values")
641
+ .collect()
642
+ )
643
+
644
+ actual = [emb[0] for emb in actual]
645
+ assert len(actual) == 4
646
+ for i in [0, 1]:
647
+ assert np.allclose([emb[i] for emb in actual], [emp[i] for emp in expected])
648
+
649
+
615
650
  def test_from_dataset_name_version(catalog):
616
651
  name = "test-version"
617
652
  DataChain.from_values(
@@ -1193,3 +1228,33 @@ def test_custom_model_with_nested_lists():
1193
1228
  traces_double=[[{"x": 0.5, "y": 0.5}], [{"x": 0.5, "y": 0.5}]],
1194
1229
  )
1195
1230
  ]
1231
+
1232
+
1233
+ def test_min_limit():
1234
+ dc = DataChain.from_values(a=[1, 2, 3, 4, 5])
1235
+ assert dc.count() == 5
1236
+ assert dc.limit(4).count() == 4
1237
+ assert dc.count() == 5
1238
+ assert dc.limit(1).count() == 1
1239
+ assert dc.count() == 5
1240
+ assert dc.limit(2).limit(3).count() == 2
1241
+ assert dc.count() == 5
1242
+ assert dc.limit(3).limit(2).count() == 2
1243
+ assert dc.count() == 5
1244
+
1245
+
1246
+ def test_show_limit():
1247
+ dc = DataChain.from_values(a=[1, 2, 3, 4, 5])
1248
+ assert dc.count() == 5
1249
+ assert dc.limit(4).count() == 4
1250
+ dc.show(1)
1251
+ assert dc.count() == 5
1252
+ assert dc.limit(1).count() == 1
1253
+ dc.show(1)
1254
+ assert dc.count() == 5
1255
+ assert dc.limit(2).limit(3).count() == 2
1256
+ dc.show(1)
1257
+ assert dc.count() == 5
1258
+ assert dc.limit(3).limit(2).count() == 2
1259
+ dc.show(1)
1260
+ assert dc.count() == 5
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes