datachain 0.3.15__tar.gz → 0.3.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (247) hide show
  1. {datachain-0.3.15 → datachain-0.3.17}/.pre-commit-config.yaml +1 -1
  2. {datachain-0.3.15/src/datachain.egg-info → datachain-0.3.17}/PKG-INFO +1 -1
  3. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/catalog/catalog.py +13 -37
  4. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/cli.py +0 -25
  5. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/metastore.py +7 -66
  6. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/sqlite.py +24 -2
  7. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/warehouse.py +19 -25
  8. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/dc.py +1 -2
  9. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/listing.py +1 -0
  10. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/tar.py +2 -1
  11. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/node.py +17 -3
  12. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/__init__.py +0 -2
  13. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/dataset.py +58 -145
  14. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/schema.py +23 -12
  15. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/udf.py +2 -42
  16. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/utils.py +0 -40
  17. {datachain-0.3.15 → datachain-0.3.17/src/datachain.egg-info}/PKG-INFO +1 -1
  18. {datachain-0.3.15 → datachain-0.3.17}/src/datachain.egg-info/SOURCES.txt +0 -2
  19. {datachain-0.3.15 → datachain-0.3.17}/tests/conftest.py +15 -9
  20. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_catalog.py +0 -116
  21. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_datachain.py +628 -12
  22. datachain-0.3.17/tests/func/test_dataset_query.py +1195 -0
  23. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_datasets.py +101 -88
  24. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_feature_pickling.py +0 -8
  25. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_pull.py +23 -11
  26. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_query.py +16 -10
  27. {datachain-0.3.15 → datachain-0.3.17}/tests/scripts/name_len_slow.py +9 -15
  28. {datachain-0.3.15 → datachain-0.3.17}/tests/test_cli_e2e.py +1 -0
  29. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_datachain.py +15 -0
  30. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_datachain_merge.py +98 -1
  31. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_data_storage.py +17 -10
  32. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_utils.py +0 -25
  33. {datachain-0.3.15 → datachain-0.3.17}/tests/utils.py +22 -63
  34. datachain-0.3.15/src/datachain/query/builtins.py +0 -96
  35. datachain-0.3.15/tests/func/test_dataset_query.py +0 -3463
  36. datachain-0.3.15/tests/unit/test_udf.py +0 -98
  37. {datachain-0.3.15 → datachain-0.3.17}/.cruft.json +0 -0
  38. {datachain-0.3.15 → datachain-0.3.17}/.gitattributes +0 -0
  39. {datachain-0.3.15 → datachain-0.3.17}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  40. {datachain-0.3.15 → datachain-0.3.17}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  41. {datachain-0.3.15 → datachain-0.3.17}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  42. {datachain-0.3.15 → datachain-0.3.17}/.github/codecov.yaml +0 -0
  43. {datachain-0.3.15 → datachain-0.3.17}/.github/dependabot.yml +0 -0
  44. {datachain-0.3.15 → datachain-0.3.17}/.github/workflows/benchmarks.yml +0 -0
  45. {datachain-0.3.15 → datachain-0.3.17}/.github/workflows/release.yml +0 -0
  46. {datachain-0.3.15 → datachain-0.3.17}/.github/workflows/tests-studio.yml +0 -0
  47. {datachain-0.3.15 → datachain-0.3.17}/.github/workflows/tests.yml +0 -0
  48. {datachain-0.3.15 → datachain-0.3.17}/.github/workflows/update-template.yaml +0 -0
  49. {datachain-0.3.15 → datachain-0.3.17}/.gitignore +0 -0
  50. {datachain-0.3.15 → datachain-0.3.17}/CODE_OF_CONDUCT.rst +0 -0
  51. {datachain-0.3.15 → datachain-0.3.17}/CONTRIBUTING.rst +0 -0
  52. {datachain-0.3.15 → datachain-0.3.17}/LICENSE +0 -0
  53. {datachain-0.3.15 → datachain-0.3.17}/README.rst +0 -0
  54. {datachain-0.3.15 → datachain-0.3.17}/docs/assets/captioned_cartoons.png +0 -0
  55. {datachain-0.3.15 → datachain-0.3.17}/docs/assets/datachain-white.svg +0 -0
  56. {datachain-0.3.15 → datachain-0.3.17}/docs/assets/datachain.svg +0 -0
  57. {datachain-0.3.15 → datachain-0.3.17}/docs/assets/flowchart.png +0 -0
  58. {datachain-0.3.15 → datachain-0.3.17}/docs/index.md +0 -0
  59. {datachain-0.3.15 → datachain-0.3.17}/docs/references/datachain.md +0 -0
  60. {datachain-0.3.15 → datachain-0.3.17}/docs/references/datatype.md +0 -0
  61. {datachain-0.3.15 → datachain-0.3.17}/docs/references/file.md +0 -0
  62. {datachain-0.3.15 → datachain-0.3.17}/docs/references/index.md +0 -0
  63. {datachain-0.3.15 → datachain-0.3.17}/docs/references/sql.md +0 -0
  64. {datachain-0.3.15 → datachain-0.3.17}/docs/references/torch.md +0 -0
  65. {datachain-0.3.15 → datachain-0.3.17}/docs/references/udf.md +0 -0
  66. {datachain-0.3.15 → datachain-0.3.17}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  67. {datachain-0.3.15 → datachain-0.3.17}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  68. {datachain-0.3.15 → datachain-0.3.17}/examples/computer_vision/openimage-detect.py +0 -0
  69. {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/common_sql_functions.py +0 -0
  70. {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/json-csv-reader.py +0 -0
  71. {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/torch-loader.py +0 -0
  72. {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/udfs/parallel.py +0 -0
  73. {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/udfs/simple.py +0 -0
  74. {datachain-0.3.15 → datachain-0.3.17}/examples/get_started/udfs/stateful.py +0 -0
  75. {datachain-0.3.15 → datachain-0.3.17}/examples/llm_and_nlp/claude-query.py +0 -0
  76. {datachain-0.3.15 → datachain-0.3.17}/examples/llm_and_nlp/unstructured-text.py +0 -0
  77. {datachain-0.3.15 → datachain-0.3.17}/examples/multimodal/clip_inference.py +0 -0
  78. {datachain-0.3.15 → datachain-0.3.17}/examples/multimodal/hf_pipeline.py +0 -0
  79. {datachain-0.3.15 → datachain-0.3.17}/examples/multimodal/openai_image_desc_lib.py +0 -0
  80. {datachain-0.3.15 → datachain-0.3.17}/examples/multimodal/wds.py +0 -0
  81. {datachain-0.3.15 → datachain-0.3.17}/examples/multimodal/wds_filtered.py +0 -0
  82. {datachain-0.3.15 → datachain-0.3.17}/mkdocs.yml +0 -0
  83. {datachain-0.3.15 → datachain-0.3.17}/noxfile.py +0 -0
  84. {datachain-0.3.15 → datachain-0.3.17}/pyproject.toml +0 -0
  85. {datachain-0.3.15 → datachain-0.3.17}/setup.cfg +0 -0
  86. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/__init__.py +0 -0
  87. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/__main__.py +0 -0
  88. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/asyn.py +0 -0
  89. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/cache.py +0 -0
  90. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/catalog/__init__.py +0 -0
  91. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/catalog/datasource.py +0 -0
  92. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/catalog/loader.py +0 -0
  93. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/cli_utils.py +0 -0
  94. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/__init__.py +0 -0
  95. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/azure.py +0 -0
  96. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/fileslice.py +0 -0
  97. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/fsspec.py +0 -0
  98. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/gcs.py +0 -0
  99. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/hf.py +0 -0
  100. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/local.py +0 -0
  101. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/client/s3.py +0 -0
  102. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/config.py +0 -0
  103. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/__init__.py +0 -0
  104. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/db_engine.py +0 -0
  105. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/id_generator.py +0 -0
  106. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/job.py +0 -0
  107. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/schema.py +0 -0
  108. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/data_storage/serializer.py +0 -0
  109. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/dataset.py +0 -0
  110. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/error.py +0 -0
  111. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/job.py +0 -0
  112. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/__init__.py +0 -0
  113. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/arrow.py +0 -0
  114. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/clip.py +0 -0
  115. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/__init__.py +0 -0
  116. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/flatten.py +0 -0
  117. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/python_to_sql.py +0 -0
  118. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/sql_to_python.py +0 -0
  119. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/unflatten.py +0 -0
  120. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  121. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/data_model.py +0 -0
  122. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/dataset_info.py +0 -0
  123. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/file.py +0 -0
  124. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/hf.py +0 -0
  125. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/image.py +0 -0
  126. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/listing_info.py +0 -0
  127. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/meta_formats.py +0 -0
  128. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/model_store.py +0 -0
  129. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/pytorch.py +0 -0
  130. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/settings.py +0 -0
  131. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/signal_schema.py +0 -0
  132. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/text.py +0 -0
  133. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/udf.py +0 -0
  134. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/udf_signature.py +0 -0
  135. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/utils.py +0 -0
  136. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/vfile.py +0 -0
  137. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/webdataset.py +0 -0
  138. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/lib/webdataset_laion.py +0 -0
  139. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/listing.py +0 -0
  140. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/nodes_fetcher.py +0 -0
  141. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/nodes_thread_pool.py +0 -0
  142. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/progress.py +0 -0
  143. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/py.typed +0 -0
  144. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/batch.py +0 -0
  145. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/dispatch.py +0 -0
  146. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/metrics.py +0 -0
  147. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/params.py +0 -0
  148. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/queue.py +0 -0
  149. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/query/session.py +0 -0
  150. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/remote/__init__.py +0 -0
  151. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/remote/studio.py +0 -0
  152. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/__init__.py +0 -0
  153. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/default/__init__.py +0 -0
  154. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/default/base.py +0 -0
  155. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/__init__.py +0 -0
  156. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/array.py +0 -0
  157. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/conditional.py +0 -0
  158. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/path.py +0 -0
  159. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/random.py +0 -0
  160. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/functions/string.py +0 -0
  161. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/selectable.py +0 -0
  162. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/sqlite/__init__.py +0 -0
  163. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/sqlite/base.py +0 -0
  164. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/sqlite/types.py +0 -0
  165. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/sqlite/vector.py +0 -0
  166. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/types.py +0 -0
  167. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/sql/utils.py +0 -0
  168. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/storage.py +0 -0
  169. {datachain-0.3.15 → datachain-0.3.17}/src/datachain/torch/__init__.py +0 -0
  170. {datachain-0.3.15 → datachain-0.3.17}/src/datachain.egg-info/dependency_links.txt +0 -0
  171. {datachain-0.3.15 → datachain-0.3.17}/src/datachain.egg-info/entry_points.txt +0 -0
  172. {datachain-0.3.15 → datachain-0.3.17}/src/datachain.egg-info/requires.txt +0 -0
  173. {datachain-0.3.15 → datachain-0.3.17}/src/datachain.egg-info/top_level.txt +0 -0
  174. {datachain-0.3.15 → datachain-0.3.17}/tests/__init__.py +0 -0
  175. {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/__init__.py +0 -0
  176. {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/conftest.py +0 -0
  177. {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  178. {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/datasets/.dvc/config +0 -0
  179. {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/datasets/.gitignore +0 -0
  180. {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  181. {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/test_datachain.py +0 -0
  182. {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/test_ls.py +0 -0
  183. {datachain-0.3.15 → datachain-0.3.17}/tests/benchmarks/test_version.py +0 -0
  184. {datachain-0.3.15 → datachain-0.3.17}/tests/data.py +0 -0
  185. {datachain-0.3.15 → datachain-0.3.17}/tests/examples/__init__.py +0 -0
  186. {datachain-0.3.15 → datachain-0.3.17}/tests/examples/test_examples.py +0 -0
  187. {datachain-0.3.15 → datachain-0.3.17}/tests/examples/test_wds_e2e.py +0 -0
  188. {datachain-0.3.15 → datachain-0.3.17}/tests/examples/wds_data.py +0 -0
  189. {datachain-0.3.15 → datachain-0.3.17}/tests/func/__init__.py +0 -0
  190. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_client.py +0 -0
  191. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_listing.py +0 -0
  192. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_ls.py +0 -0
  193. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_meta_formats.py +0 -0
  194. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_metrics.py +0 -0
  195. {datachain-0.3.15 → datachain-0.3.17}/tests/func/test_pytorch.py +0 -0
  196. {datachain-0.3.15 → datachain-0.3.17}/tests/scripts/feature_class.py +0 -0
  197. {datachain-0.3.15 → datachain-0.3.17}/tests/scripts/feature_class_parallel.py +0 -0
  198. {datachain-0.3.15 → datachain-0.3.17}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  199. {datachain-0.3.15 → datachain-0.3.17}/tests/test_query_e2e.py +0 -0
  200. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/__init__.py +0 -0
  201. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/__init__.py +0 -0
  202. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/conftest.py +0 -0
  203. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_arrow.py +0 -0
  204. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_clip.py +0 -0
  205. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  206. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_feature.py +0 -0
  207. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_feature_utils.py +0 -0
  208. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_file.py +0 -0
  209. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_hf.py +0 -0
  210. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_image.py +0 -0
  211. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_schema.py +0 -0
  212. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_signal_schema.py +0 -0
  213. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_sql_to_python.py +0 -0
  214. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_text.py +0 -0
  215. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_udf_signature.py +0 -0
  216. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_utils.py +0 -0
  217. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/lib/test_webdataset.py +0 -0
  218. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/__init__.py +0 -0
  219. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/sqlite/__init__.py +0 -0
  220. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/sqlite/test_utils.py +0 -0
  221. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_array.py +0 -0
  222. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_conditional.py +0 -0
  223. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_path.py +0 -0
  224. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_random.py +0 -0
  225. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_selectable.py +0 -0
  226. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/sql/test_string.py +0 -0
  227. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_asyn.py +0 -0
  228. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_cache.py +0 -0
  229. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_catalog.py +0 -0
  230. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_catalog_loader.py +0 -0
  231. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_cli_parsing.py +0 -0
  232. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_client.py +0 -0
  233. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_client_s3.py +0 -0
  234. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_database_engine.py +0 -0
  235. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_dataset.py +0 -0
  236. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_dispatch.py +0 -0
  237. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_fileslice.py +0 -0
  238. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_id_generator.py +0 -0
  239. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_listing.py +0 -0
  240. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_metastore.py +0 -0
  241. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_module_exports.py +0 -0
  242. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_query_metrics.py +0 -0
  243. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_query_params.py +0 -0
  244. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_serializer.py +0 -0
  245. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_session.py +0 -0
  246. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_storage.py +0 -0
  247. {datachain-0.3.15 → datachain-0.3.17}/tests/unit/test_warehouse.py +0 -0
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.6.4'
27
+ rev: 'v0.6.5'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.15
3
+ Version: 0.3.17
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -68,8 +68,6 @@ from datachain.utils import (
68
68
  DataChainDir,
69
69
  batched,
70
70
  datachain_paths_join,
71
- import_object,
72
- parse_params_string,
73
71
  )
74
72
 
75
73
  from .datasource import DataSource
@@ -843,7 +841,7 @@ class Catalog:
843
841
  from datachain.query import DatasetQuery
844
842
 
845
843
  def _row_to_node(d: dict[str, Any]) -> Node:
846
- del d["source"]
844
+ del d["file__source"]
847
845
  return Node.from_dict(d)
848
846
 
849
847
  enlisted_sources: list[tuple[bool, bool, Any]] = []
@@ -1148,30 +1146,28 @@ class Catalog:
1148
1146
  if not sources:
1149
1147
  raise ValueError("Sources needs to be non empty list")
1150
1148
 
1151
- from datachain.query import DatasetQuery
1149
+ from datachain.lib.dc import DataChain
1150
+ from datachain.query.session import Session
1151
+
1152
+ session = Session.get(catalog=self, client_config=client_config)
1152
1153
 
1153
- dataset_queries = []
1154
+ chains = []
1154
1155
  for source in sources:
1155
1156
  if source.startswith(DATASET_PREFIX):
1156
- dq = DatasetQuery(
1157
- name=source[len(DATASET_PREFIX) :],
1158
- catalog=self,
1159
- client_config=client_config,
1157
+ dc = DataChain.from_dataset(
1158
+ source[len(DATASET_PREFIX) :], session=session
1160
1159
  )
1161
1160
  else:
1162
- dq = DatasetQuery(
1163
- path=source,
1164
- catalog=self,
1165
- client_config=client_config,
1166
- recursive=recursive,
1161
+ dc = DataChain.from_storage(
1162
+ source, session=session, recursive=recursive
1167
1163
  )
1168
1164
 
1169
- dataset_queries.append(dq)
1165
+ chains.append(dc)
1170
1166
 
1171
1167
  # create union of all dataset queries created from sources
1172
- dq = reduce(lambda ds1, ds2: ds1.union(ds2), dataset_queries)
1168
+ dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
1173
1169
  try:
1174
- dq.save(name)
1170
+ dc.save(name)
1175
1171
  except Exception as e: # noqa: BLE001
1176
1172
  try:
1177
1173
  ds = self.get_dataset(name)
@@ -1731,26 +1727,6 @@ class Catalog:
1731
1727
  output, sources, client_config=client_config, recursive=recursive
1732
1728
  )
1733
1729
 
1734
- def apply_udf(
1735
- self,
1736
- udf_location: str,
1737
- source: str,
1738
- target_name: str,
1739
- parallel: Optional[int] = None,
1740
- params: Optional[str] = None,
1741
- ):
1742
- from datachain.query import DatasetQuery
1743
-
1744
- if source.startswith(DATASET_PREFIX):
1745
- ds = DatasetQuery(name=source[len(DATASET_PREFIX) :], catalog=self)
1746
- else:
1747
- ds = DatasetQuery(path=source, catalog=self)
1748
- udf = import_object(udf_location)
1749
- if params:
1750
- args, kwargs = parse_params_string(params)
1751
- udf = udf(*args, **kwargs)
1752
- ds.add_signals(udf, parallel=parallel).save(target_name)
1753
-
1754
1730
  def query(
1755
1731
  self,
1756
1732
  query_script: str,
@@ -494,27 +494,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
494
494
  help="Query parameters",
495
495
  )
496
496
 
497
- apply_udf_parser = subp.add_parser(
498
- "apply-udf", parents=[parent_parser], description="Apply UDF"
499
- )
500
- apply_udf_parser.add_argument("udf", type=str, help="UDF location")
501
- apply_udf_parser.add_argument("source", type=str, help="Source storage or dataset")
502
- apply_udf_parser.add_argument("target", type=str, help="Target dataset name")
503
- apply_udf_parser.add_argument(
504
- "--parallel",
505
- nargs="?",
506
- type=int,
507
- const=-1,
508
- default=None,
509
- metavar="N",
510
- help=(
511
- "Use multiprocessing to run the UDF with N worker processes. "
512
- "N defaults to the CPU count."
513
- ),
514
- )
515
- apply_udf_parser.add_argument(
516
- "--udf-params", type=str, default=None, help="UDF class parameters"
517
- )
518
497
  subp.add_parser(
519
498
  "clear-cache", parents=[parent_parser], description="Clear the local file cache"
520
499
  )
@@ -1016,10 +995,6 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
1016
995
  parallel=args.parallel,
1017
996
  params=args.param,
1018
997
  )
1019
- elif args.command == "apply-udf":
1020
- catalog.apply_udf(
1021
- args.udf, args.source, args.target, args.parallel, args.udf_params
1022
- )
1023
998
  elif args.command == "clear-cache":
1024
999
  clear_cache(catalog)
1025
1000
  elif args.command == "gc":
@@ -297,39 +297,6 @@ class AbstractMetastore(ABC, Serializable):
297
297
  #
298
298
  # Dataset dependencies
299
299
  #
300
-
301
- def add_dependency(
302
- self,
303
- dependency: DatasetDependency,
304
- source_dataset_name: str,
305
- source_dataset_version: int,
306
- ) -> None:
307
- """Add dependency to dataset or storage."""
308
- if dependency.is_dataset:
309
- self.add_dataset_dependency(
310
- source_dataset_name,
311
- source_dataset_version,
312
- dependency.dataset_name,
313
- int(dependency.version),
314
- )
315
- else:
316
- self.add_storage_dependency(
317
- source_dataset_name,
318
- source_dataset_version,
319
- StorageURI(dependency.name),
320
- dependency.version,
321
- )
322
-
323
- @abstractmethod
324
- def add_storage_dependency(
325
- self,
326
- source_dataset_name: str,
327
- source_dataset_version: int,
328
- storage_uri: StorageURI,
329
- storage_timestamp_str: Optional[str] = None,
330
- ) -> None:
331
- """Adds storage dependency to dataset."""
332
-
333
300
  @abstractmethod
334
301
  def add_dataset_dependency(
335
302
  self,
@@ -1268,32 +1235,6 @@ class AbstractDBMetastore(AbstractMetastore):
1268
1235
  #
1269
1236
  # Dataset dependencies
1270
1237
  #
1271
-
1272
- def _insert_dataset_dependency(self, data: dict[str, Any]) -> None:
1273
- """Method for inserting dependencies."""
1274
- self.db.execute(self._datasets_dependencies_insert().values(**data))
1275
-
1276
- def add_storage_dependency(
1277
- self,
1278
- source_dataset_name: str,
1279
- source_dataset_version: int,
1280
- storage_uri: StorageURI,
1281
- storage_timestamp_str: Optional[str] = None,
1282
- ) -> None:
1283
- source_dataset = self.get_dataset(source_dataset_name)
1284
- storage = self.get_storage(storage_uri)
1285
-
1286
- self._insert_dataset_dependency(
1287
- {
1288
- "source_dataset_id": source_dataset.id,
1289
- "source_dataset_version_id": (
1290
- source_dataset.get_version(source_dataset_version).id
1291
- ),
1292
- "bucket_id": storage.id,
1293
- "bucket_version": storage_timestamp_str,
1294
- }
1295
- )
1296
-
1297
1238
  def add_dataset_dependency(
1298
1239
  self,
1299
1240
  source_dataset_name: str,
@@ -1305,15 +1246,15 @@ class AbstractDBMetastore(AbstractMetastore):
1305
1246
  source_dataset = self.get_dataset(source_dataset_name)
1306
1247
  dataset = self.get_dataset(dataset_name)
1307
1248
 
1308
- self._insert_dataset_dependency(
1309
- {
1310
- "source_dataset_id": source_dataset.id,
1311
- "source_dataset_version_id": (
1249
+ self.db.execute(
1250
+ self._datasets_dependencies_insert().values(
1251
+ source_dataset_id=source_dataset.id,
1252
+ source_dataset_version_id=(
1312
1253
  source_dataset.get_version(source_dataset_version).id
1313
1254
  ),
1314
- "dataset_id": dataset.id,
1315
- "dataset_version_id": dataset.get_version(dataset_version).id,
1316
- }
1255
+ dataset_id=dataset.id,
1256
+ dataset_version_id=dataset.get_version(dataset_version).id,
1257
+ )
1317
1258
  )
1318
1259
 
1319
1260
  def update_dataset_dependency_source(
@@ -40,7 +40,9 @@ if TYPE_CHECKING:
40
40
  from sqlalchemy.dialects.sqlite import Insert
41
41
  from sqlalchemy.engine.base import Engine
42
42
  from sqlalchemy.schema import SchemaItem
43
+ from sqlalchemy.sql._typing import _FromClauseArgument, _OnClauseArgument
43
44
  from sqlalchemy.sql.elements import ColumnElement
45
+ from sqlalchemy.sql.selectable import Join
44
46
  from sqlalchemy.types import TypeEngine
45
47
 
46
48
  from datachain.lib.file import File
@@ -649,11 +651,14 @@ class SQLiteWarehouse(AbstractWarehouse):
649
651
  self, dataset: DatasetRecord, version: int
650
652
  ) -> list[StorageURI]:
651
653
  dr = self.dataset_rows(dataset, version)
652
- query = dr.select(dr.c.source).distinct()
654
+ query = dr.select(dr.c.file__source).distinct()
653
655
  cur = self.db.cursor()
654
656
  cur.row_factory = sqlite3.Row # type: ignore[assignment]
655
657
 
656
- return [StorageURI(row["source"]) for row in self.db.execute(query, cursor=cur)]
658
+ return [
659
+ StorageURI(row["file__source"])
660
+ for row in self.db.execute(query, cursor=cur)
661
+ ]
657
662
 
658
663
  def merge_dataset_rows(
659
664
  self,
@@ -788,6 +793,23 @@ class SQLiteWarehouse(AbstractWarehouse):
788
793
  if progress_cb:
789
794
  progress_cb(len(batch_ids))
790
795
 
796
+ def join(
797
+ self,
798
+ left: "_FromClauseArgument",
799
+ right: "_FromClauseArgument",
800
+ onclause: "_OnClauseArgument",
801
+ inner: bool = True,
802
+ ) -> "Join":
803
+ """
804
+ Join two tables together.
805
+ """
806
+ return sqlalchemy.join(
807
+ left,
808
+ right,
809
+ onclause,
810
+ isouter=not inner,
811
+ )
812
+
791
813
  def create_pre_udf_table(self, query: "Select") -> "Table":
792
814
  """
793
815
  Create a temporary table from a query for use in a UDF.
@@ -27,8 +27,12 @@ from datachain.storage import StorageURI
27
27
  from datachain.utils import sql_escape_like
28
28
 
29
29
  if TYPE_CHECKING:
30
- from sqlalchemy.sql._typing import _ColumnsClauseArgument
31
- from sqlalchemy.sql.selectable import Select
30
+ from sqlalchemy.sql._typing import (
31
+ _ColumnsClauseArgument,
32
+ _FromClauseArgument,
33
+ _OnClauseArgument,
34
+ )
35
+ from sqlalchemy.sql.selectable import Join, Select
32
36
  from sqlalchemy.types import TypeEngine
33
37
 
34
38
  from datachain.data_storage import AbstractIDGenerator, schema
@@ -894,6 +898,18 @@ class AbstractWarehouse(ABC, Serializable):
894
898
  Copy the results of a query into a table.
895
899
  """
896
900
 
901
+ @abstractmethod
902
+ def join(
903
+ self,
904
+ left: "_FromClauseArgument",
905
+ right: "_FromClauseArgument",
906
+ onclause: "_OnClauseArgument",
907
+ inner: bool = True,
908
+ ) -> "Join":
909
+ """
910
+ Join two tables together.
911
+ """
912
+
897
913
  @abstractmethod
898
914
  def create_pre_udf_table(self, query: "Select") -> "Table":
899
915
  """
@@ -922,32 +938,10 @@ class AbstractWarehouse(ABC, Serializable):
922
938
  are cleaned up as soon as they are no longer needed.
923
939
  """
924
940
  with tqdm(desc="Cleanup", unit=" tables") as pbar:
925
- for name in names:
941
+ for name in set(names):
926
942
  self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
927
943
  pbar.update(1)
928
944
 
929
- def changed_query(
930
- self,
931
- source_query: sa.sql.selectable.Select,
932
- target_query: sa.sql.selectable.Select,
933
- ) -> sa.sql.selectable.Select:
934
- sq = source_query.alias("source_query")
935
- tq = target_query.alias("target_query")
936
-
937
- source_target_join = sa.join(
938
- sq, tq, (sq.c.source == tq.c.source) & (sq.c.path == tq.c.path)
939
- )
940
-
941
- return (
942
- select(*sq.c)
943
- .select_from(source_target_join)
944
- .where(
945
- (sq.c.last_modified > tq.c.last_modified)
946
- & (sq.c.is_latest == true())
947
- & (tq.c.is_latest == true())
948
- )
949
- )
950
-
951
945
 
952
946
  def _random_string(length: int) -> str:
953
947
  return "".join(
@@ -1337,8 +1337,7 @@ class DataChain(DatasetQuery):
1337
1337
  other.signals_schema.resolve(*right_on).db_signals(),
1338
1338
  ) # type: ignore[arg-type]
1339
1339
  )
1340
-
1341
- return super()._subtract(other, signals) # type: ignore[arg-type]
1340
+ return super().subtract(other, signals) # type: ignore[arg-type]
1342
1341
 
1343
1342
  @classmethod
1344
1343
  def from_values(
@@ -77,6 +77,7 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
77
77
  """
78
78
  Parsing uri and returns listing dataset name, listing uri and listing path
79
79
  """
80
+ client_config = client_config or {}
80
81
  client = Client.get_client(uri, cache, **client_config)
81
82
  storage_uri, path = Client.parse_url(uri)
82
83
 
@@ -30,4 +30,5 @@ def process_tar(file: File) -> Iterator[File]:
30
30
  with file.open() as fd:
31
31
  with tarfile.open(fileobj=fd) as tar:
32
32
  for entry in tar.getmembers():
33
- yield build_tar_member(file, entry)
33
+ if entry.isfile():
34
+ yield build_tar_member(file, entry)
@@ -114,9 +114,23 @@ class Node:
114
114
  )
115
115
 
116
116
  @classmethod
117
- def from_dict(cls, d: dict[str, Any]) -> "Self":
118
- kw = {f.name: d[f.name] for f in attrs.fields(cls) if f.name in d}
119
- return cls(**kw)
117
+ def from_dict(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
118
+ def _dval(field_name: str):
119
+ return d.get(f"{file_prefix}__{field_name}")
120
+
121
+ return cls(
122
+ sys__id=d["sys__id"],
123
+ sys__rand=d["sys__rand"],
124
+ source=_dval("source"),
125
+ path=_dval("path"),
126
+ etag=_dval("etag"),
127
+ is_latest=_dval("is_latest"),
128
+ size=_dval("size"),
129
+ last_modified=_dval("last_modified"),
130
+ version=_dval("version"),
131
+ location=_dval("location"),
132
+ dir_type=DirType.FILE,
133
+ )
120
134
 
121
135
  @classmethod
122
136
  def from_dir(cls, path, **kwargs) -> "Node":
@@ -2,7 +2,6 @@ from .dataset import DatasetQuery
2
2
  from .params import param
3
3
  from .schema import C, DatasetRow, LocalFilename, Object, Stream
4
4
  from .session import Session
5
- from .udf import udf
6
5
 
7
6
  __all__ = [
8
7
  "C",
@@ -13,5 +12,4 @@ __all__ = [
13
12
  "Session",
14
13
  "Stream",
15
14
  "param",
16
- "udf",
17
15
  ]