datachain 0.6.2__tar.gz → 0.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (263) hide show
  1. {datachain-0.6.2 → datachain-0.6.3}/.github/workflows/tests-studio.yml +1 -1
  2. {datachain-0.6.2 → datachain-0.6.3}/.pre-commit-config.yaml +1 -1
  3. {datachain-0.6.2/src/datachain.egg-info → datachain-0.6.3}/PKG-INFO +2 -2
  4. {datachain-0.6.2 → datachain-0.6.3}/noxfile.py +1 -2
  5. {datachain-0.6.2 → datachain-0.6.3}/pyproject.toml +4 -5
  6. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/arrow.py +2 -15
  7. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/data_model.py +10 -2
  8. datachain-0.6.3/src/datachain/lib/utils.py +60 -0
  9. {datachain-0.6.2 → datachain-0.6.3/src/datachain.egg-info}/PKG-INFO +2 -2
  10. {datachain-0.6.2 → datachain-0.6.3}/src/datachain.egg-info/requires.txt +1 -1
  11. datachain-0.6.3/tests/benchmarks/conftest.py +8 -0
  12. {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/test_datachain.py +0 -3
  13. datachain-0.6.3/tests/benchmarks/test_ls.py +6 -0
  14. datachain-0.6.3/tests/benchmarks/test_version.py +7 -0
  15. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_arrow.py +11 -3
  16. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_datachain.py +34 -1
  17. datachain-0.6.3/tests/unit/lib/test_utils.py +128 -0
  18. datachain-0.6.2/src/datachain/lib/utils.py +0 -30
  19. datachain-0.6.2/tests/benchmarks/conftest.py +0 -137
  20. datachain-0.6.2/tests/benchmarks/test_ls.py +0 -2
  21. datachain-0.6.2/tests/benchmarks/test_version.py +0 -2
  22. datachain-0.6.2/tests/unit/lib/test_utils.py +0 -58
  23. {datachain-0.6.2 → datachain-0.6.3}/.cruft.json +0 -0
  24. {datachain-0.6.2 → datachain-0.6.3}/.gitattributes +0 -0
  25. {datachain-0.6.2 → datachain-0.6.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  26. {datachain-0.6.2 → datachain-0.6.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
  27. {datachain-0.6.2 → datachain-0.6.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  28. {datachain-0.6.2 → datachain-0.6.3}/.github/codecov.yaml +0 -0
  29. {datachain-0.6.2 → datachain-0.6.3}/.github/dependabot.yml +0 -0
  30. {datachain-0.6.2 → datachain-0.6.3}/.github/workflows/benchmarks.yml +0 -0
  31. {datachain-0.6.2 → datachain-0.6.3}/.github/workflows/release.yml +0 -0
  32. {datachain-0.6.2 → datachain-0.6.3}/.github/workflows/tests.yml +0 -0
  33. {datachain-0.6.2 → datachain-0.6.3}/.github/workflows/update-template.yaml +0 -0
  34. {datachain-0.6.2 → datachain-0.6.3}/.gitignore +0 -0
  35. {datachain-0.6.2 → datachain-0.6.3}/CODE_OF_CONDUCT.rst +0 -0
  36. {datachain-0.6.2 → datachain-0.6.3}/CONTRIBUTING.rst +0 -0
  37. {datachain-0.6.2 → datachain-0.6.3}/LICENSE +0 -0
  38. {datachain-0.6.2 → datachain-0.6.3}/README.rst +0 -0
  39. {datachain-0.6.2 → datachain-0.6.3}/docs/assets/captioned_cartoons.png +0 -0
  40. {datachain-0.6.2 → datachain-0.6.3}/docs/assets/datachain-white.svg +0 -0
  41. {datachain-0.6.2 → datachain-0.6.3}/docs/assets/datachain.svg +0 -0
  42. {datachain-0.6.2 → datachain-0.6.3}/docs/assets/flowchart.png +0 -0
  43. {datachain-0.6.2 → datachain-0.6.3}/docs/index.md +0 -0
  44. {datachain-0.6.2 → datachain-0.6.3}/docs/references/datachain.md +0 -0
  45. {datachain-0.6.2 → datachain-0.6.3}/docs/references/datatype.md +0 -0
  46. {datachain-0.6.2 → datachain-0.6.3}/docs/references/file.md +0 -0
  47. {datachain-0.6.2 → datachain-0.6.3}/docs/references/index.md +0 -0
  48. {datachain-0.6.2 → datachain-0.6.3}/docs/references/sql.md +0 -0
  49. {datachain-0.6.2 → datachain-0.6.3}/docs/references/torch.md +0 -0
  50. {datachain-0.6.2 → datachain-0.6.3}/docs/references/udf.md +0 -0
  51. {datachain-0.6.2 → datachain-0.6.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
  52. {datachain-0.6.2 → datachain-0.6.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
  53. {datachain-0.6.2 → datachain-0.6.3}/examples/computer_vision/openimage-detect.py +0 -0
  54. {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/common_sql_functions.py +0 -0
  55. {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/json-csv-reader.py +0 -0
  56. {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/torch-loader.py +0 -0
  57. {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/udfs/parallel.py +0 -0
  58. {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/udfs/simple.py +0 -0
  59. {datachain-0.6.2 → datachain-0.6.3}/examples/get_started/udfs/stateful.py +0 -0
  60. {datachain-0.6.2 → datachain-0.6.3}/examples/llm_and_nlp/claude-query.py +0 -0
  61. {datachain-0.6.2 → datachain-0.6.3}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
  62. {datachain-0.6.2 → datachain-0.6.3}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
  63. {datachain-0.6.2 → datachain-0.6.3}/examples/multimodal/clip_inference.py +0 -0
  64. {datachain-0.6.2 → datachain-0.6.3}/examples/multimodal/hf_pipeline.py +0 -0
  65. {datachain-0.6.2 → datachain-0.6.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
  66. {datachain-0.6.2 → datachain-0.6.3}/examples/multimodal/wds.py +0 -0
  67. {datachain-0.6.2 → datachain-0.6.3}/examples/multimodal/wds_filtered.py +0 -0
  68. {datachain-0.6.2 → datachain-0.6.3}/mkdocs.yml +0 -0
  69. {datachain-0.6.2 → datachain-0.6.3}/overrides/main.html +0 -0
  70. {datachain-0.6.2 → datachain-0.6.3}/setup.cfg +0 -0
  71. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/__init__.py +0 -0
  72. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/__main__.py +0 -0
  73. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/asyn.py +0 -0
  74. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/cache.py +0 -0
  75. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/catalog/__init__.py +0 -0
  76. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/catalog/catalog.py +0 -0
  77. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/catalog/datasource.py +0 -0
  78. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/catalog/loader.py +0 -0
  79. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/cli.py +0 -0
  80. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/cli_utils.py +0 -0
  81. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/__init__.py +0 -0
  82. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/azure.py +0 -0
  83. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/fileslice.py +0 -0
  84. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/fsspec.py +0 -0
  85. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/gcs.py +0 -0
  86. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/hf.py +0 -0
  87. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/local.py +0 -0
  88. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/client/s3.py +0 -0
  89. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/config.py +0 -0
  90. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/__init__.py +0 -0
  91. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/db_engine.py +0 -0
  92. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/id_generator.py +0 -0
  93. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/job.py +0 -0
  94. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/metastore.py +0 -0
  95. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/schema.py +0 -0
  96. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/serializer.py +0 -0
  97. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/sqlite.py +0 -0
  98. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/data_storage/warehouse.py +0 -0
  99. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/dataset.py +0 -0
  100. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/error.py +0 -0
  101. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/job.py +0 -0
  102. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/__init__.py +0 -0
  103. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/clip.py +0 -0
  104. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/__init__.py +0 -0
  105. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/flatten.py +0 -0
  106. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
  107. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
  108. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/unflatten.py +0 -0
  109. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
  110. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/dataset_info.py +0 -0
  111. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/dc.py +0 -0
  112. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/file.py +0 -0
  113. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/func/__init__.py +0 -0
  114. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/func/aggregate.py +0 -0
  115. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/func/func.py +0 -0
  116. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/hf.py +0 -0
  117. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/image.py +0 -0
  118. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/listing.py +0 -0
  119. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/listing_info.py +0 -0
  120. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/meta_formats.py +0 -0
  121. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/model_store.py +0 -0
  122. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/pytorch.py +0 -0
  123. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/settings.py +0 -0
  124. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/signal_schema.py +0 -0
  125. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/tar.py +0 -0
  126. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/text.py +0 -0
  127. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/udf.py +0 -0
  128. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/udf_signature.py +0 -0
  129. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/vfile.py +0 -0
  130. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/webdataset.py +0 -0
  131. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/lib/webdataset_laion.py +0 -0
  132. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/listing.py +0 -0
  133. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/node.py +0 -0
  134. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/nodes_fetcher.py +0 -0
  135. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/nodes_thread_pool.py +0 -0
  136. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/progress.py +0 -0
  137. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/py.typed +0 -0
  138. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/__init__.py +0 -0
  139. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/batch.py +0 -0
  140. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/dataset.py +0 -0
  141. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/dispatch.py +0 -0
  142. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/metrics.py +0 -0
  143. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/params.py +0 -0
  144. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/queue.py +0 -0
  145. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/schema.py +0 -0
  146. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/query/session.py +0 -0
  147. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/remote/__init__.py +0 -0
  148. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/remote/studio.py +0 -0
  149. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/__init__.py +0 -0
  150. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/default/__init__.py +0 -0
  151. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/default/base.py +0 -0
  152. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/__init__.py +0 -0
  153. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/aggregate.py +0 -0
  154. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/array.py +0 -0
  155. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/conditional.py +0 -0
  156. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/path.py +0 -0
  157. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/random.py +0 -0
  158. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/functions/string.py +0 -0
  159. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/selectable.py +0 -0
  160. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/sqlite/__init__.py +0 -0
  161. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/sqlite/base.py +0 -0
  162. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/sqlite/types.py +0 -0
  163. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/sqlite/vector.py +0 -0
  164. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/types.py +0 -0
  165. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/sql/utils.py +0 -0
  166. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/storage.py +0 -0
  167. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/studio.py +0 -0
  168. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/telemetry.py +0 -0
  169. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/torch/__init__.py +0 -0
  170. {datachain-0.6.2 → datachain-0.6.3}/src/datachain/utils.py +0 -0
  171. {datachain-0.6.2 → datachain-0.6.3}/src/datachain.egg-info/SOURCES.txt +0 -0
  172. {datachain-0.6.2 → datachain-0.6.3}/src/datachain.egg-info/dependency_links.txt +0 -0
  173. {datachain-0.6.2 → datachain-0.6.3}/src/datachain.egg-info/entry_points.txt +0 -0
  174. {datachain-0.6.2 → datachain-0.6.3}/src/datachain.egg-info/top_level.txt +0 -0
  175. {datachain-0.6.2 → datachain-0.6.3}/tests/__init__.py +0 -0
  176. {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/__init__.py +0 -0
  177. {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
  178. {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/datasets/.dvc/config +0 -0
  179. {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/datasets/.gitignore +0 -0
  180. {datachain-0.6.2 → datachain-0.6.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
  181. {datachain-0.6.2 → datachain-0.6.3}/tests/conftest.py +0 -0
  182. {datachain-0.6.2 → datachain-0.6.3}/tests/data.py +0 -0
  183. {datachain-0.6.2 → datachain-0.6.3}/tests/examples/__init__.py +0 -0
  184. {datachain-0.6.2 → datachain-0.6.3}/tests/examples/test_examples.py +0 -0
  185. {datachain-0.6.2 → datachain-0.6.3}/tests/examples/test_wds_e2e.py +0 -0
  186. {datachain-0.6.2 → datachain-0.6.3}/tests/examples/wds_data.py +0 -0
  187. {datachain-0.6.2 → datachain-0.6.3}/tests/func/__init__.py +0 -0
  188. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_catalog.py +0 -0
  189. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_client.py +0 -0
  190. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_datachain.py +0 -0
  191. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_dataset_query.py +0 -0
  192. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_datasets.py +0 -0
  193. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_feature_pickling.py +0 -0
  194. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_listing.py +0 -0
  195. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_ls.py +0 -0
  196. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_meta_formats.py +0 -0
  197. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_metrics.py +0 -0
  198. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_pull.py +0 -0
  199. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_pytorch.py +0 -0
  200. {datachain-0.6.2 → datachain-0.6.3}/tests/func/test_query.py +0 -0
  201. {datachain-0.6.2 → datachain-0.6.3}/tests/scripts/feature_class.py +0 -0
  202. {datachain-0.6.2 → datachain-0.6.3}/tests/scripts/feature_class_exception.py +0 -0
  203. {datachain-0.6.2 → datachain-0.6.3}/tests/scripts/feature_class_parallel.py +0 -0
  204. {datachain-0.6.2 → datachain-0.6.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
  205. {datachain-0.6.2 → datachain-0.6.3}/tests/scripts/name_len_slow.py +0 -0
  206. {datachain-0.6.2 → datachain-0.6.3}/tests/test_atomicity.py +0 -0
  207. {datachain-0.6.2 → datachain-0.6.3}/tests/test_cli_e2e.py +0 -0
  208. {datachain-0.6.2 → datachain-0.6.3}/tests/test_cli_studio.py +0 -0
  209. {datachain-0.6.2 → datachain-0.6.3}/tests/test_query_e2e.py +0 -0
  210. {datachain-0.6.2 → datachain-0.6.3}/tests/test_telemetry.py +0 -0
  211. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/__init__.py +0 -0
  212. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/__init__.py +0 -0
  213. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/conftest.py +0 -0
  214. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_clip.py +0 -0
  215. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
  216. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_datachain_merge.py +0 -0
  217. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_feature.py +0 -0
  218. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_feature_utils.py +0 -0
  219. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_file.py +0 -0
  220. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_hf.py +0 -0
  221. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_image.py +0 -0
  222. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_listing_info.py +0 -0
  223. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_schema.py +0 -0
  224. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_signal_schema.py +0 -0
  225. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_sql_to_python.py +0 -0
  226. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_text.py +0 -0
  227. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_udf_signature.py +0 -0
  228. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/lib/test_webdataset.py +0 -0
  229. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/__init__.py +0 -0
  230. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/sqlite/__init__.py +0 -0
  231. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
  232. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_array.py +0 -0
  233. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_conditional.py +0 -0
  234. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_path.py +0 -0
  235. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_random.py +0 -0
  236. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_selectable.py +0 -0
  237. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/sql/test_string.py +0 -0
  238. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_asyn.py +0 -0
  239. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_cache.py +0 -0
  240. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_catalog.py +0 -0
  241. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_catalog_loader.py +0 -0
  242. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_cli_parsing.py +0 -0
  243. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_client.py +0 -0
  244. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_client_s3.py +0 -0
  245. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_config.py +0 -0
  246. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_data_storage.py +0 -0
  247. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_database_engine.py +0 -0
  248. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_dataset.py +0 -0
  249. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_dispatch.py +0 -0
  250. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_fileslice.py +0 -0
  251. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_id_generator.py +0 -0
  252. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_listing.py +0 -0
  253. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_metastore.py +0 -0
  254. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_module_exports.py +0 -0
  255. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_query.py +0 -0
  256. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_query_metrics.py +0 -0
  257. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_query_params.py +0 -0
  258. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_serializer.py +0 -0
  259. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_session.py +0 -0
  260. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_storage.py +0 -0
  261. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_utils.py +0 -0
  262. {datachain-0.6.2 → datachain-0.6.3}/tests/unit/test_warehouse.py +0 -0
  263. {datachain-0.6.2 → datachain-0.6.3}/tests/utils.py +0 -0
@@ -101,6 +101,6 @@ jobs:
101
101
  pytest
102
102
  --config-file=pyproject.toml -rs
103
103
  --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
104
- -m 'not benchmark'
104
+ --benchmark-skip
105
105
  tests ../datachain/tests
106
106
  working-directory: backend/datachain_server
@@ -24,7 +24,7 @@ repos:
24
24
  - id: trailing-whitespace
25
25
  exclude: '^LICENSES/'
26
26
  - repo: https://github.com/astral-sh/ruff-pre-commit
27
- rev: 'v0.7.0'
27
+ rev: 'v0.7.1'
28
28
  hooks:
29
29
  - id: ruff
30
30
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.2
3
+ Version: 0.6.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -82,7 +82,7 @@ Requires-Dist: requests-mock; extra == "tests"
82
82
  Requires-Dist: scipy; extra == "tests"
83
83
  Provides-Extra: dev
84
84
  Requires-Dist: datachain[docs,tests]; extra == "dev"
85
- Requires-Dist: mypy==1.12.1; extra == "dev"
85
+ Requires-Dist: mypy==1.13.0; extra == "dev"
86
86
  Requires-Dist: types-python-dateutil; extra == "dev"
87
87
  Requires-Dist: types-pytz; extra == "dev"
88
88
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -22,8 +22,7 @@ def bench(session: nox.Session) -> None:
22
22
  session.install(".[tests]")
23
23
  session.run(
24
24
  "pytest",
25
- "-m",
26
- "benchmark",
25
+ "--benchmark-only",
27
26
  "--benchmark-group-by",
28
27
  "func",
29
28
  *session.posargs,
@@ -94,7 +94,7 @@ tests = [
94
94
  ]
95
95
  dev = [
96
96
  "datachain[docs,tests]",
97
- "mypy==1.12.1",
97
+ "mypy==1.13.0",
98
98
  "types-python-dateutil",
99
99
  "types-pytz",
100
100
  "types-PyYAML",
@@ -127,9 +127,8 @@ namespaces = false
127
127
  [tool.setuptools_scm]
128
128
 
129
129
  [tool.pytest.ini_options]
130
- addopts = "-rfEs -m 'not benchmark and not examples'"
130
+ addopts = "-rfEs -m 'not examples' --benchmark-skip"
131
131
  markers = [
132
- "benchmark: benchmarks.",
133
132
  "e2e: End-to-end tests",
134
133
  "examples: All examples",
135
134
  "computer_vision: Computer vision examples",
@@ -214,6 +213,7 @@ ignore = [
214
213
  select = [
215
214
  "B", # flake8-bugbear
216
215
  "C4", # flake8-comprehensions
216
+ "C420", # unnecessary-dict-comprehension-for-iterable
217
217
  "C90", # mccabe
218
218
  "W", # pycodestyle - Warning
219
219
  "E", # pycodestyle - Error
@@ -252,11 +252,10 @@ select = [
252
252
  "NPY", # numpy
253
253
  "TRY004", # type-check-without-type-error
254
254
  "TRY201", # verbose-raise
255
- "TRY302", # useless-try-except
255
+ "TRY203", # useless-try-except
256
256
  "TRY401", # verbose-log-message
257
257
  "RUF022", # unsorted-dunder-all
258
258
  "RUF023", # unsorted-dunder-slots
259
- "RUF025", # unnecessary-dict-comprehension-for-iterable
260
259
  "RUF027", # missing-f-string-syntax
261
260
  "RUF030", # assert-with-print-message
262
261
  "RUF101", # redirected-noqa
@@ -1,4 +1,3 @@
1
- import re
2
1
  from collections.abc import Sequence
3
2
  from tempfile import NamedTemporaryFile
4
3
  from typing import TYPE_CHECKING, Any, Optional
@@ -13,6 +12,7 @@ from datachain.lib.file import ArrowRow, File
13
12
  from datachain.lib.model_store import ModelStore
14
13
  from datachain.lib.signal_schema import SignalSchema
15
14
  from datachain.lib.udf import Generator
15
+ from datachain.lib.utils import normalize_col_names
16
16
 
17
17
  if TYPE_CHECKING:
18
18
  from datasets.features.features import Features
@@ -128,7 +128,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
128
128
  signal_schema = _get_datachain_schema(schema)
129
129
  if signal_schema:
130
130
  return signal_schema.values
131
- columns = _convert_col_names(col_names) # type: ignore[arg-type]
131
+ columns = list(normalize_col_names(col_names).keys()) # type: ignore[arg-type]
132
132
  hf_schema = _get_hf_schema(schema)
133
133
  if hf_schema:
134
134
  return {
@@ -143,19 +143,6 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
143
143
  return output
144
144
 
145
145
 
146
- def _convert_col_names(col_names: Sequence[str]) -> list[str]:
147
- default_column = 0
148
- converted_col_names = []
149
- for column in col_names:
150
- column = column.lower()
151
- column = re.sub("[^0-9a-z_]+", "", column)
152
- if not column:
153
- column = f"c{default_column}"
154
- default_column += 1
155
- converted_col_names.append(column)
156
- return converted_col_names
157
-
158
-
159
146
  def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa: PLR0911
160
147
  """Convert pyarrow types to basic types."""
161
148
  from datetime import datetime
@@ -2,9 +2,10 @@ from collections.abc import Sequence
2
2
  from datetime import datetime
3
3
  from typing import ClassVar, Union, get_args, get_origin
4
4
 
5
- from pydantic import BaseModel, create_model
5
+ from pydantic import BaseModel, Field, create_model
6
6
 
7
7
  from datachain.lib.model_store import ModelStore
8
+ from datachain.lib.utils import normalize_col_names
8
9
 
9
10
  StandardType = Union[
10
11
  type[int],
@@ -60,7 +61,14 @@ def is_chain_type(t: type) -> bool:
60
61
 
61
62
 
62
63
  def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
63
- fields = {name: (anno, ...) for name, anno in data_dict.items()}
64
+ # Gets a map of normalized_name -> original_name
65
+ columns = normalize_col_names(list(data_dict.keys()))
66
+ # We reverse it for convenience to original_name -> normalized_name
67
+ columns = {v: k for k, v in columns.items()}
68
+
69
+ fields = {
70
+ columns[name]: (anno, Field(alias=name)) for name, anno in data_dict.items()
71
+ }
64
72
  return create_model(
65
73
  name,
66
74
  __base__=(DataModel,), # type: ignore[call-overload]
@@ -0,0 +1,60 @@
1
+ import re
2
+ from abc import ABC, abstractmethod
3
+ from collections.abc import Sequence
4
+
5
+
6
+ class AbstractUDF(ABC):
7
+ @abstractmethod
8
+ def process(self, *args, **kwargs):
9
+ pass
10
+
11
+ @abstractmethod
12
+ def setup(self):
13
+ pass
14
+
15
+ @abstractmethod
16
+ def teardown(self):
17
+ pass
18
+
19
+
20
+ class DataChainError(Exception):
21
+ def __init__(self, message):
22
+ super().__init__(message)
23
+
24
+
25
+ class DataChainParamsError(DataChainError):
26
+ def __init__(self, message):
27
+ super().__init__(message)
28
+
29
+
30
+ class DataChainColumnError(DataChainParamsError):
31
+ def __init__(self, col_name, msg):
32
+ super().__init__(f"Error for column {col_name}: {msg}")
33
+
34
+
35
+ def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
36
+ gen_col_counter = 0
37
+ new_col_names = {}
38
+ org_col_names = set(col_names)
39
+
40
+ for org_column in col_names:
41
+ new_column = org_column.lower()
42
+ new_column = re.sub("[^0-9a-z]+", "_", new_column)
43
+ new_column = new_column.strip("_")
44
+
45
+ generated_column = new_column
46
+
47
+ while (
48
+ not generated_column.isidentifier()
49
+ or generated_column in new_col_names
50
+ or (generated_column != org_column and generated_column in org_col_names)
51
+ ):
52
+ if new_column:
53
+ generated_column = f"c{gen_col_counter}_{new_column}"
54
+ else:
55
+ generated_column = f"c{gen_col_counter}"
56
+ gen_col_counter += 1
57
+
58
+ new_col_names[generated_column] = org_column
59
+
60
+ return new_col_names
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.2
3
+ Version: 0.6.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -82,7 +82,7 @@ Requires-Dist: requests-mock; extra == "tests"
82
82
  Requires-Dist: scipy; extra == "tests"
83
83
  Provides-Extra: dev
84
84
  Requires-Dist: datachain[docs,tests]; extra == "dev"
85
- Requires-Dist: mypy==1.12.1; extra == "dev"
85
+ Requires-Dist: mypy==1.13.0; extra == "dev"
86
86
  Requires-Dist: types-python-dateutil; extra == "dev"
87
87
  Requires-Dist: types-pytz; extra == "dev"
88
88
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -30,7 +30,7 @@ dvc-studio-client<1,>=0.21
30
30
 
31
31
  [dev]
32
32
  datachain[docs,tests]
33
- mypy==1.12.1
33
+ mypy==1.13.0
34
34
  types-python-dateutil
35
35
  types-pytz
36
36
  types-PyYAML
@@ -0,0 +1,8 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+
5
+
6
+ @pytest.fixture
7
+ def datasets():
8
+ return Path(__file__).parent / "datasets"
@@ -1,10 +1,7 @@
1
- import pytest
2
-
3
1
  from datachain.lib.dc import DataChain
4
2
  from datachain.lib.webdataset_laion import process_laion_meta
5
3
 
6
4
 
7
- @pytest.mark.benchmark
8
5
  def test_datachain(tmp_dir, test_session, datasets, benchmark):
9
6
  def run_script(uri, **kwargs):
10
7
  DataChain.from_storage(uri, session=test_session, **kwargs).gen(
@@ -0,0 +1,6 @@
1
+ from datachain.cli import ls
2
+
3
+
4
+ def test_ls(benchmark, tmp_dir):
5
+ bucket = "s3://noaa-bathymetry-pds/"
6
+ benchmark.pedantic(ls, args=([bucket],), kwargs={"client_config": {"anon": True}})
@@ -0,0 +1,7 @@
1
+ import shutil
2
+ import subprocess
3
+
4
+
5
+ def test_version(benchmark):
6
+ bin = shutil.which("datachain")
7
+ benchmark(subprocess.check_call, [bin, "--help"])
@@ -168,13 +168,21 @@ def test_parquet_convert_column_names():
168
168
  ("dot.notation.col", pa.int32()),
169
169
  ("with-dashes", pa.int32()),
170
170
  ("with spaces", pa.int32()),
171
+ ("with-multiple--dashes", pa.int32()),
172
+ ("with__underscores", pa.int32()),
173
+ ("__leading__underscores", pa.int32()),
174
+ ("trailing__underscores__", pa.int32()),
171
175
  ]
172
176
  )
173
177
  assert list(schema_to_output(schema)) == [
174
178
  "uppercasecol",
175
- "dotnotationcol",
176
- "withdashes",
177
- "withspaces",
179
+ "dot_notation_col",
180
+ "with_dashes",
181
+ "with_spaces",
182
+ "with_multiple_dashes",
183
+ "with_underscores",
184
+ "leading_underscores",
185
+ "trailing_underscores",
178
186
  ]
179
187
 
180
188
 
@@ -36,6 +36,18 @@ DF_DATA = {
36
36
  "city": ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix"],
37
37
  }
38
38
 
39
+ DF_DATA_NESTED_NOT_NORMALIZED = {
40
+ "nAmE": [
41
+ {"first-SELECT": "Alice", "l--as@t": "Smith"},
42
+ {"l--as@t": "Jones", "first-SELECT": "Bob"},
43
+ {"first-SELECT": "Charlie", "l--as@t": "Brown"},
44
+ {"first-SELECT": "David", "l--as@t": "White"},
45
+ {"first-SELECT": "Eva", "l--as@t": "Black"},
46
+ ],
47
+ "AgE": [25, 30, 35, 40, 45],
48
+ "citY": ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix"],
49
+ }
50
+
39
51
  DF_OTHER_DATA = {
40
52
  "last_name": ["Smith", "Jones"],
41
53
  "country": ["USA", "Russia"],
@@ -272,7 +284,9 @@ def test_listings(test_session, tmp_dir):
272
284
  assert listing.expires
273
285
  assert listing.version == 1
274
286
  assert listing.num_objects == 1
275
- assert listing.size == 2912
287
+ # Exact number is unreliable here since it depends on the PyArrow version
288
+ assert listing.size > 1000
289
+ assert listing.size < 5000
276
290
  assert listing.status == 4
277
291
 
278
292
 
@@ -988,6 +1002,25 @@ def test_parse_tabular_format(tmp_dir, test_session):
988
1002
  assert df1.equals(df)
989
1003
 
990
1004
 
1005
+ def test_parse_nested_json(tmp_dir, test_session):
1006
+ df = pd.DataFrame(DF_DATA_NESTED_NOT_NORMALIZED)
1007
+ path = tmp_dir / "test.jsonl"
1008
+ path.write_text(df.to_json(orient="records", lines=True))
1009
+ dc = DataChain.from_storage(path.as_uri(), session=test_session).parse_tabular(
1010
+ format="json"
1011
+ )
1012
+ # Field names are normalized, values are preserved
1013
+ # E.g. nAmE -> name, l--as@t -> l_as_t, etc
1014
+ df1 = dc.select("name", "age", "city").to_pandas()
1015
+
1016
+ assert df1["name"]["first_select"].to_list() == [
1017
+ d["first-SELECT"] for d in df["nAmE"].to_list()
1018
+ ]
1019
+ assert df1["name"]["l_as_t"].to_list() == [
1020
+ d["l--as@t"] for d in df["nAmE"].to_list()
1021
+ ]
1022
+
1023
+
991
1024
  def test_parse_tabular_partitions(tmp_dir, test_session):
992
1025
  df = pd.DataFrame(DF_DATA)
993
1026
  path = tmp_dir / "test.parquet"
@@ -0,0 +1,128 @@
1
+ from collections.abc import Iterable, Mapping
2
+ from typing import Literal, Optional, Union
3
+
4
+ import pytest
5
+ from pydantic import BaseModel
6
+
7
+ from datachain.lib.convert.python_to_sql import python_to_sql
8
+ from datachain.lib.utils import normalize_col_names
9
+ from datachain.sql.types import JSON, Array, String
10
+
11
+
12
+ class MyModel(BaseModel):
13
+ val1: str
14
+
15
+
16
+ class MyFeature(BaseModel):
17
+ val1: str
18
+
19
+
20
+ @pytest.mark.parametrize(
21
+ "typ,expected",
22
+ (
23
+ (str, String),
24
+ (String, String),
25
+ (Literal["text"], String),
26
+ (dict[str, int], JSON),
27
+ (Mapping[str, int], JSON),
28
+ (Optional[str], String),
29
+ (Union[dict, list[dict]], JSON),
30
+ ),
31
+ )
32
+ def test_convert_type_to_datachain(typ, expected):
33
+ assert python_to_sql(typ) == expected
34
+
35
+
36
+ @pytest.mark.parametrize(
37
+ "typ,expected",
38
+ (
39
+ (list[str], Array(String())),
40
+ (Iterable[str], Array(String())),
41
+ (list[list[str]], Array(Array(String()))),
42
+ ),
43
+ )
44
+ def test_convert_type_to_datachain_array(typ, expected):
45
+ assert python_to_sql(typ).to_dict() == expected.to_dict()
46
+
47
+
48
+ @pytest.mark.parametrize(
49
+ "typ",
50
+ (
51
+ Union[str, int],
52
+ list[Union[str, int]],
53
+ MyFeature,
54
+ MyModel,
55
+ ),
56
+ )
57
+ def test_convert_type_to_datachain_error(typ):
58
+ with pytest.raises(TypeError):
59
+ python_to_sql(typ)
60
+
61
+
62
+ def test_normalize_column_names():
63
+ res = normalize_col_names(
64
+ [
65
+ "UpperCase",
66
+ "_underscore_start",
67
+ "double__underscore",
68
+ "1start_with_number",
69
+ "не_ascii_start",
70
+ " space_start",
71
+ "space_end ",
72
+ "dash-end-",
73
+ "-dash-start",
74
+ "--multiple--dash--",
75
+ "-_ mix_ -dash_ -",
76
+ "__2digit_after_uderscore",
77
+ "",
78
+ "_-_- _---_ _",
79
+ "_-_- _---_ _1",
80
+ ]
81
+ )
82
+ assert list(res.keys()) == [
83
+ "uppercase",
84
+ "underscore_start",
85
+ "double_underscore",
86
+ "c0_1start_with_number",
87
+ "ascii_start",
88
+ "space_start",
89
+ "space_end",
90
+ "dash_end",
91
+ "dash_start",
92
+ "multiple_dash",
93
+ "mix_dash",
94
+ "c1_2digit_after_uderscore",
95
+ "c2",
96
+ "c3",
97
+ "c4_1",
98
+ ]
99
+
100
+
101
+ def test_normalize_column_names_case_repeat():
102
+ res = normalize_col_names(["UpperCase", "UpPerCase"])
103
+
104
+ assert list(res.keys()) == ["uppercase", "c0_uppercase"]
105
+
106
+
107
+ def test_normalize_column_names_exists_after_normalize():
108
+ res = normalize_col_names(["1digit", "c0_1digit"])
109
+
110
+ assert list(res.keys()) == ["c1_1digit", "c0_1digit"]
111
+
112
+
113
+ def test_normalize_column_names_normalized_repeat():
114
+ res = normalize_col_names(["column", "_column"])
115
+
116
+ assert list(res.keys()) == ["column", "c0_column"]
117
+
118
+
119
+ def test_normalize_column_names_normalized_case_repeat():
120
+ res = normalize_col_names(["CoLuMn", "_column"])
121
+
122
+ assert res == {"column": "CoLuMn", "c0_column": "_column"}
123
+
124
+
125
+ def test_normalize_column_names_repeat_generated_after_normalize():
126
+ res = normalize_col_names(["c0_CoLuMn", "_column", "column"])
127
+
128
+ assert res == {"c0_column": "c0_CoLuMn", "c1_column": "_column", "column": "column"}
@@ -1,30 +0,0 @@
1
- from abc import ABC, abstractmethod
2
-
3
-
4
- class AbstractUDF(ABC):
5
- @abstractmethod
6
- def process(self, *args, **kwargs):
7
- pass
8
-
9
- @abstractmethod
10
- def setup(self):
11
- pass
12
-
13
- @abstractmethod
14
- def teardown(self):
15
- pass
16
-
17
-
18
- class DataChainError(Exception):
19
- def __init__(self, message):
20
- super().__init__(message)
21
-
22
-
23
- class DataChainParamsError(DataChainError):
24
- def __init__(self, message):
25
- super().__init__(message)
26
-
27
-
28
- class DataChainColumnError(DataChainParamsError):
29
- def __init__(self, col_name, msg):
30
- super().__init__(f"Error for column {col_name}: {msg}")
@@ -1,137 +0,0 @@
1
- import os
2
- import shutil
3
- from pathlib import Path
4
- from subprocess import check_output
5
-
6
- import pytest
7
- import virtualenv
8
- from dulwich.porcelain import clone
9
- from packaging import version
10
-
11
-
12
- @pytest.fixture
13
- def bucket():
14
- return "s3://noaa-bathymetry-pds/"
15
-
16
-
17
- def pytest_generate_tests(metafunc):
18
- str_revs = metafunc.config.getoption("--datachain-revs")
19
- revs = str_revs.split(",") if str_revs else [None]
20
- if "datachain_rev" in metafunc.fixturenames:
21
- metafunc.parametrize("datachain_rev", revs, scope="session")
22
-
23
-
24
- class VirtualEnv:
25
- def __init__(self, path) -> None:
26
- self.path = path
27
- self.bin = self.path / ("Scripts" if os.name == "nt" else "bin")
28
-
29
- def create(self) -> None:
30
- virtualenv.cli_run([os.fspath(self.path)])
31
-
32
- def run(self, cmd: str, *args: str, env=None) -> None:
33
- exe = self.which(cmd)
34
- check_output([exe, *args], env=env) # noqa: S603
35
-
36
- def which(self, cmd: str) -> str:
37
- assert self.bin.exists()
38
- return shutil.which(cmd, path=self.bin) or cmd
39
-
40
-
41
- @pytest.fixture(scope="session", name="make_datachain_venv")
42
- def fixture_make_datachain_venv(tmp_path_factory):
43
- def _make_datachain_venv(name):
44
- venv_dir = tmp_path_factory.mktemp(f"datachain-venv-{name}")
45
- venv = VirtualEnv(venv_dir)
46
- venv.create()
47
- return venv
48
-
49
- return _make_datachain_venv
50
-
51
-
52
- @pytest.fixture(scope="session", name="datachain_venvs")
53
- def fixture_datachain_venvs():
54
- return {}
55
-
56
-
57
- @pytest.fixture(scope="session", name="datachain_git_repo")
58
- def fixture_datachain_git_repo(tmp_path_factory, test_config):
59
- url = test_config.datachain_git_repo
60
-
61
- if os.path.isdir(url):
62
- return url
63
-
64
- tmp_path = os.fspath(tmp_path_factory.mktemp("datachain-git-repo"))
65
- clone(url, tmp_path)
66
-
67
- return tmp_path
68
-
69
-
70
- @pytest.fixture(scope="session", name="datachain_bin")
71
- def fixture_datachain_bin(
72
- datachain_rev,
73
- datachain_venvs,
74
- make_datachain_venv,
75
- datachain_git_repo,
76
- test_config,
77
- ):
78
- if datachain_rev:
79
- venv = datachain_venvs.get(datachain_rev)
80
- if not venv:
81
- venv = make_datachain_venv(datachain_rev)
82
- venv.run("pip", "install", "-U", "pip")
83
- venv.run(
84
- "pip", "install", f"git+file://{datachain_git_repo}@{datachain_rev}"
85
- )
86
- datachain_venvs[datachain_rev] = venv
87
- datachain_bin = venv.which("datachain")
88
- else:
89
- datachain_bin = test_config.datachain_bin
90
-
91
- def _datachain_bin(*args):
92
- return check_output([datachain_bin, *args], text=True) # noqa: S603
93
-
94
- actual = version.parse(_datachain_bin("--version"))
95
- _datachain_bin.version = (actual.major, actual.minor, actual.micro)
96
-
97
- return _datachain_bin
98
-
99
-
100
- @pytest.fixture(scope="function", name="make_bench")
101
- def fixture_make_bench(request):
102
- def _make_bench(name):
103
- import pytest_benchmark.plugin
104
-
105
- # hack from https://github.com/ionelmc/pytest-benchmark/issues/166
106
- bench = pytest_benchmark.plugin.benchmark.__pytest_wrapped__.obj(request)
107
-
108
- suffix = f"-{name}"
109
-
110
- def add_suffix(_name):
111
- start, sep, end = _name.partition("[")
112
- return start + suffix + sep + end
113
-
114
- bench.name = add_suffix(bench.name)
115
- bench.fullname = add_suffix(bench.fullname)
116
-
117
- return bench
118
-
119
- return _make_bench
120
-
121
-
122
- @pytest.fixture(
123
- scope="function", params=[pytest.param(None, marks=pytest.mark.benchmark)]
124
- )
125
- def bench_datachain(datachain_bin, make_bench):
126
- def _bench_datachain(*args, **kwargs):
127
- name = kwargs.pop("name", None)
128
- name = f"-{name}" if name else ""
129
- bench = make_bench(args[0] + name)
130
- return bench.pedantic(datachain_bin, args=args, **kwargs)
131
-
132
- return _bench_datachain
133
-
134
-
135
- @pytest.fixture
136
- def datasets():
137
- return Path(__file__).parent / "datasets"
@@ -1,2 +0,0 @@
1
- def test_ls(bench_datachain, tmp_dir, bucket):
2
- bench_datachain("ls", bucket, "--anon")
@@ -1,2 +0,0 @@
1
- def test_version(bench_datachain):
2
- bench_datachain("--help", rounds=100)