clarifai 11.3.0rc2__py3-none-any.whl → 11.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (300)
  1. clarifai/__init__.py +1 -1
  2. clarifai/cli/__main__.py +1 -1
  3. clarifai/cli/base.py +144 -136
  4. clarifai/cli/compute_cluster.py +45 -31
  5. clarifai/cli/deployment.py +93 -76
  6. clarifai/cli/model.py +578 -180
  7. clarifai/cli/nodepool.py +100 -82
  8. clarifai/client/__init__.py +12 -2
  9. clarifai/client/app.py +973 -911
  10. clarifai/client/auth/helper.py +345 -342
  11. clarifai/client/auth/register.py +7 -7
  12. clarifai/client/auth/stub.py +107 -106
  13. clarifai/client/base.py +185 -178
  14. clarifai/client/compute_cluster.py +214 -180
  15. clarifai/client/dataset.py +793 -698
  16. clarifai/client/deployment.py +55 -50
  17. clarifai/client/input.py +1223 -1088
  18. clarifai/client/lister.py +47 -45
  19. clarifai/client/model.py +1939 -1717
  20. clarifai/client/model_client.py +525 -502
  21. clarifai/client/module.py +82 -73
  22. clarifai/client/nodepool.py +358 -213
  23. clarifai/client/runner.py +58 -0
  24. clarifai/client/search.py +342 -309
  25. clarifai/client/user.py +419 -414
  26. clarifai/client/workflow.py +294 -274
  27. clarifai/constants/dataset.py +11 -17
  28. clarifai/constants/model.py +8 -2
  29. clarifai/datasets/export/inputs_annotations.py +233 -217
  30. clarifai/datasets/upload/base.py +63 -51
  31. clarifai/datasets/upload/features.py +43 -38
  32. clarifai/datasets/upload/image.py +237 -207
  33. clarifai/datasets/upload/loaders/coco_captions.py +34 -32
  34. clarifai/datasets/upload/loaders/coco_detection.py +72 -65
  35. clarifai/datasets/upload/loaders/imagenet_classification.py +57 -53
  36. clarifai/datasets/upload/loaders/xview_detection.py +274 -132
  37. clarifai/datasets/upload/multimodal.py +55 -46
  38. clarifai/datasets/upload/text.py +55 -47
  39. clarifai/datasets/upload/utils.py +250 -234
  40. clarifai/errors.py +51 -50
  41. clarifai/models/api.py +260 -238
  42. clarifai/modules/css.py +50 -50
  43. clarifai/modules/pages.py +33 -33
  44. clarifai/rag/rag.py +312 -288
  45. clarifai/rag/utils.py +91 -84
  46. clarifai/runners/models/model_builder.py +906 -802
  47. clarifai/runners/models/model_class.py +370 -331
  48. clarifai/runners/models/model_run_locally.py +459 -419
  49. clarifai/runners/models/model_runner.py +170 -162
  50. clarifai/runners/models/model_servicer.py +78 -70
  51. clarifai/runners/server.py +111 -101
  52. clarifai/runners/utils/code_script.py +225 -187
  53. clarifai/runners/utils/const.py +4 -1
  54. clarifai/runners/utils/data_types/__init__.py +12 -0
  55. clarifai/runners/utils/data_types/data_types.py +598 -0
  56. clarifai/runners/utils/data_utils.py +387 -440
  57. clarifai/runners/utils/loader.py +247 -227
  58. clarifai/runners/utils/method_signatures.py +411 -386
  59. clarifai/runners/utils/openai_convertor.py +108 -109
  60. clarifai/runners/utils/serializers.py +175 -179
  61. clarifai/runners/utils/url_fetcher.py +35 -35
  62. clarifai/schema/search.py +56 -63
  63. clarifai/urls/helper.py +125 -102
  64. clarifai/utils/cli.py +129 -123
  65. clarifai/utils/config.py +127 -87
  66. clarifai/utils/constants.py +49 -0
  67. clarifai/utils/evaluation/helpers.py +503 -466
  68. clarifai/utils/evaluation/main.py +431 -393
  69. clarifai/utils/evaluation/testset_annotation_parser.py +154 -144
  70. clarifai/utils/logging.py +324 -306
  71. clarifai/utils/misc.py +60 -56
  72. clarifai/utils/model_train.py +165 -146
  73. clarifai/utils/protobuf.py +126 -103
  74. clarifai/versions.py +3 -1
  75. clarifai/workflows/export.py +48 -50
  76. clarifai/workflows/utils.py +39 -36
  77. clarifai/workflows/validate.py +55 -43
  78. {clarifai-11.3.0rc2.dist-info → clarifai-11.4.0.dist-info}/METADATA +16 -6
  79. clarifai-11.4.0.dist-info/RECORD +109 -0
  80. {clarifai-11.3.0rc2.dist-info → clarifai-11.4.0.dist-info}/WHEEL +1 -1
  81. clarifai/__pycache__/__init__.cpython-310.pyc +0 -0
  82. clarifai/__pycache__/__init__.cpython-311.pyc +0 -0
  83. clarifai/__pycache__/__init__.cpython-39.pyc +0 -0
  84. clarifai/__pycache__/errors.cpython-310.pyc +0 -0
  85. clarifai/__pycache__/errors.cpython-311.pyc +0 -0
  86. clarifai/__pycache__/versions.cpython-310.pyc +0 -0
  87. clarifai/__pycache__/versions.cpython-311.pyc +0 -0
  88. clarifai/cli/__pycache__/__init__.cpython-310.pyc +0 -0
  89. clarifai/cli/__pycache__/__init__.cpython-311.pyc +0 -0
  90. clarifai/cli/__pycache__/base.cpython-310.pyc +0 -0
  91. clarifai/cli/__pycache__/base.cpython-311.pyc +0 -0
  92. clarifai/cli/__pycache__/base_cli.cpython-310.pyc +0 -0
  93. clarifai/cli/__pycache__/compute_cluster.cpython-310.pyc +0 -0
  94. clarifai/cli/__pycache__/compute_cluster.cpython-311.pyc +0 -0
  95. clarifai/cli/__pycache__/deployment.cpython-310.pyc +0 -0
  96. clarifai/cli/__pycache__/deployment.cpython-311.pyc +0 -0
  97. clarifai/cli/__pycache__/model.cpython-310.pyc +0 -0
  98. clarifai/cli/__pycache__/model.cpython-311.pyc +0 -0
  99. clarifai/cli/__pycache__/model_cli.cpython-310.pyc +0 -0
  100. clarifai/cli/__pycache__/nodepool.cpython-310.pyc +0 -0
  101. clarifai/cli/__pycache__/nodepool.cpython-311.pyc +0 -0
  102. clarifai/client/__pycache__/__init__.cpython-310.pyc +0 -0
  103. clarifai/client/__pycache__/__init__.cpython-311.pyc +0 -0
  104. clarifai/client/__pycache__/__init__.cpython-39.pyc +0 -0
  105. clarifai/client/__pycache__/app.cpython-310.pyc +0 -0
  106. clarifai/client/__pycache__/app.cpython-311.pyc +0 -0
  107. clarifai/client/__pycache__/app.cpython-39.pyc +0 -0
  108. clarifai/client/__pycache__/base.cpython-310.pyc +0 -0
  109. clarifai/client/__pycache__/base.cpython-311.pyc +0 -0
  110. clarifai/client/__pycache__/compute_cluster.cpython-310.pyc +0 -0
  111. clarifai/client/__pycache__/compute_cluster.cpython-311.pyc +0 -0
  112. clarifai/client/__pycache__/dataset.cpython-310.pyc +0 -0
  113. clarifai/client/__pycache__/dataset.cpython-311.pyc +0 -0
  114. clarifai/client/__pycache__/deployment.cpython-310.pyc +0 -0
  115. clarifai/client/__pycache__/deployment.cpython-311.pyc +0 -0
  116. clarifai/client/__pycache__/input.cpython-310.pyc +0 -0
  117. clarifai/client/__pycache__/input.cpython-311.pyc +0 -0
  118. clarifai/client/__pycache__/lister.cpython-310.pyc +0 -0
  119. clarifai/client/__pycache__/lister.cpython-311.pyc +0 -0
  120. clarifai/client/__pycache__/model.cpython-310.pyc +0 -0
  121. clarifai/client/__pycache__/model.cpython-311.pyc +0 -0
  122. clarifai/client/__pycache__/module.cpython-310.pyc +0 -0
  123. clarifai/client/__pycache__/module.cpython-311.pyc +0 -0
  124. clarifai/client/__pycache__/nodepool.cpython-310.pyc +0 -0
  125. clarifai/client/__pycache__/nodepool.cpython-311.pyc +0 -0
  126. clarifai/client/__pycache__/search.cpython-310.pyc +0 -0
  127. clarifai/client/__pycache__/search.cpython-311.pyc +0 -0
  128. clarifai/client/__pycache__/user.cpython-310.pyc +0 -0
  129. clarifai/client/__pycache__/user.cpython-311.pyc +0 -0
  130. clarifai/client/__pycache__/workflow.cpython-310.pyc +0 -0
  131. clarifai/client/__pycache__/workflow.cpython-311.pyc +0 -0
  132. clarifai/client/auth/__pycache__/__init__.cpython-310.pyc +0 -0
  133. clarifai/client/auth/__pycache__/__init__.cpython-311.pyc +0 -0
  134. clarifai/client/auth/__pycache__/helper.cpython-310.pyc +0 -0
  135. clarifai/client/auth/__pycache__/helper.cpython-311.pyc +0 -0
  136. clarifai/client/auth/__pycache__/register.cpython-310.pyc +0 -0
  137. clarifai/client/auth/__pycache__/register.cpython-311.pyc +0 -0
  138. clarifai/client/auth/__pycache__/stub.cpython-310.pyc +0 -0
  139. clarifai/client/auth/__pycache__/stub.cpython-311.pyc +0 -0
  140. clarifai/client/cli/__init__.py +0 -0
  141. clarifai/client/cli/__pycache__/__init__.cpython-310.pyc +0 -0
  142. clarifai/client/cli/__pycache__/base_cli.cpython-310.pyc +0 -0
  143. clarifai/client/cli/__pycache__/model_cli.cpython-310.pyc +0 -0
  144. clarifai/client/cli/base_cli.py +0 -88
  145. clarifai/client/cli/model_cli.py +0 -29
  146. clarifai/constants/__pycache__/base.cpython-310.pyc +0 -0
  147. clarifai/constants/__pycache__/base.cpython-311.pyc +0 -0
  148. clarifai/constants/__pycache__/dataset.cpython-310.pyc +0 -0
  149. clarifai/constants/__pycache__/dataset.cpython-311.pyc +0 -0
  150. clarifai/constants/__pycache__/input.cpython-310.pyc +0 -0
  151. clarifai/constants/__pycache__/input.cpython-311.pyc +0 -0
  152. clarifai/constants/__pycache__/model.cpython-310.pyc +0 -0
  153. clarifai/constants/__pycache__/model.cpython-311.pyc +0 -0
  154. clarifai/constants/__pycache__/rag.cpython-310.pyc +0 -0
  155. clarifai/constants/__pycache__/rag.cpython-311.pyc +0 -0
  156. clarifai/constants/__pycache__/search.cpython-310.pyc +0 -0
  157. clarifai/constants/__pycache__/search.cpython-311.pyc +0 -0
  158. clarifai/constants/__pycache__/workflow.cpython-310.pyc +0 -0
  159. clarifai/constants/__pycache__/workflow.cpython-311.pyc +0 -0
  160. clarifai/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
  161. clarifai/datasets/__pycache__/__init__.cpython-311.pyc +0 -0
  162. clarifai/datasets/__pycache__/__init__.cpython-39.pyc +0 -0
  163. clarifai/datasets/export/__pycache__/__init__.cpython-310.pyc +0 -0
  164. clarifai/datasets/export/__pycache__/__init__.cpython-311.pyc +0 -0
  165. clarifai/datasets/export/__pycache__/__init__.cpython-39.pyc +0 -0
  166. clarifai/datasets/export/__pycache__/inputs_annotations.cpython-310.pyc +0 -0
  167. clarifai/datasets/export/__pycache__/inputs_annotations.cpython-311.pyc +0 -0
  168. clarifai/datasets/upload/__pycache__/__init__.cpython-310.pyc +0 -0
  169. clarifai/datasets/upload/__pycache__/__init__.cpython-311.pyc +0 -0
  170. clarifai/datasets/upload/__pycache__/__init__.cpython-39.pyc +0 -0
  171. clarifai/datasets/upload/__pycache__/base.cpython-310.pyc +0 -0
  172. clarifai/datasets/upload/__pycache__/base.cpython-311.pyc +0 -0
  173. clarifai/datasets/upload/__pycache__/features.cpython-310.pyc +0 -0
  174. clarifai/datasets/upload/__pycache__/features.cpython-311.pyc +0 -0
  175. clarifai/datasets/upload/__pycache__/image.cpython-310.pyc +0 -0
  176. clarifai/datasets/upload/__pycache__/image.cpython-311.pyc +0 -0
  177. clarifai/datasets/upload/__pycache__/multimodal.cpython-310.pyc +0 -0
  178. clarifai/datasets/upload/__pycache__/multimodal.cpython-311.pyc +0 -0
  179. clarifai/datasets/upload/__pycache__/text.cpython-310.pyc +0 -0
  180. clarifai/datasets/upload/__pycache__/text.cpython-311.pyc +0 -0
  181. clarifai/datasets/upload/__pycache__/utils.cpython-310.pyc +0 -0
  182. clarifai/datasets/upload/__pycache__/utils.cpython-311.pyc +0 -0
  183. clarifai/datasets/upload/loaders/__pycache__/__init__.cpython-311.pyc +0 -0
  184. clarifai/datasets/upload/loaders/__pycache__/__init__.cpython-39.pyc +0 -0
  185. clarifai/datasets/upload/loaders/__pycache__/coco_detection.cpython-311.pyc +0 -0
  186. clarifai/datasets/upload/loaders/__pycache__/imagenet_classification.cpython-311.pyc +0 -0
  187. clarifai/models/__pycache__/__init__.cpython-39.pyc +0 -0
  188. clarifai/modules/__pycache__/__init__.cpython-39.pyc +0 -0
  189. clarifai/rag/__pycache__/__init__.cpython-310.pyc +0 -0
  190. clarifai/rag/__pycache__/__init__.cpython-311.pyc +0 -0
  191. clarifai/rag/__pycache__/__init__.cpython-39.pyc +0 -0
  192. clarifai/rag/__pycache__/rag.cpython-310.pyc +0 -0
  193. clarifai/rag/__pycache__/rag.cpython-311.pyc +0 -0
  194. clarifai/rag/__pycache__/rag.cpython-39.pyc +0 -0
  195. clarifai/rag/__pycache__/utils.cpython-310.pyc +0 -0
  196. clarifai/rag/__pycache__/utils.cpython-311.pyc +0 -0
  197. clarifai/runners/__pycache__/__init__.cpython-310.pyc +0 -0
  198. clarifai/runners/__pycache__/__init__.cpython-311.pyc +0 -0
  199. clarifai/runners/__pycache__/__init__.cpython-39.pyc +0 -0
  200. clarifai/runners/dockerfile_template/Dockerfile.cpu.template +0 -31
  201. clarifai/runners/dockerfile_template/Dockerfile.cuda.template +0 -42
  202. clarifai/runners/dockerfile_template/Dockerfile.nim +0 -71
  203. clarifai/runners/models/__pycache__/__init__.cpython-310.pyc +0 -0
  204. clarifai/runners/models/__pycache__/__init__.cpython-311.pyc +0 -0
  205. clarifai/runners/models/__pycache__/__init__.cpython-39.pyc +0 -0
  206. clarifai/runners/models/__pycache__/base_typed_model.cpython-310.pyc +0 -0
  207. clarifai/runners/models/__pycache__/base_typed_model.cpython-311.pyc +0 -0
  208. clarifai/runners/models/__pycache__/base_typed_model.cpython-39.pyc +0 -0
  209. clarifai/runners/models/__pycache__/model_builder.cpython-311.pyc +0 -0
  210. clarifai/runners/models/__pycache__/model_class.cpython-310.pyc +0 -0
  211. clarifai/runners/models/__pycache__/model_class.cpython-311.pyc +0 -0
  212. clarifai/runners/models/__pycache__/model_run_locally.cpython-310-pytest-7.1.2.pyc +0 -0
  213. clarifai/runners/models/__pycache__/model_run_locally.cpython-310.pyc +0 -0
  214. clarifai/runners/models/__pycache__/model_run_locally.cpython-311.pyc +0 -0
  215. clarifai/runners/models/__pycache__/model_runner.cpython-310.pyc +0 -0
  216. clarifai/runners/models/__pycache__/model_runner.cpython-311.pyc +0 -0
  217. clarifai/runners/models/__pycache__/model_upload.cpython-310.pyc +0 -0
  218. clarifai/runners/models/base_typed_model.py +0 -238
  219. clarifai/runners/models/model_class_refract.py +0 -80
  220. clarifai/runners/models/model_upload.py +0 -607
  221. clarifai/runners/models/temp.py +0 -25
  222. clarifai/runners/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  223. clarifai/runners/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  224. clarifai/runners/utils/__pycache__/__init__.cpython-38.pyc +0 -0
  225. clarifai/runners/utils/__pycache__/__init__.cpython-39.pyc +0 -0
  226. clarifai/runners/utils/__pycache__/buffered_stream.cpython-310.pyc +0 -0
  227. clarifai/runners/utils/__pycache__/buffered_stream.cpython-38.pyc +0 -0
  228. clarifai/runners/utils/__pycache__/buffered_stream.cpython-39.pyc +0 -0
  229. clarifai/runners/utils/__pycache__/const.cpython-310.pyc +0 -0
  230. clarifai/runners/utils/__pycache__/const.cpython-311.pyc +0 -0
  231. clarifai/runners/utils/__pycache__/constants.cpython-310.pyc +0 -0
  232. clarifai/runners/utils/__pycache__/constants.cpython-38.pyc +0 -0
  233. clarifai/runners/utils/__pycache__/constants.cpython-39.pyc +0 -0
  234. clarifai/runners/utils/__pycache__/data_handler.cpython-310.pyc +0 -0
  235. clarifai/runners/utils/__pycache__/data_handler.cpython-311.pyc +0 -0
  236. clarifai/runners/utils/__pycache__/data_handler.cpython-38.pyc +0 -0
  237. clarifai/runners/utils/__pycache__/data_handler.cpython-39.pyc +0 -0
  238. clarifai/runners/utils/__pycache__/data_utils.cpython-310.pyc +0 -0
  239. clarifai/runners/utils/__pycache__/data_utils.cpython-311.pyc +0 -0
  240. clarifai/runners/utils/__pycache__/data_utils.cpython-38.pyc +0 -0
  241. clarifai/runners/utils/__pycache__/data_utils.cpython-39.pyc +0 -0
  242. clarifai/runners/utils/__pycache__/grpc_server.cpython-310.pyc +0 -0
  243. clarifai/runners/utils/__pycache__/grpc_server.cpython-38.pyc +0 -0
  244. clarifai/runners/utils/__pycache__/grpc_server.cpython-39.pyc +0 -0
  245. clarifai/runners/utils/__pycache__/health.cpython-310.pyc +0 -0
  246. clarifai/runners/utils/__pycache__/health.cpython-38.pyc +0 -0
  247. clarifai/runners/utils/__pycache__/health.cpython-39.pyc +0 -0
  248. clarifai/runners/utils/__pycache__/loader.cpython-310.pyc +0 -0
  249. clarifai/runners/utils/__pycache__/loader.cpython-311.pyc +0 -0
  250. clarifai/runners/utils/__pycache__/logging.cpython-310.pyc +0 -0
  251. clarifai/runners/utils/__pycache__/logging.cpython-38.pyc +0 -0
  252. clarifai/runners/utils/__pycache__/logging.cpython-39.pyc +0 -0
  253. clarifai/runners/utils/__pycache__/stream_source.cpython-310.pyc +0 -0
  254. clarifai/runners/utils/__pycache__/stream_source.cpython-39.pyc +0 -0
  255. clarifai/runners/utils/__pycache__/url_fetcher.cpython-310.pyc +0 -0
  256. clarifai/runners/utils/__pycache__/url_fetcher.cpython-311.pyc +0 -0
  257. clarifai/runners/utils/__pycache__/url_fetcher.cpython-38.pyc +0 -0
  258. clarifai/runners/utils/__pycache__/url_fetcher.cpython-39.pyc +0 -0
  259. clarifai/runners/utils/data_handler.py +0 -231
  260. clarifai/runners/utils/data_handler_refract.py +0 -213
  261. clarifai/runners/utils/data_types.py +0 -469
  262. clarifai/runners/utils/logger.py +0 -0
  263. clarifai/runners/utils/openai_format.py +0 -87
  264. clarifai/schema/__pycache__/search.cpython-310.pyc +0 -0
  265. clarifai/schema/__pycache__/search.cpython-311.pyc +0 -0
  266. clarifai/urls/__pycache__/helper.cpython-310.pyc +0 -0
  267. clarifai/urls/__pycache__/helper.cpython-311.pyc +0 -0
  268. clarifai/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  269. clarifai/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  270. clarifai/utils/__pycache__/__init__.cpython-39.pyc +0 -0
  271. clarifai/utils/__pycache__/cli.cpython-310.pyc +0 -0
  272. clarifai/utils/__pycache__/cli.cpython-311.pyc +0 -0
  273. clarifai/utils/__pycache__/config.cpython-311.pyc +0 -0
  274. clarifai/utils/__pycache__/constants.cpython-310.pyc +0 -0
  275. clarifai/utils/__pycache__/constants.cpython-311.pyc +0 -0
  276. clarifai/utils/__pycache__/logging.cpython-310.pyc +0 -0
  277. clarifai/utils/__pycache__/logging.cpython-311.pyc +0 -0
  278. clarifai/utils/__pycache__/misc.cpython-310.pyc +0 -0
  279. clarifai/utils/__pycache__/misc.cpython-311.pyc +0 -0
  280. clarifai/utils/__pycache__/model_train.cpython-310.pyc +0 -0
  281. clarifai/utils/__pycache__/model_train.cpython-311.pyc +0 -0
  282. clarifai/utils/__pycache__/protobuf.cpython-311.pyc +0 -0
  283. clarifai/utils/evaluation/__pycache__/__init__.cpython-311.pyc +0 -0
  284. clarifai/utils/evaluation/__pycache__/__init__.cpython-39.pyc +0 -0
  285. clarifai/utils/evaluation/__pycache__/helpers.cpython-311.pyc +0 -0
  286. clarifai/utils/evaluation/__pycache__/main.cpython-311.pyc +0 -0
  287. clarifai/utils/evaluation/__pycache__/main.cpython-39.pyc +0 -0
  288. clarifai/workflows/__pycache__/__init__.cpython-310.pyc +0 -0
  289. clarifai/workflows/__pycache__/__init__.cpython-311.pyc +0 -0
  290. clarifai/workflows/__pycache__/__init__.cpython-39.pyc +0 -0
  291. clarifai/workflows/__pycache__/export.cpython-310.pyc +0 -0
  292. clarifai/workflows/__pycache__/export.cpython-311.pyc +0 -0
  293. clarifai/workflows/__pycache__/utils.cpython-310.pyc +0 -0
  294. clarifai/workflows/__pycache__/utils.cpython-311.pyc +0 -0
  295. clarifai/workflows/__pycache__/validate.cpython-310.pyc +0 -0
  296. clarifai/workflows/__pycache__/validate.cpython-311.pyc +0 -0
  297. clarifai-11.3.0rc2.dist-info/RECORD +0 -322
  298. {clarifai-11.3.0rc2.dist-info → clarifai-11.4.0.dist-info}/entry_points.txt +0 -0
  299. {clarifai-11.3.0rc2.dist-info → clarifai-11.4.0.dist-info/licenses}/LICENSE +0 -0
  300. {clarifai-11.3.0rc2.dist-info → clarifai-11.4.0.dist-info}/top_level.txt +0 -0
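The largest source change in this release is the rewrite of clarifai/client/dataset.py shown below. As a quick orientation to the public API that the diff touches, here is a minimal usage sketch based only on the docstring examples visible in that diff; the user, app, and dataset IDs and the folder path are placeholders, and authentication is assumed to come from the CLARIFAI_PAT environment variable.

```python
import os

from clarifai.client.dataset import Dataset

# Placeholder credentials and IDs; a real PAT is assumed to be set in the environment.
os.environ.setdefault("CLARIFAI_PAT", "<your-personal-access-token>")

dataset = Dataset(user_id="user_id", app_id="demo_app", dataset_id="demo_dataset")

# Upload a folder of images; folder names become labels and filenames become input IDs.
dataset.upload_from_folder(folder_path="path/to/images", input_type="image", labels=True)

# Snapshot the uploaded data as a new dataset version.
dataset.create_version(description="initial upload")
```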
clarifai/client/dataset.py
@@ -20,11 +20,16 @@ from clarifai.client.base import BaseClient
  from clarifai.client.input import Inputs
  from clarifai.client.lister import Lister
  from clarifai.constants.dataset import DATASET_UPLOAD_TASKS, MAX_RETRIES
- from clarifai.datasets.export.inputs_annotations import (DatasetExportReader,
- InputAnnotationDownloader)
+ from clarifai.datasets.export.inputs_annotations import (
+ DatasetExportReader,
+ InputAnnotationDownloader,
+ )
  from clarifai.datasets.upload.base import ClarifaiDataLoader
- from clarifai.datasets.upload.image import (VisualClassificationDataset, VisualDetectionDataset,
- VisualSegmentationDataset)
+ from clarifai.datasets.upload.image import (
+ VisualClassificationDataset,
+ VisualDetectionDataset,
+ VisualSegmentationDataset,
+ )
  from clarifai.datasets.upload.multimodal import MultiModalDataset
  from clarifai.datasets.upload.text import TextClassificationDataset
  from clarifai.datasets.upload.utils import DisplayUploadStatus
@@ -33,709 +38,799 @@ from clarifai.urls.helper import ClarifaiUrlHelper
  from clarifai.utils.logging import add_file_handler, logger, process_log_files
  from clarifai.utils.misc import BackoffIterator, Chunker

- ClarifaiDatasetType = TypeVar('ClarifaiDatasetType', VisualClassificationDataset,
- VisualDetectionDataset, VisualSegmentationDataset,
- TextClassificationDataset)
+ ClarifaiDatasetType = TypeVar(
+ 'ClarifaiDatasetType',
+ VisualClassificationDataset,
+ VisualDetectionDataset,
+ VisualSegmentationDataset,
+ TextClassificationDataset,
+ )


  class Dataset(Lister, BaseClient):
- """Dataset is a class that provides access to Clarifai API endpoints related to Dataset information."""
-
- def __init__(self,
- url: str = None,
- dataset_id: str = None,
- dataset_version_id: str = None,
- base_url: str = "https://api.clarifai.com",
- pat: str = None,
- token: str = None,
- root_certificates_path: str = None,
- **kwargs):
- """Initializes a Dataset object.
-
- Args:
- url (str): The URL to initialize the dataset object.
- dataset_id (str): The Dataset ID within the App to interact with.
- dataset_version_id (str): The Dataset Version ID within the Dataset to interact with.
- base_url (str): Base API url. Default "https://api.clarifai.com"
- pat (str): A personal access token for authentication. Can be set as env var CLARIFAI_PAT
- token (str): A session token for authentication. Accepts either a session token or a pat. Can be set as env var CLARIFAI_SESSION_TOKEN
- root_certificates_path (str): Path to the SSL root certificates file, used to establish secure gRPC connections.
- **kwargs: Additional keyword arguments to be passed to the Dataset.
- """
- if url and dataset_id:
- raise UserError("You can only specify one of url or dataset_id.")
- if url:
- user_id, app_id, _, dataset_id, dataset_version_id = ClarifaiUrlHelper.split_clarifai_url(
- url)
- kwargs = {'user_id': user_id, 'app_id': app_id}
- dataset_version = {
- 'id': dataset_version_id
- } if dataset_version_id else kwargs['version'] if 'version' in kwargs else None
- self.kwargs = {**kwargs, 'id': dataset_id, 'version': dataset_version}
- self.dataset_info = resources_pb2.Dataset(**self.kwargs)
- # Related to Dataset Upload
- self.num_workers: int = min(10, cpu_count()) #15 req/sec rate limit
- self.annot_num_workers = 4
- self.max_retires = 10
- self.batch_size = 128 # limit max protos in a req
- self.task = None # Upload dataset type
- self.input_object = Inputs(
- user_id=self.user_id,
- app_id=self.app_id,
- pat=pat,
- token=token,
- base_url=base_url,
- root_certificates_path=root_certificates_path)
- self.logger = logger
- BaseClient.__init__(
+ """Dataset is a class that provides access to Clarifai API endpoints related to Dataset information."""
+
+ def __init__(
  self,
- user_id=self.user_id,
- app_id=self.app_id,
- base=base_url,
- pat=pat,
- token=token,
- root_certificates_path=root_certificates_path)
- Lister.__init__(self)
-
- def create_version(self, **kwargs) -> 'Dataset':
- """Creates a dataset version for the Dataset.
-
- Args:
- **kwargs: Additional keyword arguments to be passed to Dataset Version.
- - description (str): The description of the dataset version.
- - metadata (dict): The metadata of the dataset version.
-
- Returns:
- Dataset: A Dataset object for the specified dataset ID.
-
- Example:
- >>> from clarifai.client.dataset import Dataset
- >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
- >>> dataset_version = dataset.create_version(description='dataset_version_description')
- """
- request = service_pb2.PostDatasetVersionsRequest(
- user_app_id=self.user_app_id,
- dataset_id=self.id,
- dataset_versions=[resources_pb2.DatasetVersion(**kwargs)])
-
- response = self._grpc_request(self.STUB.PostDatasetVersions, request)
- if response.status.code != status_code_pb2.SUCCESS:
- raise Exception(response.status)
- self.logger.info("\nDataset Version created\n%s", response.status)
- kwargs.update({
- 'dataset_id': self.id,
- 'version': response.dataset_versions[0],
- })
-
- return Dataset.from_auth_helper(self.auth_helper, **kwargs)
-
- def delete_version(self, version_id: str) -> None:
- """Deletes a dataset version for the Dataset.
-
- Args:
- version_id (str): The version ID to delete.
-
- Example:
- >>> from clarifai.client.dataset import Dataset
- >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
- >>> dataset.delete_version(version_id='version_id')
- """
- request = service_pb2.DeleteDatasetVersionsRequest(
- user_app_id=self.user_app_id, dataset_id=self.id, dataset_version_ids=[version_id])
-
- response = self._grpc_request(self.STUB.DeleteDatasetVersions, request)
- if response.status.code != status_code_pb2.SUCCESS:
- raise Exception(response.status)
- self.logger.info("\nDataset Version Deleted\n%s", response.status)
-
- def list_versions(self, page_no: int = None,
- per_page: int = None) -> Generator['Dataset', None, None]:
- """Lists all the versions for the dataset.
-
- Args:
- page_no (int): The page number to list.
- per_page (int): The number of items per page.
-
- Yields:
- Dataset: Dataset objects for the versions of the dataset.
-
- Example:
- >>> from clarifai.client.dataset import Dataset
- >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
- >>> all_dataset_versions = list(dataset.list_versions())
-
- Note:
- Defaults to 16 per page if page_no is specified and per_page is not specified.
- If both page_no and per_page are None, then lists all the resources.
- """
- request_data = dict(
- user_app_id=self.user_app_id,
- dataset_id=self.id,
- )
- all_dataset_versions_info = self.list_pages_generator(
- self.STUB.ListDatasetVersions,
- service_pb2.ListDatasetVersionsRequest,
- request_data,
- per_page=per_page,
- page_no=page_no)
-
- for dataset_version_info in all_dataset_versions_info:
- dataset_version_info['id'] = dataset_version_info['dataset_version_id']
- del dataset_version_info['dataset_version_id']
- dataset_version_info.pop('metrics', None)
- dataset_version_info.pop('export_info', None)
- kwargs = {
- 'dataset_id': self.id,
- 'version': resources_pb2.DatasetVersion(**dataset_version_info),
- }
- yield Dataset.from_auth_helper(self.auth_helper, **kwargs)
-
- def list_inputs(self, page_no: int = None, per_page: int = None,
- input_type: str = None) -> Generator[Input, None, None]:
- """Lists all the inputs for the dataset.
-
- Args:
- page_no (int): The page number to list.
- per_page (int): The number of items per page.
- input_type (str): The type of input to list. Options: 'image', 'video', 'audio', 'text'.
-
- Yields:
- Input: Input objects in the dataset.
-
- Example:
- >>> from clarifai.client.dataset import Dataset
- >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
- >>> all_dataset_inputs = list(dataset.list_inputs())
- """
- return self.input_object.list_inputs(
- dataset_id=self.id, page_no=page_no, per_page=per_page, input_type=input_type)
-
- def __iter__(self):
- return iter(DatasetExportReader(archive_url=self.archive_zip()))
-
- def _concurrent_annot_upload(self, annots: List[List[resources_pb2.Annotation]]
- ) -> Union[List[resources_pb2.Annotation], List[None]]:
- """Uploads annotations concurrently.
-
- Args:
- annots: annot protos
-
- Returns:
- retry_annot_upload: All failed annot protos during upload
- """
- annot_threads = []
- retry_annot_upload = []
-
- with ThreadPoolExecutor(max_workers=self.annot_num_workers) as executor: # limit annot workers
- annot_threads = [
- executor.submit(self.input_object.upload_annotations, inp_batch, False)
- for inp_batch in annots
- ]
-
- for job in as_completed(annot_threads):
- result = job.result()
- if result:
- retry_annot_upload.extend(result)
-
- return retry_annot_upload
-
- def _delete_failed_inputs(self,
- batch_input_ids: List[int],
- dataset_obj: ClarifaiDatasetType,
- upload_response: MultiInputResponse = None,
- batch_no: Optional[int] = None) -> Tuple[List[int], List[int]]:
- """Delete failed input ids from clarifai platform dataset.
-
- Args:
- batch_input_ids: batch input ids
- dataset_obj: ClarifaiDataset object
- upload_response: upload response proto
-
- Returns:
- success_inputs: upload success input ids
- failed_inputs: upload failed input ids
- """
- success_status = status_pb2.Status(code=status_code_pb2.INPUT_DOWNLOAD_SUCCESS)
- input_ids = {dataset_obj.all_input_ids[id]: id for id in batch_input_ids}
- response = self._grpc_request(
- self.STUB.ListInputs,
- service_pb2.ListInputsRequest(
- ids=list(input_ids.keys()),
- per_page=len(input_ids),
+ url: str = None,
+ dataset_id: str = None,
+ dataset_version_id: str = None,
+ base_url: str = "https://api.clarifai.com",
+ pat: str = None,
+ token: str = None,
+ root_certificates_path: str = None,
+ **kwargs,
+ ):
+ """Initializes a Dataset object.
+
+ Args:
+ url (str): The URL to initialize the dataset object.
+ dataset_id (str): The Dataset ID within the App to interact with.
+ dataset_version_id (str): The Dataset Version ID within the Dataset to interact with.
+ base_url (str): Base API url. Default "https://api.clarifai.com"
+ pat (str): A personal access token for authentication. Can be set as env var CLARIFAI_PAT
+ token (str): A session token for authentication. Accepts either a session token or a pat. Can be set as env var CLARIFAI_SESSION_TOKEN
+ root_certificates_path (str): Path to the SSL root certificates file, used to establish secure gRPC connections.
+ **kwargs: Additional keyword arguments to be passed to the Dataset.
+ """
+ if url and dataset_id:
+ raise UserError("You can only specify one of url or dataset_id.")
+ if url:
+ user_id, app_id, _, dataset_id, dataset_version_id = (
+ ClarifaiUrlHelper.split_clarifai_url(url)
+ )
+ kwargs = {'user_id': user_id, 'app_id': app_id}
+ dataset_version = (
+ {'id': dataset_version_id}
+ if dataset_version_id
+ else kwargs['version']
+ if 'version' in kwargs
+ else None
+ )
+ self.kwargs = {**kwargs, 'id': dataset_id, 'version': dataset_version}
+ self.dataset_info = resources_pb2.Dataset(**self.kwargs)
+ # Related to Dataset Upload
+ self.num_workers: int = min(10, cpu_count()) # 15 req/sec rate limit
+ self.annot_num_workers = 4
+ self.max_retires = 10
+ self.batch_size = 128 # limit max protos in a req
+ self.task = None # Upload dataset type
+ self.input_object = Inputs(
+ user_id=self.user_id,
+ app_id=self.app_id,
+ pat=pat,
+ token=token,
+ base_url=base_url,
+ root_certificates_path=root_certificates_path,
+ )
+ self.logger = logger
+ BaseClient.__init__(
+ self,
+ user_id=self.user_id,
+ app_id=self.app_id,
+ base=base_url,
+ pat=pat,
+ token=token,
+ root_certificates_path=root_certificates_path,
+ )
+ Lister.__init__(self)
+
+ def create_version(self, **kwargs) -> 'Dataset':
+ """Creates a dataset version for the Dataset.
+
+ Args:
+ **kwargs: Additional keyword arguments to be passed to Dataset Version.
+ - description (str): The description of the dataset version.
+ - metadata (dict): The metadata of the dataset version.
+
+ Returns:
+ Dataset: A Dataset object for the specified dataset ID.
+
+ Example:
+ >>> from clarifai.client.dataset import Dataset
+ >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
+ >>> dataset_version = dataset.create_version(description='dataset_version_description')
+ """
+ request = service_pb2.PostDatasetVersionsRequest(
+ user_app_id=self.user_app_id,
+ dataset_id=self.id,
+ dataset_versions=[resources_pb2.DatasetVersion(**kwargs)],
+ )
+
+ response = self._grpc_request(self.STUB.PostDatasetVersions, request)
+ if response.status.code != status_code_pb2.SUCCESS:
+ raise Exception(response.status)
+ self.logger.info("\nDataset Version created\n%s", response.status)
+ kwargs.update(
+ {
+ 'dataset_id': self.id,
+ 'version': response.dataset_versions[0],
+ }
+ )
+
+ return Dataset.from_auth_helper(self.auth_helper, **kwargs)
+
+ def delete_version(self, version_id: str) -> None:
+ """Deletes a dataset version for the Dataset.
+
+ Args:
+ version_id (str): The version ID to delete.
+
+ Example:
+ >>> from clarifai.client.dataset import Dataset
+ >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
+ >>> dataset.delete_version(version_id='version_id')
+ """
+ request = service_pb2.DeleteDatasetVersionsRequest(
+ user_app_id=self.user_app_id, dataset_id=self.id, dataset_version_ids=[version_id]
+ )
+
+ response = self._grpc_request(self.STUB.DeleteDatasetVersions, request)
+ if response.status.code != status_code_pb2.SUCCESS:
+ raise Exception(response.status)
+ self.logger.info("\nDataset Version Deleted\n%s", response.status)
+
+ def list_versions(
+ self, page_no: int = None, per_page: int = None
+ ) -> Generator['Dataset', None, None]:
+ """Lists all the versions for the dataset.
+
+ Args:
+ page_no (int): The page number to list.
+ per_page (int): The number of items per page.
+
+ Yields:
+ Dataset: Dataset objects for the versions of the dataset.
+
+ Example:
+ >>> from clarifai.client.dataset import Dataset
+ >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
+ >>> all_dataset_versions = list(dataset.list_versions())
+
+ Note:
+ Defaults to 16 per page if page_no is specified and per_page is not specified.
+ If both page_no and per_page are None, then lists all the resources.
+ """
+ request_data = dict(
  user_app_id=self.user_app_id,
- status=success_status),
- )
- response_dict = MessageToDict(response)
- success_inputs = response_dict.get('inputs', [])
-
- success_input_ids = [input.get('id') for input in success_inputs]
- failed_input_ids = list(set(input_ids) - set(success_input_ids.copy()))
- #check duplicate input ids
- duplicate_input_ids = [
- input.id for input in upload_response.inputs
- if input.status.details == 'Input has a duplicate ID.'
- ] #handling duplicte ID failures.
- if duplicate_input_ids:
- success_input_ids = list(set(success_input_ids.copy()) - set(duplicate_input_ids.copy()))
- failed_input_ids = list(set(failed_input_ids) - set(duplicate_input_ids))
- duplicate_details = [[
- input_ids[id], id, "Input has a duplicate ID.",
- getattr(dataset_obj.data_generator[input_ids[id]], 'image_path', None),
- getattr(dataset_obj.data_generator[input_ids[id]], 'labels', None),
- getattr(dataset_obj.data_generator[input_ids[id]], 'metadata', None)
- ] for id in duplicate_input_ids]
- duplicate_table = tabulate(
- duplicate_details,
- headers=["Index", "Input ID", "Status", "Image Path", "Labels", "Metadata"],
- tablefmt="grid")
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
- self.logger.warning(
- f"{timestamp}\nFailed to upload {len(duplicate_input_ids)} inputs due to duplicate IDs in current batch {batch_no}:\n{duplicate_table}\n\n"
- )
-
- #delete failed inputs
- self._grpc_request(
- self.STUB.DeleteInputs,
- service_pb2.DeleteInputsRequest(user_app_id=self.user_app_id, ids=failed_input_ids),
- )
- return [input_ids[id] for id in success_input_ids], [input_ids[id] for id in failed_input_ids]
-
- def _upload_inputs_annotations(
- self,
- batch_input_ids: List[int],
- dataset_obj: ClarifaiDatasetType,
- batch_no: Optional[int] = None,
- is_retry_duplicates: bool = False,
- ) -> Tuple[List[int], List[resources_pb2.Annotation], MultiInputResponse]:
- """Uploads batch of inputs and annotations concurrently to clarifai platform dataset.
-
- Args:
- batch_input_ids: batch input ids
- dataset_obj: ClarifaiDataset object
-
- Returns:
- failed_input_ids: failed input ids
- retry_annot_protos: failed annot protos
- response: upload response proto
- """
- input_protos, _ = dataset_obj.get_protos(batch_input_ids)
- if is_retry_duplicates:
- for inp in input_protos:
- inp.id = uuid.uuid4().hex
-
- input_job_id, _response = self.input_object.upload_inputs(inputs=input_protos, show_log=False)
- retry_annot_protos = []
-
- self.input_object._wait_for_inputs(input_job_id)
- success_input_ids, failed_input_ids = self._delete_failed_inputs(batch_input_ids, dataset_obj,
- _response, batch_no)
-
- if self.task in ["visual_detection", "visual_segmentation"] and success_input_ids:
- _, annotation_protos = dataset_obj.get_protos(success_input_ids)
- chunked_annotation_protos = Chunker(annotation_protos, self.batch_size).chunk()
- retry_annot_protos.extend(self._concurrent_annot_upload(chunked_annotation_protos))
-
- return failed_input_ids, retry_annot_protos, _response
-
- def _retry_uploads(self, failed_input_ids: List[int],
- retry_annot_protos: List[resources_pb2.Annotation],
- dataset_obj: ClarifaiDatasetType, batch_no: Optional[int]) -> None:
- """Retry failed uploads.
-
- Args:
- failed_input_ids: failed input ids
- retry_annot_protos: failed annot protos
- dataset_obj: ClarifaiDataset object
- """
-
- for _retry in range(MAX_RETRIES):
- if not failed_input_ids and not retry_annot_protos:
- break
- if failed_input_ids:
- retry_input_ids = [dataset_obj.all_input_ids[id] for id in failed_input_ids]
- logger.warning(
- f"Retrying upload for {len(failed_input_ids)} inputs in current batch: {retry_input_ids}\n"
+ dataset_id=self.id,
+ )
+ all_dataset_versions_info = self.list_pages_generator(
+ self.STUB.ListDatasetVersions,
+ service_pb2.ListDatasetVersionsRequest,
+ request_data,
+ per_page=per_page,
+ page_no=page_no,
  )
- failed_retrying_inputs, _, retry_response = self._upload_inputs_annotations(
- failed_input_ids, dataset_obj, batch_no)
- failed_input_ids = failed_retrying_inputs
- if retry_annot_protos:
- chunked_annotation_protos = Chunker(retry_annot_protos, self.batch_size).chunk()
- _ = self._concurrent_annot_upload(chunked_annotation_protos)
-
- #Log failed inputs
- if failed_input_ids:
- failed_inputs_logs = []
- input_map = {input.id: input for input in retry_response.inputs}
- for index in failed_retrying_inputs:
- failed_id = dataset_obj.all_input_ids[index]
- input_details = input_map.get(failed_id)
- if input_details:
- failed_input_details = [
- index, failed_id, input_details.status.details,
- getattr(dataset_obj.data_generator[index], 'image_path', None) or
- getattr(dataset_obj.data_generator[index], 'text', None),
- dataset_obj.data_generator[index].labels, dataset_obj.data_generator[index].metadata
- ]
- failed_inputs_logs.append(failed_input_details)
-
- failed_table = tabulate(
- failed_inputs_logs,
- headers=["Index", "Input ID", "Status", "Input", "Labels", "Metadata"],
- tablefmt="grid")
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
- self.logger.warning(
- f"{timestamp}\nFailed to upload {len(failed_retrying_inputs)} inputs in current batch {batch_no} due to {retry_response}:\n{failed_table}\n\n"
- )
-
- def _data_upload(self,
- dataset_obj: ClarifaiDatasetType,
- is_log_retry: bool = False,
- log_retry_ids: List[int] = None,
- **kwargs) -> None:
- """Uploads inputs and annotations to clarifai platform dataset.
-
- Args:
- dataset_obj: ClarifaiDataset object,
- is_log_retry: True if the iteration is to retry uploads from logs.
- **kwargs: Additional keyword arguments for retry uploading functionality..
-
- Returns:
- None
- """
- if is_log_retry:
- input_ids = log_retry_ids
- else:
- input_ids = list(range(len(dataset_obj)))
-
- chunk_input_ids = Chunker(input_ids, self.batch_size).chunk()
- with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
- with tqdm(total=len(chunk_input_ids), desc='Uploading Dataset') as progress:
- # Submit all jobs to the executor and store the returned futures
- futures = [
- executor.submit(self._upload_inputs_annotations, batch_input_ids, dataset_obj,
- batch_no, **kwargs)
- for batch_no, batch_input_ids in enumerate(chunk_input_ids)
+
+ for dataset_version_info in all_dataset_versions_info:
+ dataset_version_info['id'] = dataset_version_info['dataset_version_id']
+ del dataset_version_info['dataset_version_id']
+ dataset_version_info.pop('metrics', None)
+ dataset_version_info.pop('export_info', None)
+ kwargs = {
+ 'dataset_id': self.id,
+ 'version': resources_pb2.DatasetVersion(**dataset_version_info),
+ }
+ yield Dataset.from_auth_helper(self.auth_helper, **kwargs)
+
+ def list_inputs(
+ self, page_no: int = None, per_page: int = None, input_type: str = None
+ ) -> Generator[Input, None, None]:
+ """Lists all the inputs for the dataset.
+
+ Args:
+ page_no (int): The page number to list.
+ per_page (int): The number of items per page.
+ input_type (str): The type of input to list. Options: 'image', 'video', 'audio', 'text'.
+
+ Yields:
+ Input: Input objects in the dataset.
+
+ Example:
+ >>> from clarifai.client.dataset import Dataset
+ >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
+ >>> all_dataset_inputs = list(dataset.list_inputs())
+ """
+ return self.input_object.list_inputs(
+ dataset_id=self.id, page_no=page_no, per_page=per_page, input_type=input_type
+ )
+
+ def __iter__(self):
+ return iter(DatasetExportReader(archive_url=self.archive_zip()))
+
+ def _concurrent_annot_upload(
+ self, annots: List[List[resources_pb2.Annotation]]
+ ) -> Union[List[resources_pb2.Annotation], List[None]]:
+ """Uploads annotations concurrently.
+
+ Args:
+ annots: annot protos
+
+ Returns:
+ retry_annot_upload: All failed annot protos during upload
+ """
+ annot_threads = []
+ retry_annot_upload = []
+
+ with ThreadPoolExecutor(
+ max_workers=self.annot_num_workers
+ ) as executor: # limit annot workers
+ annot_threads = [
+ executor.submit(self.input_object.upload_annotations, inp_batch, False)
+ for inp_batch in annots
+ ]
+
+ for job in as_completed(annot_threads):
+ result = job.result()
+ if result:
+ retry_annot_upload.extend(result)
+
+ return retry_annot_upload
+
+ def _delete_failed_inputs(
+ self,
+ batch_input_ids: List[int],
+ dataset_obj: ClarifaiDatasetType,
+ upload_response: MultiInputResponse = None,
+ batch_no: Optional[int] = None,
+ ) -> Tuple[List[int], List[int]]:
+ """Delete failed input ids from clarifai platform dataset.
+
+ Args:
+ batch_input_ids: batch input ids
+ dataset_obj: ClarifaiDataset object
+ upload_response: upload response proto
+
+ Returns:
+ success_inputs: upload success input ids
+ failed_inputs: upload failed input ids
+ """
+ success_status = status_pb2.Status(code=status_code_pb2.INPUT_DOWNLOAD_SUCCESS)
+ input_ids = {dataset_obj.all_input_ids[id]: id for id in batch_input_ids}
+ response = self._grpc_request(
+ self.STUB.ListInputs,
+ service_pb2.ListInputsRequest(
+ ids=list(input_ids.keys()),
+ per_page=len(input_ids),
+ user_app_id=self.user_app_id,
+ status=success_status,
+ ),
+ )
+ response_dict = MessageToDict(response)
+ success_inputs = response_dict.get('inputs', [])
+
+ success_input_ids = [input.get('id') for input in success_inputs]
+ failed_input_ids = list(set(input_ids) - set(success_input_ids.copy()))
+ # check duplicate input ids
+ duplicate_input_ids = [
+ input.id
+ for input in upload_response.inputs
+ if input.status.details == 'Input has a duplicate ID.'
+ ] # handling duplicte ID failures.
+ if duplicate_input_ids:
+ success_input_ids = list(
+ set(success_input_ids.copy()) - set(duplicate_input_ids.copy())
+ )
+ failed_input_ids = list(set(failed_input_ids) - set(duplicate_input_ids))
+ duplicate_details = [
+ [
+ input_ids[id],
+ id,
+ "Input has a duplicate ID.",
+ getattr(dataset_obj.data_generator[input_ids[id]], 'image_path', None),
+ getattr(dataset_obj.data_generator[input_ids[id]], 'labels', None),
+ getattr(dataset_obj.data_generator[input_ids[id]], 'metadata', None),
+ ]
+ for id in duplicate_input_ids
+ ]
+ duplicate_table = tabulate(
+ duplicate_details,
+ headers=["Index", "Input ID", "Status", "Image Path", "Labels", "Metadata"],
+ tablefmt="grid",
+ )
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ self.logger.warning(
+ f"{timestamp}\nFailed to upload {len(duplicate_input_ids)} inputs due to duplicate IDs in current batch {batch_no}:\n{duplicate_table}\n\n"
+ )
+
+ # delete failed inputs
+ self._grpc_request(
+ self.STUB.DeleteInputs,
+ service_pb2.DeleteInputsRequest(user_app_id=self.user_app_id, ids=failed_input_ids),
+ )
+ return [input_ids[id] for id in success_input_ids], [
+ input_ids[id] for id in failed_input_ids
  ]

- for batch_no, job in enumerate(as_completed(futures)):
- retry_input_ids, retry_annot_protos, _ = job.result()
- self._retry_uploads(retry_input_ids, retry_annot_protos, dataset_obj, batch_no)
- progress.update()
-
- def upload_dataset(self,
- dataloader: Type[ClarifaiDataLoader],
- batch_size: int = 32,
- get_upload_status: bool = False,
- log_warnings: bool = False,
- **kwargs) -> None:
- """Uploads a dataset to the app.
-
- Args:
- dataloader (Type[ClarifaiDataLoader]): ClarifaiDataLoader object
- batch_size (int): batch size for concurrent upload of inputs and annotations (max: 128)
- get_upload_status (bool): True if you want to get the upload status of the dataset
- log_warnings (bool): True if you want to save log warnings in a file
- kwargs: Additional keyword arguments for retry uploading functionality..
- """
- #set batch size and task
- self.batch_size = min(self.batch_size, batch_size)
- self.task = dataloader.task
- if self.task not in DATASET_UPLOAD_TASKS:
- raise UserError("Task should be one of \
+ def _upload_inputs_annotations(
+ self,
+ batch_input_ids: List[int],
+ dataset_obj: ClarifaiDatasetType,
+ batch_no: Optional[int] = None,
+ is_retry_duplicates: bool = False,
+ ) -> Tuple[List[int], List[resources_pb2.Annotation], MultiInputResponse]:
+ """Uploads batch of inputs and annotations concurrently to clarifai platform dataset.
+
+ Args:
+ batch_input_ids: batch input ids
+ dataset_obj: ClarifaiDataset object
+
+ Returns:
+ failed_input_ids: failed input ids
+ retry_annot_protos: failed annot protos
+ response: upload response proto
+ """
+ input_protos, _ = dataset_obj.get_protos(batch_input_ids)
+ if is_retry_duplicates:
+ for inp in input_protos:
+ inp.id = uuid.uuid4().hex
+
+ input_job_id, _response = self.input_object.upload_inputs(
+ inputs=input_protos, show_log=False
+ )
+ retry_annot_protos = []
+
+ self.input_object._wait_for_inputs(input_job_id)
+ success_input_ids, failed_input_ids = self._delete_failed_inputs(
+ batch_input_ids, dataset_obj, _response, batch_no
+ )
+
+ if self.task in ["visual_detection", "visual_segmentation"] and success_input_ids:
+ _, annotation_protos = dataset_obj.get_protos(success_input_ids)
+ chunked_annotation_protos = Chunker(annotation_protos, self.batch_size).chunk()
+ retry_annot_protos.extend(self._concurrent_annot_upload(chunked_annotation_protos))
+
+ return failed_input_ids, retry_annot_protos, _response
+
+ def _retry_uploads(
+ self,
+ failed_input_ids: List[int],
+ retry_annot_protos: List[resources_pb2.Annotation],
+ dataset_obj: ClarifaiDatasetType,
+ batch_no: Optional[int],
+ ) -> None:
+ """Retry failed uploads.
+
+ Args:
+ failed_input_ids: failed input ids
+ retry_annot_protos: failed annot protos
+ dataset_obj: ClarifaiDataset object
+ """
+
+ for _retry in range(MAX_RETRIES):
+ if not failed_input_ids and not retry_annot_protos:
+ break
+ if failed_input_ids:
+ retry_input_ids = [dataset_obj.all_input_ids[id] for id in failed_input_ids]
+ logger.warning(
+ f"Retrying upload for {len(failed_input_ids)} inputs in current batch: {retry_input_ids}\n"
+ )
+ failed_retrying_inputs, _, retry_response = self._upload_inputs_annotations(
+ failed_input_ids, dataset_obj, batch_no
+ )
+ failed_input_ids = failed_retrying_inputs
+ if retry_annot_protos:
+ chunked_annotation_protos = Chunker(retry_annot_protos, self.batch_size).chunk()
+ _ = self._concurrent_annot_upload(chunked_annotation_protos)
+
+ # Log failed inputs
+ if failed_input_ids:
+ failed_inputs_logs = []
+ input_map = {input.id: input for input in retry_response.inputs}
+ for index in failed_retrying_inputs:
+ failed_id = dataset_obj.all_input_ids[index]
+ input_details = input_map.get(failed_id)
+ if input_details:
+ failed_input_details = [
+ index,
+ failed_id,
+ input_details.status.details,
+ getattr(dataset_obj.data_generator[index], 'image_path', None)
+ or getattr(dataset_obj.data_generator[index], 'text', None),
+ dataset_obj.data_generator[index].labels,
+ dataset_obj.data_generator[index].metadata,
+ ]
+ failed_inputs_logs.append(failed_input_details)
+
+ failed_table = tabulate(
+ failed_inputs_logs,
+ headers=["Index", "Input ID", "Status", "Input", "Labels", "Metadata"],
+ tablefmt="grid",
+ )
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ self.logger.warning(
+ f"{timestamp}\nFailed to upload {len(failed_retrying_inputs)} inputs in current batch {batch_no} due to {retry_response}:\n{failed_table}\n\n"
+ )
+
+ def _data_upload(
+ self,
+ dataset_obj: ClarifaiDatasetType,
+ is_log_retry: bool = False,
+ log_retry_ids: List[int] = None,
+ **kwargs,
+ ) -> None:
+ """Uploads inputs and annotations to clarifai platform dataset.
+
+ Args:
+ dataset_obj: ClarifaiDataset object,
+ is_log_retry: True if the iteration is to retry uploads from logs.
+ **kwargs: Additional keyword arguments for retry uploading functionality..
+
+ Returns:
+ None
+ """
+ if is_log_retry:
+ input_ids = log_retry_ids
+ else:
+ input_ids = list(range(len(dataset_obj)))
+
+ chunk_input_ids = Chunker(input_ids, self.batch_size).chunk()
+ with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
+ with tqdm(total=len(chunk_input_ids), desc='Uploading Dataset') as progress:
+ # Submit all jobs to the executor and store the returned futures
+ futures = [
+ executor.submit(
+ self._upload_inputs_annotations,
+ batch_input_ids,
+ dataset_obj,
+ batch_no,
+ **kwargs,
+ )
+ for batch_no, batch_input_ids in enumerate(chunk_input_ids)
+ ]
+
+ for batch_no, job in enumerate(as_completed(futures)):
+ retry_input_ids, retry_annot_protos, _ = job.result()
+ self._retry_uploads(retry_input_ids, retry_annot_protos, dataset_obj, batch_no)
+ progress.update()
+
+ def upload_dataset(
+ self,
+ dataloader: Type[ClarifaiDataLoader],
+ batch_size: int = 32,
+ get_upload_status: bool = False,
+ log_warnings: bool = False,
+ **kwargs,
+ ) -> None:
+ """Uploads a dataset to the app.
+
+ Args:
+ dataloader (Type[ClarifaiDataLoader]): ClarifaiDataLoader object
+ batch_size (int): batch size for concurrent upload of inputs and annotations (max: 128)
+ get_upload_status (bool): True if you want to get the upload status of the dataset
+ log_warnings (bool): True if you want to save log warnings in a file
+ kwargs: Additional keyword arguments for retry uploading functionality..
+ """
+ # set batch size and task
+ self.batch_size = min(self.batch_size, batch_size)
+ self.task = dataloader.task
+ if self.task not in DATASET_UPLOAD_TASKS:
+ raise UserError(
+ "Task should be one of \
  'text_classification', 'visual_classification', \
  'visual_detection', 'visual_segmentation', 'visual_captioning', 'multimodal_dataset'"
- )
-
- if self.task == "text_classification":
- dataset_obj = TextClassificationDataset(dataloader, self.id)
-
- elif self.task == "visual_detection":
- dataset_obj = VisualDetectionDataset(dataloader, self.id)
-
- elif self.task == "visual_segmentation":
- dataset_obj = VisualSegmentationDataset(dataloader, self.id)
-
- elif self.task == "multimodal_dataset":
- dataset_obj = MultiModalDataset(dataloader, self.id)
-
- else: # visual_classification & visual_captioning
- dataset_obj = VisualClassificationDataset(dataloader, self.id)
-
- if get_upload_status:
- pre_upload_stats = self.get_upload_status(pre_upload=True)
-
- #add file handler to log warnings
- if log_warnings:
- add_file_handler(self.logger, f"Dataset_Upload{str(int(datetime.now().timestamp()))}.log")
- self._data_upload(dataset_obj, **kwargs)
-
- if get_upload_status:
- self.get_upload_status(dataloader=dataloader, pre_upload_stats=pre_upload_stats)
-
- def retry_upload_from_logs(self,
- log_file_path: str,
- dataloader: Type[ClarifaiDataLoader],
- retry_duplicates: bool = False,
- log_warnings: bool = False,
- **kwargs) -> None:
- """Retries failed uploads from the log file.
-
- Args:
- log_file_path (str): path to the log file
- dataloader (Type[ClarifaiDataLoader]): ClarifaiDataLoader object
- retry_duplicate (bool): True if you want to retry duplicate inputs
- kwargs: Additional keyword arguments for retry uploading functionality..
- """
-
- duplicate_input_ids, failed_input_ids = process_log_files(log_file_path)
- if log_warnings:
- add_file_handler(self.logger, f"Dataset_Upload{str(int(datetime.now().timestamp()))}.log")
-
- if retry_duplicates and duplicate_input_ids:
- logger.warning(f"Retrying upload for {len(duplicate_input_ids)} duplicate inputs...\n")
- duplicate_inputs_indexes = [input["Index"] for input in duplicate_input_ids]
- self.upload_dataset(
- dataloader=dataloader,
- log_retry_ids=duplicate_inputs_indexes,
- is_retry_duplicates=True,
- is_log_retry=True,
- **kwargs)
-
- if failed_input_ids:
- #failed_inputs= ([input["Input_ID"] for input in failed_input_ids])
- logger.warning(f"Retrying upload for {len(failed_input_ids)} failed inputs...\n")
- failed_input_indexes = [input["Index"] for input in failed_input_ids]
- self.upload_dataset(
- dataloader=dataloader, log_retry_ids=failed_input_indexes, is_log_retry=True, **kwargs)
-
- def upload_from_csv(self,
- csv_path: str,
- input_type: str = 'text',
- csv_type: str = None,
- labels: bool = True,
- batch_size: int = 128) -> None:
- """Uploads dataset from a csv file.
-
- Args:
- csv_path (str): path to the csv file
- input_type (str): type of the dataset(text, image)
- csv_type (str): type of the csv file(raw, url, file_path)
- labels (bool): True if csv file has labels column
- batch_size (int): batch size for concurrent upload of inputs and annotations
-
- Example:
- >>> from clarifai.client.dataset import Dataset
- >>> dataset = Dataset(user_id = 'user_id', app_id = 'demo_app', dataset_id = 'demo_dataset')
- >>> dataset.upload_from_csv(csv_path='csv_path', input_type='text', csv_type='raw, labels=True)
-
- Note:
- CSV file supports 'inputid', 'input', 'concepts', 'metadata', 'geopoints' columns.
- All the data in the CSV should be in double quotes.
- metadata should be in single quotes format. Example: "{'key': 'value'}"
- geopoints should be in "long,lat" format.
- """
- if input_type not in ['image', 'text', 'video', 'audio']:
- raise UserError('Invalid input type, it should be image,text,audio or video')
- if csv_type not in ['raw', 'url', 'file_path']:
- raise UserError('Invalid csv type, it should be raw, url or file_path')
- assert csv_path.endswith('.csv'), 'csv_path should be a csv file'
- if csv_type == 'raw' and input_type != 'text':
- raise UserError('Only text input type is supported for raw csv type')
- batch_size = min(128, batch_size)
- input_protos = self.input_object.get_inputs_from_csv(
- csv_path=csv_path,
- input_type=input_type,
- csv_type=csv_type,
- dataset_id=self.id,
- labels=labels)
- self.input_object._bulk_upload(inputs=input_protos, batch_size=batch_size)
-
- def upload_from_folder(self,
- folder_path: str,
- input_type: str,
- labels: bool = False,
- batch_size: int = 128) -> None:
- """Upload dataset from folder.
-
- Args:
- folder_path (str): Path to the folder containing images.
- input_type (str): type of the dataset(text, image)
- labels (bool): True if folder name is the label for the inputs
- batch_size (int): batch size for concurrent upload of inputs and annotations
-
- Example:
- >>> from clarifai.client.dataset import Dataset
- >>> dataset = Dataset(user_id = 'user_id', app_id = 'demo_app', dataset_id = 'demo_dataset')
- >>> dataset.upload_from_folder(folder_path='folder_path', input_type='text', labels=True)
-
- Note: The filename is used as the input_id.
- """
- if input_type not in ['image', 'text']:
- raise UserError('Invalid input type it should be image or text')
- if input_type == 'image':
- input_protos = self.input_object.get_image_inputs_from_folder(
- folder_path=folder_path, dataset_id=self.id, labels=labels)
- if input_type == 'text':
- input_protos = self.input_object.get_text_inputs_from_folder(
- folder_path=folder_path, dataset_id=self.id, labels=labels)
- self.input_object._bulk_upload(inputs=input_protos, batch_size=batch_size)
-
- def get_upload_status(
585
- self,
586
- dataloader: Type[ClarifaiDataLoader] = None,
587
- delete_version: bool = False,
588
- timeout: int = 600,
589
- pre_upload_stats: Tuple[Dict[str, int], Dict[str, int]] = None,
590
- pre_upload: bool = False) -> Optional[Tuple[Dict[str, int], Dict[str, int]]]:
591
- """Creates a new dataset version and displays the upload status of the dataset.
592
-
593
- Args:
594
- dataloader (Type[ClarifaiDataLoader]): ClarifaiDataLoader object
595
- delete_version (bool): True if you want to delete the version after getting the upload status
596
- timeout (int): Timeout in seconds for getting the upload status. Default is 600 seconds.
597
- pre_upload_stats (Tuple[Dict[str, int], Dict[str, int]]): The pre upload stats for the dataset.
598
- pre_upload (bool): True if you want to get the pre upload stats for the dataset.
599
-
600
- Example:
601
- >>> from clarifai.client.dataset import Dataset
602
- >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
603
- >>> dataset.get_upload_status(dataloader)
604
-
605
- Note:
606
- This is a beta feature and is subject to change.
607
- """
608
- self.logger.info("Getting dataset upload status...")
609
- dataset_version_id = uuid.uuid4().hex
610
- _ = self.create_version(id=dataset_version_id, description="SDK Upload Status")
611
-
612
- request_data = dict(
613
- user_app_id=self.user_app_id,
614
- dataset_id=self.id,
615
- dataset_version_id=dataset_version_id,
616
- )
617
-
618
- start_time = time.time()
619
- backoff_iterator = BackoffIterator(10)
620
- while (True):
621
- dataset_metrics_response = self._grpc_request(
622
- self.STUB.ListDatasetVersionMetricsGroups,
623
- service_pb2.ListDatasetVersionMetricsGroupsRequest(**request_data),
624
- )
625
-
626
- if dataset_metrics_response.status.code != status_code_pb2.SUCCESS:
627
- self.delete_version(dataset_version_id)
628
- raise Exception("Failed to get dataset metrics {}".format(dataset_metrics_response.status))
629
-
630
- dict_response = MessageToDict(dataset_metrics_response)
631
- if len(dict_response.keys()) == 1 and time.time() - start_time < timeout:
632
- self.logger.info("Crunching the dataset metrics. Please wait...")
633
- time.sleep(next(backoff_iterator))
634
- continue
635
- else:
636
- if time.time() - start_time > timeout:
637
- self.delete_version(dataset_version_id)
638
- raise UserError(
639
- "Dataset metrics are taking too long to process. Please try again later.")
640
- break
641
- #get pre upload stats
642
- if pre_upload:
643
- return DisplayUploadStatus.get_dataset_version_stats(dataset_metrics_response)
644
-
645
- dataset_info_dict = dict(user_id=self.user_id, app_id=self.app_id, dataset_id=self.id)
646
- DisplayUploadStatus(dataloader, dataset_metrics_response, dataset_info_dict, pre_upload_stats)
647
-
648
- if delete_version:
649
- self.delete_version(dataset_version_id)
650
-
651
- def merge_dataset(self, merge_dataset_id: str) -> None:
652
- """Merges the another dataset into current dataset.
653
-
654
- Args:
655
- merge_dataset_id (str): The dataset ID of the dataset to merge.
656
-
657
- Example:
658
- >>> from clarifai.client.dataset import Dataset
659
- >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
660
- >>> dataset.merge_dataset(merge_dataset_id='merge_dataset_id')
661
- """
662
- dataset_filter = resources_pb2.Filter(
663
- input=resources_pb2.Input(dataset_ids=[merge_dataset_id]))
664
- query = resources_pb2.Search(query=resources_pb2.Query(filters=[dataset_filter]))
665
- request = service_pb2.PostDatasetInputsRequest(
666
- user_app_id=self.user_app_id, dataset_id=self.id, search=query)
667
-
668
- response = self._grpc_request(self.STUB.PostDatasetInputs, request)
669
- if response.status.code != status_code_pb2.SUCCESS:
670
- raise Exception(response.status)
671
- self.logger.info("\nDataset Merged\n%s", response.status)
672
-
673
- def archive_zip(self, wait: bool = True) -> str:
674
- """Exports the dataset to a zip file URL."""
675
- request = service_pb2.PutDatasetVersionExportsRequest(
676
- user_app_id=self.user_app_id,
677
- dataset_id=self.id,
678
- dataset_version_id=self.version.id,
679
- exports=[
680
- resources_pb2.DatasetVersionExport(
681
- format=resources_pb2.DatasetVersionExportFormat.CLARIFAI_DATA_PROTOBUF)
682
- ])
683
-
684
- response = self._grpc_request(self.STUB.PutDatasetVersionExports, request)
685
- if response.status.code != status_code_pb2.SUCCESS:
686
- raise Exception(response.status)
687
- if wait:
688
- while response.exports[0].status.code in (
689
- status_code_pb2.DATASET_VERSION_EXPORT_PENDING,
690
- status_code_pb2.DATASET_VERSION_EXPORT_IN_PROGRESS):
691
- time.sleep(1)
513
+ )
514
+
515
+ if self.task == "text_classification":
516
+ dataset_obj = TextClassificationDataset(dataloader, self.id)
517
+
518
+ elif self.task == "visual_detection":
519
+ dataset_obj = VisualDetectionDataset(dataloader, self.id)
520
+
521
+ elif self.task == "visual_segmentation":
522
+ dataset_obj = VisualSegmentationDataset(dataloader, self.id)
523
+
524
+ elif self.task == "multimodal_dataset":
525
+ dataset_obj = MultiModalDataset(dataloader, self.id)
526
+
527
+ else: # visual_classification & visual_captioning
528
+ dataset_obj = VisualClassificationDataset(dataloader, self.id)
529
+
530
+ if get_upload_status:
531
+ pre_upload_stats = self.get_upload_status(pre_upload=True)
532
+
533
+ # add file handler to log warnings
534
+ if log_warnings:
535
+ add_file_handler(
536
+ self.logger, f"Dataset_Upload{str(int(datetime.now().timestamp()))}.log"
537
+ )
538
+ self._data_upload(dataset_obj, **kwargs)
539
+
540
+ if get_upload_status:
541
+ self.get_upload_status(dataloader=dataloader, pre_upload_stats=pre_upload_stats)
542
+
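For reference, a minimal call sketch assembled only from the signature above; the dataloader placeholder is hypothetical and its task must be one of the six values listed in the error message:

from clarifai.client.dataset import Dataset

my_loader = ...  # your ClarifaiDataLoader subclass instance (see clarifai.datasets.upload)
dataset = Dataset(user_id='user_id', app_id='demo_app', dataset_id='demo_dataset')
dataset.upload_dataset(
    dataloader=my_loader,    # its .task decides which dataset class is used internally
    batch_size=32,           # capped internally (max 128)
    get_upload_status=True,  # collects pre/post upload stats via get_upload_status()
    log_warnings=True,       # warnings go to a Dataset_Upload<timestamp>.log file
)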
543
+ def retry_upload_from_logs(
544
+ self,
545
+ log_file_path: str,
546
+ dataloader: Type[ClarifaiDataLoader],
547
+ retry_duplicates: bool = False,
548
+ log_warnings: bool = False,
549
+ **kwargs,
550
+ ) -> None:
551
+ """Retries failed uploads from the log file.
552
+
553
+ Args:
554
+ log_file_path (str): path to the log file
555
+ dataloader (Type[ClarifaiDataLoader]): ClarifaiDataLoader object
556
+ retry_duplicates (bool): True if you want to retry duplicate inputs
557
+ kwargs: Additional keyword arguments for the retry upload functionality.
558
+ """
559
+
560
+ duplicate_input_ids, failed_input_ids = process_log_files(log_file_path)
561
+ if log_warnings:
562
+ add_file_handler(
563
+ self.logger, f"Dataset_Upload{str(int(datetime.now().timestamp()))}.log"
564
+ )
565
+
566
+ if retry_duplicates and duplicate_input_ids:
567
+ logger.warning(f"Retrying upload for {len(duplicate_input_ids)} duplicate inputs...\n")
568
+ duplicate_inputs_indexes = [input["Index"] for input in duplicate_input_ids]
569
+ self.upload_dataset(
570
+ dataloader=dataloader,
571
+ log_retry_ids=duplicate_inputs_indexes,
572
+ is_retry_duplicates=True,
573
+ is_log_retry=True,
574
+ **kwargs,
575
+ )
576
+
577
+ if failed_input_ids:
578
+ # failed_inputs= ([input["Input_ID"] for input in failed_input_ids])
579
+ logger.warning(f"Retrying upload for {len(failed_input_ids)} failed inputs...\n")
580
+ failed_input_indexes = [input["Index"] for input in failed_input_ids]
581
+ self.upload_dataset(
582
+ dataloader=dataloader,
583
+ log_retry_ids=failed_input_indexes,
584
+ is_log_retry=True,
585
+ **kwargs,
586
+ )
587
+
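A follow-up sketch for the retry path, assuming a previous log_warnings=True run produced the (hypothetical) log file named below and that the same dataloader is reused:

from clarifai.client.dataset import Dataset

my_loader = ...  # the ClarifaiDataLoader used for the original upload
dataset = Dataset(user_id='user_id', app_id='demo_app', dataset_id='demo_dataset')
dataset.retry_upload_from_logs(
    log_file_path='Dataset_Upload1700000000.log',  # written by the earlier upload
    dataloader=my_loader,
    retry_duplicates=False,  # set True to also re-send inputs logged as duplicates
    log_warnings=True,
)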
588
+ def upload_from_csv(
589
+ self,
590
+ csv_path: str,
591
+ input_type: str = 'text',
592
+ csv_type: str = None,
593
+ labels: bool = True,
594
+ batch_size: int = 128,
595
+ ) -> None:
596
+ """Uploads dataset from a csv file.
597
+
598
+ Args:
599
+ csv_path (str): path to the csv file
600
+ input_type (str): type of the input data (text, image, video, audio)
601
+ csv_type (str): type of the csv file (raw, url, file_path)
602
+ labels (bool): True if csv file has labels column
603
+ batch_size (int): batch size for concurrent upload of inputs and annotations
604
+
605
+ Example:
606
+ >>> from clarifai.client.dataset import Dataset
607
+ >>> dataset = Dataset(user_id = 'user_id', app_id = 'demo_app', dataset_id = 'demo_dataset')
608
+ >>> dataset.upload_from_csv(csv_path='csv_path', input_type='text', csv_type='raw', labels=True)
609
+
610
+ Note:
611
+ CSV file supports 'inputid', 'input', 'concepts', 'metadata', 'geopoints' columns.
612
+ All the data in the CSV should be in double quotes.
613
+ metadata should be in single quotes format. Example: "{'key': 'value'}"
614
+ geopoints should be in "long,lat" format.
615
+ """
616
+ if input_type not in ['image', 'text', 'video', 'audio']:
617
+ raise UserError('Invalid input type, it should be image,text,audio or video')
618
+ if csv_type not in ['raw', 'url', 'file_path']:
619
+ raise UserError('Invalid csv type, it should be raw, url or file_path')
620
+ assert csv_path.endswith('.csv'), 'csv_path should be a csv file'
621
+ if csv_type == 'raw' and input_type != 'text':
622
+ raise UserError('Only text input type is supported for raw csv type')
623
+ batch_size = min(128, batch_size)
624
+ input_protos = self.input_object.get_inputs_from_csv(
625
+ csv_path=csv_path,
626
+ input_type=input_type,
627
+ csv_type=csv_type,
628
+ dataset_id=self.id,
629
+ labels=labels,
630
+ )
631
+ self.input_object._bulk_upload(inputs=input_protos, batch_size=batch_size)
632
+
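An illustrative CSV and call that follow the Note above (the column names come from the docstring; the row values are made up):

# reviews.csv -- every field double-quoted, metadata as a single-quoted dict,
# geopoints in "long,lat" form:
#   "inputid","input","concepts","metadata","geopoints"
#   "review-1","Great service, fast delivery","positive","{'source': 'web'}","-73.98,40.73"

from clarifai.client.dataset import Dataset

dataset = Dataset(user_id='user_id', app_id='demo_app', dataset_id='demo_dataset')
dataset.upload_from_csv(csv_path='reviews.csv', input_type='text', csv_type='raw', labels=True)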
633
+ def upload_from_folder(
634
+ self, folder_path: str, input_type: str, labels: bool = False, batch_size: int = 128
635
+ ) -> None:
636
+ """Upload dataset from folder.
637
+
638
+ Args:
639
+ folder_path (str): Path to the folder containing the inputs (images or text files).
640
+ input_type (str): type of the input data (text, image)
641
+ labels (bool): True if folder name is the label for the inputs
642
+ batch_size (int): batch size for concurrent upload of inputs and annotations
643
+
644
+ Example:
645
+ >>> from clarifai.client.dataset import Dataset
646
+ >>> dataset = Dataset(user_id = 'user_id', app_id = 'demo_app', dataset_id = 'demo_dataset')
647
+ >>> dataset.upload_from_folder(folder_path='folder_path', input_type='text', labels=True)
648
+
649
+ Note: The filename is used as the input_id.
650
+ """
651
+ if input_type not in ['image', 'text']:
652
+ raise UserError('Invalid input type, it should be image or text')
653
+ if input_type == 'image':
654
+ input_protos = self.input_object.get_image_inputs_from_folder(
655
+ folder_path=folder_path, dataset_id=self.id, labels=labels
656
+ )
657
+ if input_type == 'text':
658
+ input_protos = self.input_object.get_text_inputs_from_folder(
659
+ folder_path=folder_path, dataset_id=self.id, labels=labels
660
+ )
661
+ self.input_object._bulk_upload(inputs=input_protos, batch_size=batch_size)
662
+
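A sketch of the folder convention implied by the docstring, where labels=True uses the folder name as the label and the file name as the input_id (paths are illustrative):

# pizza/
#   margherita.jpg   -> label 'pizza', input_id taken from the file name
#   pepperoni.jpg

from clarifai.client.dataset import Dataset

dataset = Dataset(user_id='user_id', app_id='demo_app', dataset_id='demo_dataset')
dataset.upload_from_folder(folder_path='pizza', input_type='image', labels=True)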
663
+ def get_upload_status(
664
+ self,
665
+ dataloader: Type[ClarifaiDataLoader] = None,
666
+ delete_version: bool = False,
667
+ timeout: int = 600,
668
+ pre_upload_stats: Tuple[Dict[str, int], Dict[str, int]] = None,
669
+ pre_upload: bool = False,
670
+ ) -> Optional[Tuple[Dict[str, int], Dict[str, int]]]:
671
+ """Creates a new dataset version and displays the upload status of the dataset.
672
+
673
+ Args:
674
+ dataloader (Type[ClarifaiDataLoader]): ClarifaiDataLoader object
675
+ delete_version (bool): True if you want to delete the version after getting the upload status
676
+ timeout (int): Timeout in seconds for getting the upload status. Default is 600 seconds.
677
+ pre_upload_stats (Tuple[Dict[str, int], Dict[str, int]]): The pre-upload stats for the dataset.
678
+ pre_upload (bool): True if you want to get the pre-upload stats for the dataset.
679
+
680
+ Example:
681
+ >>> from clarifai.client.dataset import Dataset
682
+ >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
683
+ >>> dataset.get_upload_status(dataloader)
684
+
685
+ Note:
686
+ This is a beta feature and is subject to change.
687
+ """
688
+ self.logger.info("Getting dataset upload status...")
689
+ dataset_version_id = uuid.uuid4().hex
690
+ _ = self.create_version(id=dataset_version_id, description="SDK Upload Status")
691
+
692
+ request_data = dict(
693
+ user_app_id=self.user_app_id,
694
+ dataset_id=self.id,
695
+ dataset_version_id=dataset_version_id,
696
+ )
697
+
698
+ start_time = time.time()
699
+ backoff_iterator = BackoffIterator(10)
700
+ while True:
701
+ dataset_metrics_response = self._grpc_request(
702
+ self.STUB.ListDatasetVersionMetricsGroups,
703
+ service_pb2.ListDatasetVersionMetricsGroupsRequest(**request_data),
704
+ )
705
+
706
+ if dataset_metrics_response.status.code != status_code_pb2.SUCCESS:
707
+ self.delete_version(dataset_version_id)
708
+ raise Exception(
709
+ "Failed to get dataset metrics {}".format(dataset_metrics_response.status)
710
+ )
711
+
712
+ dict_response = MessageToDict(dataset_metrics_response)
713
+ if len(dict_response.keys()) == 1 and time.time() - start_time < timeout:
714
+ self.logger.info("Crunching the dataset metrics. Please wait...")
715
+ time.sleep(next(backoff_iterator))
716
+ continue
717
+ else:
718
+ if time.time() - start_time > timeout:
719
+ self.delete_version(dataset_version_id)
720
+ raise UserError(
721
+ "Dataset metrics are taking too long to process. Please try again later."
722
+ )
723
+ break
724
+ # get pre upload stats
725
+ if pre_upload:
726
+ return DisplayUploadStatus.get_dataset_version_stats(dataset_metrics_response)
727
+
728
+ dataset_info_dict = dict(user_id=self.user_id, app_id=self.app_id, dataset_id=self.id)
729
+ DisplayUploadStatus(
730
+ dataloader, dataset_metrics_response, dataset_info_dict, pre_upload_stats
731
+ )
732
+
733
+ if delete_version:
734
+ self.delete_version(dataset_version_id)
735
+
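A short sketch of the optional knobs in the signature above; the dataloader is assumed to be the one used for the upload:

from clarifai.client.dataset import Dataset

my_loader = ...  # the ClarifaiDataLoader used for the upload
dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
# Creates a temporary "SDK Upload Status" version, displays the metrics,
# and removes that version again because delete_version=True.
dataset.get_upload_status(dataloader=my_loader, delete_version=True, timeout=600)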
736
+ def merge_dataset(self, merge_dataset_id: str) -> None:
737
+ """Merges the another dataset into current dataset.
738
+
739
+ Args:
740
+ merge_dataset_id (str): The dataset ID of the dataset to merge.
741
+
742
+ Example:
743
+ >>> from clarifai.client.dataset import Dataset
744
+ >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
745
+ >>> dataset.merge_dataset(merge_dataset_id='merge_dataset_id')
746
+ """
747
+ dataset_filter = resources_pb2.Filter(
748
+ input=resources_pb2.Input(dataset_ids=[merge_dataset_id])
749
+ )
750
+ query = resources_pb2.Search(query=resources_pb2.Query(filters=[dataset_filter]))
751
+ request = service_pb2.PostDatasetInputsRequest(
752
+ user_app_id=self.user_app_id, dataset_id=self.id, search=query
753
+ )
754
+
755
+ response = self._grpc_request(self.STUB.PostDatasetInputs, request)
756
+ if response.status.code != status_code_pb2.SUCCESS:
757
+ raise Exception(response.status)
758
+ self.logger.info("\nDataset Merged\n%s", response.status)
759
+
760
+ def archive_zip(self, wait: bool = True) -> str:
761
+ """Exports the dataset to a zip file URL."""
762
+ request = service_pb2.PutDatasetVersionExportsRequest(
763
+ user_app_id=self.user_app_id,
764
+ dataset_id=self.id,
765
+ dataset_version_id=self.version.id,
766
+ exports=[
767
+ resources_pb2.DatasetVersionExport(
768
+ format=resources_pb2.DatasetVersionExportFormat.CLARIFAI_DATA_PROTOBUF
769
+ )
770
+ ],
771
+ )
772
+
692
773
  response = self._grpc_request(self.STUB.PutDatasetVersionExports, request)
693
774
  if response.status.code != status_code_pb2.SUCCESS:
694
- raise Exception(response.status)
695
- if response.exports[0].status.code != status_code_pb2.DATASET_VERSION_EXPORT_SUCCESS:
696
- raise Exception(response.exports[0].status)
697
- return response.exports[0].url
698
-
699
- def export(self,
700
- save_path: str,
701
- archive_url: str = None,
702
- local_archive_path: str = None,
703
- split: str = 'all',
704
- num_workers: int = 4) -> None:
705
- """Exports the Clarifai protobuf dataset to a local archive.
706
-
707
- Args:
708
- save_path (str): The path to save the archive to.
709
- archive_url (str): The URL to the Clarifai protobuf archive.
710
- local_archive_path (str): The path to the local Clarifai protobuf archive.
711
- split (str): Export dataset inputs in the directory format {split}/{input_type}. Default is all.
712
- num_workers (int): Number of workers to use for downloading the archive. Default is 4.
713
-
714
- Example:
715
- >>> from clarifai.client.dataset import Dataset
716
- >>> Dataset().export(save_path='output.zip')
717
- """
718
- if local_archive_path and not os.path.exists(local_archive_path):
719
- raise UserError(f"Archive {local_archive_path} does not exist.")
720
- if not archive_url and not local_archive_path:
721
- archive_url = self.archive_zip()
722
- # Create a session object and set auth header
723
- session = requests.Session()
724
- retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
725
- session.mount('https://', HTTPAdapter(max_retries=retries))
726
- session.headers.update({'Authorization': self.metadata[0][1]})
727
- with DatasetExportReader(
728
- session=session, archive_url=archive_url, local_archive_path=local_archive_path) as reader:
729
- InputAnnotationDownloader(session, reader, num_workers).download_archive(
730
- save_path=save_path, split=split)
731
-
732
- def __getattr__(self, name):
733
- return getattr(self.dataset_info, name)
734
-
735
- def __str__(self):
736
- init_params = [param for param in self.kwargs.keys()]
737
- attribute_strings = [
738
- f"{param}={getattr(self.dataset_info, param)}" for param in init_params
739
- if hasattr(self.dataset_info, param)
740
- ]
741
- return f"Dataset Details: \n{', '.join(attribute_strings)}\n"
775
+ raise Exception(response.status)
776
+ if wait:
777
+ while response.exports[0].status.code in (
778
+ status_code_pb2.DATASET_VERSION_EXPORT_PENDING,
779
+ status_code_pb2.DATASET_VERSION_EXPORT_IN_PROGRESS,
780
+ ):
781
+ time.sleep(1)
782
+ response = self._grpc_request(self.STUB.PutDatasetVersionExports, request)
783
+ if response.status.code != status_code_pb2.SUCCESS:
784
+ raise Exception(response.status)
785
+ if response.exports[0].status.code != status_code_pb2.DATASET_VERSION_EXPORT_SUCCESS:
786
+ raise Exception(response.exports[0].status)
787
+ return response.exports[0].url
788
+
789
+ def export(
790
+ self,
791
+ save_path: str,
792
+ archive_url: str = None,
793
+ local_archive_path: str = None,
794
+ split: str = 'all',
795
+ num_workers: int = 4,
796
+ ) -> None:
797
+ """Exports the Clarifai protobuf dataset to a local archive.
798
+
799
+ Args:
800
+ save_path (str): The path to save the archive to.
801
+ archive_url (str): The URL to the Clarifai protobuf archive.
802
+ local_archive_path (str): The path to the local Clarifai protobuf archive.
803
+ split (str): Export dataset inputs in the directory format {split}/{input_type}. Default is all.
804
+ num_workers (int): Number of workers to use for downloading the archive. Default is 4.
805
+
806
+ Example:
807
+ >>> from clarifai.client.dataset import Dataset
808
+ >>> Dataset().export(save_path='output.zip')
809
+ """
810
+ if local_archive_path and not os.path.exists(local_archive_path):
811
+ raise UserError(f"Archive {local_archive_path} does not exist.")
812
+ if not archive_url and not local_archive_path:
813
+ archive_url = self.archive_zip()
814
+ # Create a session object and set auth header
815
+ session = requests.Session()
816
+ retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
817
+ session.mount('https://', HTTPAdapter(max_retries=retries))
818
+ session.headers.update({'Authorization': self.metadata[0][1]})
819
+ with DatasetExportReader(
820
+ session=session, archive_url=archive_url, local_archive_path=local_archive_path
821
+ ) as reader:
822
+ InputAnnotationDownloader(session, reader, num_workers).download_archive(
823
+ save_path=save_path, split=split
824
+ )
825
+
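A combined sketch of the two export paths above: archive_zip() returns the export URL for the currently loaded dataset version, and export() downloads it into a local archive (file names are illustrative):

from clarifai.client.dataset import Dataset

dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
url = dataset.archive_zip(wait=True)                     # blocks until the export job succeeds
dataset.export(save_path='output.zip', archive_url=url)  # omit archive_url to let export() call archive_zip() itself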
826
+ def __getattr__(self, name):
827
+ return getattr(self.dataset_info, name)
828
+
829
+ def __str__(self):
830
+ init_params = [param for param in self.kwargs.keys()]
831
+ attribute_strings = [
832
+ f"{param}={getattr(self.dataset_info, param)}"
833
+ for param in init_params
834
+ if hasattr(self.dataset_info, param)
835
+ ]
836
+ return f"Dataset Details: \n{', '.join(attribute_strings)}\n"