easy-cs-rec-custommodel 0.8.6__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of easy-cs-rec-custommodel might be problematic. Click here for more details.

Files changed (336) hide show
  1. easy_cs_rec_custommodel-0.8.6.dist-info/LICENSE +203 -0
  2. easy_cs_rec_custommodel-0.8.6.dist-info/METADATA +48 -0
  3. easy_cs_rec_custommodel-0.8.6.dist-info/RECORD +336 -0
  4. easy_cs_rec_custommodel-0.8.6.dist-info/WHEEL +6 -0
  5. easy_cs_rec_custommodel-0.8.6.dist-info/top_level.txt +2 -0
  6. easy_rec/__init__.py +114 -0
  7. easy_rec/python/__init__.py +0 -0
  8. easy_rec/python/builders/__init__.py +0 -0
  9. easy_rec/python/builders/hyperparams_builder.py +78 -0
  10. easy_rec/python/builders/loss_builder.py +333 -0
  11. easy_rec/python/builders/optimizer_builder.py +211 -0
  12. easy_rec/python/builders/strategy_builder.py +44 -0
  13. easy_rec/python/compat/__init__.py +0 -0
  14. easy_rec/python/compat/adam_s.py +245 -0
  15. easy_rec/python/compat/array_ops.py +229 -0
  16. easy_rec/python/compat/dynamic_variable.py +542 -0
  17. easy_rec/python/compat/early_stopping.py +653 -0
  18. easy_rec/python/compat/embedding_ops.py +162 -0
  19. easy_rec/python/compat/embedding_parallel_saver.py +316 -0
  20. easy_rec/python/compat/estimator_train.py +116 -0
  21. easy_rec/python/compat/exporter.py +473 -0
  22. easy_rec/python/compat/feature_column/__init__.py +0 -0
  23. easy_rec/python/compat/feature_column/feature_column.py +3675 -0
  24. easy_rec/python/compat/feature_column/feature_column_v2.py +5233 -0
  25. easy_rec/python/compat/feature_column/sequence_feature_column.py +648 -0
  26. easy_rec/python/compat/feature_column/utils.py +154 -0
  27. easy_rec/python/compat/layers.py +329 -0
  28. easy_rec/python/compat/ops.py +14 -0
  29. easy_rec/python/compat/optimizers.py +619 -0
  30. easy_rec/python/compat/queues.py +311 -0
  31. easy_rec/python/compat/regularizers.py +208 -0
  32. easy_rec/python/compat/sok_optimizer.py +440 -0
  33. easy_rec/python/compat/sync_replicas_optimizer.py +528 -0
  34. easy_rec/python/compat/weight_decay_optimizers.py +475 -0
  35. easy_rec/python/core/__init__.py +0 -0
  36. easy_rec/python/core/easyrec_metrics/__init__.py +24 -0
  37. easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_pai.py +3702 -0
  38. easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_tf.py +3768 -0
  39. easy_rec/python/core/learning_schedules.py +228 -0
  40. easy_rec/python/core/metrics.py +402 -0
  41. easy_rec/python/core/sampler.py +844 -0
  42. easy_rec/python/eval.py +102 -0
  43. easy_rec/python/export.py +150 -0
  44. easy_rec/python/feature_column/__init__.py +0 -0
  45. easy_rec/python/feature_column/feature_column.py +664 -0
  46. easy_rec/python/feature_column/feature_group.py +89 -0
  47. easy_rec/python/hpo/__init__.py +0 -0
  48. easy_rec/python/hpo/emr_hpo.py +140 -0
  49. easy_rec/python/hpo/generate_hpo_sql.py +71 -0
  50. easy_rec/python/hpo/pai_hpo.py +297 -0
  51. easy_rec/python/inference/__init__.py +0 -0
  52. easy_rec/python/inference/csv_predictor.py +189 -0
  53. easy_rec/python/inference/hive_parquet_predictor.py +200 -0
  54. easy_rec/python/inference/hive_predictor.py +166 -0
  55. easy_rec/python/inference/odps_predictor.py +70 -0
  56. easy_rec/python/inference/parquet_predictor.py +147 -0
  57. easy_rec/python/inference/parquet_predictor_v2.py +147 -0
  58. easy_rec/python/inference/predictor.py +621 -0
  59. easy_rec/python/inference/processor/__init__.py +0 -0
  60. easy_rec/python/inference/processor/test.py +170 -0
  61. easy_rec/python/inference/vector_retrieve.py +124 -0
  62. easy_rec/python/input/__init__.py +0 -0
  63. easy_rec/python/input/batch_tfrecord_input.py +117 -0
  64. easy_rec/python/input/criteo_binary_reader.py +259 -0
  65. easy_rec/python/input/criteo_input.py +107 -0
  66. easy_rec/python/input/csv_input.py +175 -0
  67. easy_rec/python/input/csv_input_ex.py +72 -0
  68. easy_rec/python/input/csv_input_v2.py +68 -0
  69. easy_rec/python/input/datahub_input.py +320 -0
  70. easy_rec/python/input/dummy_input.py +58 -0
  71. easy_rec/python/input/hive_input.py +123 -0
  72. easy_rec/python/input/hive_parquet_input.py +140 -0
  73. easy_rec/python/input/hive_rtp_input.py +174 -0
  74. easy_rec/python/input/input.py +1064 -0
  75. easy_rec/python/input/kafka_dataset.py +144 -0
  76. easy_rec/python/input/kafka_input.py +235 -0
  77. easy_rec/python/input/load_parquet.py +317 -0
  78. easy_rec/python/input/odps_input.py +101 -0
  79. easy_rec/python/input/odps_input_v2.py +110 -0
  80. easy_rec/python/input/odps_input_v3.py +132 -0
  81. easy_rec/python/input/odps_rtp_input.py +187 -0
  82. easy_rec/python/input/odps_rtp_input_v2.py +104 -0
  83. easy_rec/python/input/parquet_input.py +397 -0
  84. easy_rec/python/input/parquet_input_v2.py +180 -0
  85. easy_rec/python/input/parquet_input_v3.py +203 -0
  86. easy_rec/python/input/rtp_input.py +225 -0
  87. easy_rec/python/input/rtp_input_v2.py +145 -0
  88. easy_rec/python/input/tfrecord_input.py +100 -0
  89. easy_rec/python/layers/__init__.py +0 -0
  90. easy_rec/python/layers/backbone.py +571 -0
  91. easy_rec/python/layers/capsule_layer.py +176 -0
  92. easy_rec/python/layers/cmbf.py +390 -0
  93. easy_rec/python/layers/common_layers.py +192 -0
  94. easy_rec/python/layers/dnn.py +87 -0
  95. easy_rec/python/layers/embed_input_layer.py +25 -0
  96. easy_rec/python/layers/fm.py +26 -0
  97. easy_rec/python/layers/input_layer.py +396 -0
  98. easy_rec/python/layers/keras/__init__.py +34 -0
  99. easy_rec/python/layers/keras/activation.py +114 -0
  100. easy_rec/python/layers/keras/attention.py +267 -0
  101. easy_rec/python/layers/keras/auxiliary_loss.py +47 -0
  102. easy_rec/python/layers/keras/blocks.py +262 -0
  103. easy_rec/python/layers/keras/bst.py +119 -0
  104. easy_rec/python/layers/keras/custom_ops.py +250 -0
  105. easy_rec/python/layers/keras/data_augment.py +133 -0
  106. easy_rec/python/layers/keras/din.py +67 -0
  107. easy_rec/python/layers/keras/einsum_dense.py +598 -0
  108. easy_rec/python/layers/keras/embedding.py +81 -0
  109. easy_rec/python/layers/keras/fibinet.py +251 -0
  110. easy_rec/python/layers/keras/interaction.py +416 -0
  111. easy_rec/python/layers/keras/layer_norm.py +364 -0
  112. easy_rec/python/layers/keras/mask_net.py +166 -0
  113. easy_rec/python/layers/keras/multi_head_attention.py +717 -0
  114. easy_rec/python/layers/keras/multi_task.py +125 -0
  115. easy_rec/python/layers/keras/numerical_embedding.py +376 -0
  116. easy_rec/python/layers/keras/ppnet.py +194 -0
  117. easy_rec/python/layers/keras/transformer.py +192 -0
  118. easy_rec/python/layers/layer_norm.py +51 -0
  119. easy_rec/python/layers/mmoe.py +83 -0
  120. easy_rec/python/layers/multihead_attention.py +162 -0
  121. easy_rec/python/layers/multihead_cross_attention.py +749 -0
  122. easy_rec/python/layers/senet.py +73 -0
  123. easy_rec/python/layers/seq_input_layer.py +134 -0
  124. easy_rec/python/layers/sequence_feature_layer.py +249 -0
  125. easy_rec/python/layers/uniter.py +301 -0
  126. easy_rec/python/layers/utils.py +248 -0
  127. easy_rec/python/layers/variational_dropout_layer.py +130 -0
  128. easy_rec/python/loss/__init__.py +0 -0
  129. easy_rec/python/loss/circle_loss.py +82 -0
  130. easy_rec/python/loss/contrastive_loss.py +79 -0
  131. easy_rec/python/loss/f1_reweight_loss.py +38 -0
  132. easy_rec/python/loss/focal_loss.py +93 -0
  133. easy_rec/python/loss/jrc_loss.py +128 -0
  134. easy_rec/python/loss/listwise_loss.py +161 -0
  135. easy_rec/python/loss/multi_similarity.py +68 -0
  136. easy_rec/python/loss/pairwise_loss.py +307 -0
  137. easy_rec/python/loss/softmax_loss_with_negative_mining.py +110 -0
  138. easy_rec/python/loss/zero_inflated_lognormal.py +76 -0
  139. easy_rec/python/main.py +878 -0
  140. easy_rec/python/model/__init__.py +0 -0
  141. easy_rec/python/model/autoint.py +73 -0
  142. easy_rec/python/model/cmbf.py +47 -0
  143. easy_rec/python/model/collaborative_metric_learning.py +182 -0
  144. easy_rec/python/model/custom_model.py +323 -0
  145. easy_rec/python/model/dat.py +138 -0
  146. easy_rec/python/model/dbmtl.py +116 -0
  147. easy_rec/python/model/dcn.py +70 -0
  148. easy_rec/python/model/deepfm.py +106 -0
  149. easy_rec/python/model/dlrm.py +73 -0
  150. easy_rec/python/model/dropoutnet.py +207 -0
  151. easy_rec/python/model/dssm.py +154 -0
  152. easy_rec/python/model/dssm_senet.py +143 -0
  153. easy_rec/python/model/dummy_model.py +48 -0
  154. easy_rec/python/model/easy_rec_estimator.py +739 -0
  155. easy_rec/python/model/easy_rec_model.py +467 -0
  156. easy_rec/python/model/esmm.py +242 -0
  157. easy_rec/python/model/fm.py +63 -0
  158. easy_rec/python/model/match_model.py +357 -0
  159. easy_rec/python/model/mind.py +445 -0
  160. easy_rec/python/model/mmoe.py +70 -0
  161. easy_rec/python/model/multi_task_model.py +303 -0
  162. easy_rec/python/model/multi_tower.py +62 -0
  163. easy_rec/python/model/multi_tower_bst.py +190 -0
  164. easy_rec/python/model/multi_tower_din.py +130 -0
  165. easy_rec/python/model/multi_tower_recall.py +68 -0
  166. easy_rec/python/model/pdn.py +203 -0
  167. easy_rec/python/model/ple.py +120 -0
  168. easy_rec/python/model/rank_model.py +485 -0
  169. easy_rec/python/model/rocket_launching.py +203 -0
  170. easy_rec/python/model/simple_multi_task.py +54 -0
  171. easy_rec/python/model/uniter.py +46 -0
  172. easy_rec/python/model/wide_and_deep.py +121 -0
  173. easy_rec/python/ops/1.12/incr_record.so +0 -0
  174. easy_rec/python/ops/1.12/kafka.so +0 -0
  175. easy_rec/python/ops/1.12/libcustom_ops.so +0 -0
  176. easy_rec/python/ops/1.12/libembed_op.so +0 -0
  177. easy_rec/python/ops/1.12/libhiredis.so.1.0.0 +0 -0
  178. easy_rec/python/ops/1.12/librdkafka++.so.1 +0 -0
  179. easy_rec/python/ops/1.12/librdkafka.so.1 +0 -0
  180. easy_rec/python/ops/1.12/libredis++.so +0 -0
  181. easy_rec/python/ops/1.12/libredis++.so.1 +0 -0
  182. easy_rec/python/ops/1.12/libredis++.so.1.2.3 +0 -0
  183. easy_rec/python/ops/1.12/libstr_avx_op.so +0 -0
  184. easy_rec/python/ops/1.12/libwrite_sparse_kv.so +0 -0
  185. easy_rec/python/ops/1.15/incr_record.so +0 -0
  186. easy_rec/python/ops/1.15/kafka.so +0 -0
  187. easy_rec/python/ops/1.15/libcustom_ops.so +0 -0
  188. easy_rec/python/ops/1.15/libembed_op.so +0 -0
  189. easy_rec/python/ops/1.15/libhiredis.so.1.0.0 +0 -0
  190. easy_rec/python/ops/1.15/librdkafka++.so +0 -0
  191. easy_rec/python/ops/1.15/librdkafka++.so.1 +0 -0
  192. easy_rec/python/ops/1.15/librdkafka.so +0 -0
  193. easy_rec/python/ops/1.15/librdkafka.so.1 +0 -0
  194. easy_rec/python/ops/1.15/libredis++.so.1 +0 -0
  195. easy_rec/python/ops/1.15/libstr_avx_op.so +0 -0
  196. easy_rec/python/ops/2.12/libcustom_ops.so +0 -0
  197. easy_rec/python/ops/2.12/libload_embed.so +0 -0
  198. easy_rec/python/ops/2.12/libstr_avx_op.so +0 -0
  199. easy_rec/python/ops/__init__.py +0 -0
  200. easy_rec/python/ops/gen_kafka_ops.py +193 -0
  201. easy_rec/python/ops/gen_str_avx_op.py +28 -0
  202. easy_rec/python/ops/incr_record.py +30 -0
  203. easy_rec/python/predict.py +170 -0
  204. easy_rec/python/protos/__init__.py +0 -0
  205. easy_rec/python/protos/autoint_pb2.py +122 -0
  206. easy_rec/python/protos/backbone_pb2.py +1416 -0
  207. easy_rec/python/protos/cmbf_pb2.py +435 -0
  208. easy_rec/python/protos/collaborative_metric_learning_pb2.py +252 -0
  209. easy_rec/python/protos/custom_model_pb2.py +57 -0
  210. easy_rec/python/protos/dat_pb2.py +262 -0
  211. easy_rec/python/protos/data_source_pb2.py +422 -0
  212. easy_rec/python/protos/dataset_pb2.py +1920 -0
  213. easy_rec/python/protos/dbmtl_pb2.py +191 -0
  214. easy_rec/python/protos/dcn_pb2.py +197 -0
  215. easy_rec/python/protos/deepfm_pb2.py +163 -0
  216. easy_rec/python/protos/dlrm_pb2.py +163 -0
  217. easy_rec/python/protos/dnn_pb2.py +329 -0
  218. easy_rec/python/protos/dropoutnet_pb2.py +239 -0
  219. easy_rec/python/protos/dssm_pb2.py +262 -0
  220. easy_rec/python/protos/dssm_senet_pb2.py +282 -0
  221. easy_rec/python/protos/easy_rec_model_pb2.py +1672 -0
  222. easy_rec/python/protos/esmm_pb2.py +133 -0
  223. easy_rec/python/protos/eval_pb2.py +930 -0
  224. easy_rec/python/protos/export_pb2.py +379 -0
  225. easy_rec/python/protos/feature_config_pb2.py +1359 -0
  226. easy_rec/python/protos/fm_pb2.py +90 -0
  227. easy_rec/python/protos/hive_config_pb2.py +138 -0
  228. easy_rec/python/protos/hyperparams_pb2.py +624 -0
  229. easy_rec/python/protos/keras_layer_pb2.py +692 -0
  230. easy_rec/python/protos/layer_pb2.py +1936 -0
  231. easy_rec/python/protos/loss_pb2.py +1713 -0
  232. easy_rec/python/protos/mind_pb2.py +497 -0
  233. easy_rec/python/protos/mmoe_pb2.py +215 -0
  234. easy_rec/python/protos/multi_tower_pb2.py +295 -0
  235. easy_rec/python/protos/multi_tower_recall_pb2.py +198 -0
  236. easy_rec/python/protos/optimizer_pb2.py +2017 -0
  237. easy_rec/python/protos/pdn_pb2.py +293 -0
  238. easy_rec/python/protos/pipeline_pb2.py +516 -0
  239. easy_rec/python/protos/ple_pb2.py +231 -0
  240. easy_rec/python/protos/predict_pb2.py +1140 -0
  241. easy_rec/python/protos/rocket_launching_pb2.py +169 -0
  242. easy_rec/python/protos/seq_encoder_pb2.py +1084 -0
  243. easy_rec/python/protos/simi_pb2.py +54 -0
  244. easy_rec/python/protos/simple_multi_task_pb2.py +97 -0
  245. easy_rec/python/protos/tf_predict_pb2.py +630 -0
  246. easy_rec/python/protos/tower_pb2.py +661 -0
  247. easy_rec/python/protos/train_pb2.py +1197 -0
  248. easy_rec/python/protos/uniter_pb2.py +307 -0
  249. easy_rec/python/protos/variational_dropout_pb2.py +91 -0
  250. easy_rec/python/protos/wide_and_deep_pb2.py +131 -0
  251. easy_rec/python/test/__init__.py +0 -0
  252. easy_rec/python/test/csv_input_test.py +340 -0
  253. easy_rec/python/test/custom_early_stop_func.py +19 -0
  254. easy_rec/python/test/dh_local_run.py +104 -0
  255. easy_rec/python/test/embed_test.py +155 -0
  256. easy_rec/python/test/emr_run.py +119 -0
  257. easy_rec/python/test/eval_metric_test.py +107 -0
  258. easy_rec/python/test/excel_convert_test.py +64 -0
  259. easy_rec/python/test/export_test.py +513 -0
  260. easy_rec/python/test/fg_test.py +70 -0
  261. easy_rec/python/test/hive_input_test.py +311 -0
  262. easy_rec/python/test/hpo_test.py +235 -0
  263. easy_rec/python/test/kafka_test.py +373 -0
  264. easy_rec/python/test/local_incr_test.py +122 -0
  265. easy_rec/python/test/loss_test.py +110 -0
  266. easy_rec/python/test/odps_command.py +61 -0
  267. easy_rec/python/test/odps_local_run.py +86 -0
  268. easy_rec/python/test/odps_run.py +254 -0
  269. easy_rec/python/test/odps_test_cls.py +39 -0
  270. easy_rec/python/test/odps_test_prepare.py +198 -0
  271. easy_rec/python/test/odps_test_util.py +237 -0
  272. easy_rec/python/test/pre_check_test.py +54 -0
  273. easy_rec/python/test/predictor_test.py +394 -0
  274. easy_rec/python/test/rtp_convert_test.py +133 -0
  275. easy_rec/python/test/run.py +138 -0
  276. easy_rec/python/test/train_eval_test.py +1299 -0
  277. easy_rec/python/test/util_test.py +85 -0
  278. easy_rec/python/test/zero_inflated_lognormal_test.py +53 -0
  279. easy_rec/python/tools/__init__.py +0 -0
  280. easy_rec/python/tools/add_boundaries_to_config.py +67 -0
  281. easy_rec/python/tools/add_feature_info_to_config.py +145 -0
  282. easy_rec/python/tools/convert_config_format.py +48 -0
  283. easy_rec/python/tools/convert_rtp_data.py +79 -0
  284. easy_rec/python/tools/convert_rtp_fg.py +106 -0
  285. easy_rec/python/tools/create_config_from_excel.py +427 -0
  286. easy_rec/python/tools/criteo/__init__.py +0 -0
  287. easy_rec/python/tools/criteo/convert_data.py +157 -0
  288. easy_rec/python/tools/edit_lookup_graph.py +134 -0
  289. easy_rec/python/tools/faiss_index_pai.py +116 -0
  290. easy_rec/python/tools/feature_selection.py +316 -0
  291. easy_rec/python/tools/hit_rate_ds.py +223 -0
  292. easy_rec/python/tools/hit_rate_pai.py +138 -0
  293. easy_rec/python/tools/pre_check.py +120 -0
  294. easy_rec/python/tools/predict_and_chk.py +111 -0
  295. easy_rec/python/tools/read_kafka.py +55 -0
  296. easy_rec/python/tools/split_model_pai.py +286 -0
  297. easy_rec/python/tools/split_pdn_model_pai.py +272 -0
  298. easy_rec/python/tools/test_saved_model.py +80 -0
  299. easy_rec/python/tools/view_saved_model.py +39 -0
  300. easy_rec/python/tools/write_kafka.py +65 -0
  301. easy_rec/python/train_eval.py +325 -0
  302. easy_rec/python/utils/__init__.py +15 -0
  303. easy_rec/python/utils/activation.py +120 -0
  304. easy_rec/python/utils/check_utils.py +87 -0
  305. easy_rec/python/utils/compat.py +14 -0
  306. easy_rec/python/utils/config_util.py +652 -0
  307. easy_rec/python/utils/constant.py +43 -0
  308. easy_rec/python/utils/convert_rtp_fg.py +616 -0
  309. easy_rec/python/utils/dag.py +192 -0
  310. easy_rec/python/utils/distribution_utils.py +268 -0
  311. easy_rec/python/utils/ds_util.py +65 -0
  312. easy_rec/python/utils/embedding_utils.py +73 -0
  313. easy_rec/python/utils/estimator_utils.py +1036 -0
  314. easy_rec/python/utils/export_big_model.py +630 -0
  315. easy_rec/python/utils/expr_util.py +118 -0
  316. easy_rec/python/utils/fg_util.py +53 -0
  317. easy_rec/python/utils/hit_rate_utils.py +220 -0
  318. easy_rec/python/utils/hive_utils.py +183 -0
  319. easy_rec/python/utils/hpo_util.py +137 -0
  320. easy_rec/python/utils/hvd_utils.py +56 -0
  321. easy_rec/python/utils/input_utils.py +108 -0
  322. easy_rec/python/utils/io_util.py +282 -0
  323. easy_rec/python/utils/load_class.py +249 -0
  324. easy_rec/python/utils/meta_graph_editor.py +941 -0
  325. easy_rec/python/utils/multi_optimizer.py +62 -0
  326. easy_rec/python/utils/numpy_utils.py +18 -0
  327. easy_rec/python/utils/odps_util.py +79 -0
  328. easy_rec/python/utils/pai_util.py +86 -0
  329. easy_rec/python/utils/proto_util.py +90 -0
  330. easy_rec/python/utils/restore_filter.py +89 -0
  331. easy_rec/python/utils/shape_utils.py +432 -0
  332. easy_rec/python/utils/static_shape.py +71 -0
  333. easy_rec/python/utils/test_utils.py +866 -0
  334. easy_rec/python/utils/tf_utils.py +56 -0
  335. easy_rec/version.py +4 -0
  336. test/__init__.py +0 -0
@@ -0,0 +1,866 @@
1
+ # -*- encoding:utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ """Contains functions which are convenient for unit testing.
4
+
5
+ isort:skip_file
6
+ """
7
+ from future import standard_library
8
+ standard_library.install_aliases()
9
+ import yaml
10
+ import glob
11
+ import json
12
+ import logging
13
+ import os
14
+ import random
15
+ import shutil
16
+ import string
17
+ import subprocess
18
+ import time
19
+ import six
20
+ from multiprocessing import Process
21
+ from subprocess import getstatusoutput
22
+ from tensorflow.python.platform import gfile
23
+ import numpy as np
24
+ from easy_rec.python.protos.train_pb2 import DistributionStrategy
25
+ from easy_rec.python.utils import config_util
26
+ from easy_rec.python.protos.pipeline_pb2 import EasyRecConfig
27
+ from easy_rec.python.utils.io_util import read_data_from_json_path
28
+ from easy_rec.python.utils import constant
29
+
30
# Root directory under which per-test scratch directories are created.
# get_tmp_dir() rebinds it when the TEST_DIR environment variable is set.
TEST_DIR = './tmp/easy_rec_test'

# Timeout in seconds for spawned test commands. Parallel runs of tests could
# take more time, so it can be overridden through the TEST_TIME_OUT variable.
TEST_TIME_OUT = int(os.getenv('TEST_TIME_OUT', 1800))
34
+
35
+
36
def get_hdfs_tmp_dir(test_dir):
  """Create a randomly named directory in HDFS.

  Args:
    test_dir: parent directory path, must be a str.

  Returns:
    path of the directory created through gfile.MkDir: test_dir joined with
    an 8 character random name made of ascii letters and digits.
  """
  alphabet = string.ascii_letters + string.digits
  name_chars = []
  for _ in range(8):
    name_chars.append(random.choice(alphabet))
  assert isinstance(test_dir, str)
  rand_dir = os.path.join(test_dir, ''.join(name_chars))
  gfile.MkDir(rand_dir)
  return rand_dir
44
+
45
+
46
def proc_wait(proc, timeout=1200):
  """Block until proc has exited, terminating it when timeout is exceeded.

  Args:
    proc: a started process object exposing poll(), terminate() and pid,
      such as the subprocess.Popen object returned by run_cmd.
    timeout: number of seconds to wait before proc.terminate() is called.
  """
  start_ts = time.time()
  while True:
    if proc.poll() is not None:
      break
    if time.time() - start_ts >= timeout:
      break
    time.sleep(1)

  still_running = proc.poll() is None
  if still_running:
    logging.warning('proc[pid=%d] timeout[%d], will kill the proc' %
                    (proc.pid, timeout))
    proc.terminate()

  # keep polling until the exit code becomes available
  while proc.poll() is None:
    time.sleep(1)
56
+
57
+
58
def get_tmp_dir():
  """Create and return a fresh, randomly named directory under TEST_DIR.

  When the TEST_DIR environment variable is set to a non-empty value, it
  replaces the module-level TEST_DIR before the directory is created.

  Returns:
    path of the newly created directory; its name is 12 random characters
    taken from ascii letters and digits.

  Raises:
    RuntimeError: if every attempted name already existed.
    OSError: if the directory could not be created for any other reason.
  """
  import errno  # local import: only needed to recognise EEXIST below

  global TEST_DIR
  env_test_dir = os.environ.get('TEST_DIR', '')
  if env_test_dir != '':
    TEST_DIR = env_test_dir

  max_retry = 5
  alphabet = string.ascii_letters + string.digits
  for _attempt in range(max_retry):
    tmp_name = ''.join([random.choice(alphabet) for _idx in range(12)])
    dir_name = os.path.join(TEST_DIR, tmp_name)
    try:
      # create first and inspect the error afterwards, so that two tests
      # running in parallel can never both claim the same directory
      os.makedirs(dir_name)
    except OSError as ex:
      if ex.errno != errno.EEXIST:
        raise
      continue
    return dir_name
  # report the configured number of attempts, not a counter that reached 0
  raise RuntimeError('Failed to get_tmp_dir: max_retry=%d' % max_retry)
74
+
75
+
76
def clear_all_tmp_dirs():
  """Recursively delete the test root directory TEST_DIR."""
  root_dir = TEST_DIR
  shutil.rmtree(root_dir)
78
+
79
+
80
def set_gpu_id(gpu_id_str):
  """Set the CUDA_VISIBLE_DEVICES environment variable of this process.

  Args:
    gpu_id_str: value to store in CUDA_VISIBLE_DEVICES; None stores an
      empty string instead.
  """
  visible_devices = '' if gpu_id_str is None else gpu_id_str
  os.environ['CUDA_VISIBLE_DEVICES'] = visible_devices
86
+
87
+
88
def get_available_gpus():
  """Return the ids of the gpus the tests may use, as a list of strings.

  The comma separated TEST_DEVICES environment variable is used when it is
  defined; otherwise the ids are derived from the /dev/nvidia<N> device files.
  """
  device_spec = os.environ.get('TEST_DEVICES')
  if device_spec is not None:
    gpus = device_spec.split(',')
  else:
    gpus = []
    for dev_path in glob.glob('/dev/nvidia[0-9]*'):
      gpus.append(dev_path.replace('/dev/nvidia', ''))
  logging.info('available gpus %s' % gpus)
  return gpus
96
+
97
+
98
def run_cmd(cmd_str, log_file, env=None):
  """Start a shell command, redirecting its stdout and stderr to log_file.

  Args:
    cmd_str: the command line; carriage returns and newlines in it are
      replaced by spaces before it is run.
    log_file: path of the file which receives stdout and stderr.
    env: optional environment passed to subprocess.Popen.

  Returns:
    the subprocess.Popen object of the started command; this function does
    not wait for the command to finish.
  """
  for line_break in ('\r', '\n'):
    cmd_str = cmd_str.replace(line_break, ' ')
  logging.info('RUNCMD: %s > %s 2>&1 ' % (cmd_str, log_file))
  with open(log_file, 'w') as lfile:
    popen_kwargs = dict(
        stdout=lfile, stderr=subprocess.STDOUT, shell=True, env=env)
    proc = subprocess.Popen(cmd_str, **popen_kwargs)
    if six.PY2:
      # for debug purpose
      proc.args = cmd_str
    return proc
109
+
110
+
111
def RunAsSubprocess(f):
  """Function decorator to run a function in a subprocess.

  Use it when a function will start a tf session, because tensorflow gpu
  memory will not be cleared until the process exits.

  Args:
    f: the function to execute in a child process.

  Returns:
    a wrapper which keeps the name and docstring of f, forwards its arguments
    to f in a child process and returns None. The wrapper raises
    AssertionError if the child does not exit with code 0 within 600 seconds.
  """
  import functools  # local import keeps this block self-contained

  @functools.wraps(f)
  def wrapped_f(*args, **kw):
    p = Process(target=f, args=args, kwargs=kw)
    p.start()
    p.join(timeout=600)
    if p.is_alive():
      # the join timed out: stop the child instead of leaving it running,
      # the assertion below then fails on its non-zero exit code
      p.terminate()
      p.join()
    assert p.exitcode == 0, 'subprocess run failed: %s' % f.__name__

  return wrapped_f
125
+
126
+
127
def clean_up(test_dir):
  """Delete a test directory and reset CUDA_VISIBLE_DEVICES to cpu mode.

  Args:
    test_dir: directory to remove recursively; nothing is removed when None.
  """
  has_dir = test_dir is not None
  if has_dir:
    shutil.rmtree(test_dir)

  # reset to cpu mode
  set_gpu_id(None)
133
+
134
+
135
def clean_up_hdfs(test_dir):
  """Delete a test directory through gfile and reset to cpu mode.

  Args:
    test_dir: directory to remove recursively if gfile reports it exists.
  """
  dir_exists = gfile.Exists(test_dir)
  if dir_exists:
    gfile.DeleteRecursively(test_dir)
  # reset to cpu mode
  set_gpu_id(None)
139
+
140
+
141
+ def _replace_data_for_test(data_path):
142
+ """Replace real data with test data."""
143
+ test_data = {}
144
+
145
+ change = False
146
+ releated_datasets = []
147
+ for k, config in test_data.items():
148
+ if k in data_path:
149
+ releated_datasets.append(k)
150
+
151
+ # if there are multiple keyword detected, use the longest one
152
+ if len(releated_datasets) > 0:
153
+ score = [len(k) for k in releated_datasets]
154
+ best_match = np.argmax(score)
155
+ data_path = test_data[releated_datasets[best_match]]
156
+
157
+ change = True
158
+ assert change, 'Failed to replace data with test data'
159
+
160
+ return data_path
161
+
162
+
163
def _load_config_for_test(pipeline_config_path,
                          test_dir,
                          total_steps=50,
                          num_epochs=0):
  """Load a pipeline config file and adjust it for a test run.

  Args:
    pipeline_config_path: path of the pipeline config file to load.
    test_dir: test directory; model_dir is set to its 'train' sub-directory.
    total_steps: value assigned to train_config.num_steps.
    num_epochs: value assigned to data_config.num_epochs.

  Returns:
    the loaded and modified pipeline config.
  """
  pipeline_config = config_util.get_configs_from_pipeline_file(
      pipeline_config_path)

  # change model_dir
  pipeline_config.model_dir = os.path.join(test_dir, 'train')
  logging.info('test_model_dir %s' % pipeline_config.model_dir)

  pipeline_config.train_config.num_steps = total_steps

  data_config = pipeline_config.data_config
  num_eval_examples = max(10, data_config.batch_size)
  pipeline_config.eval_config.num_examples = num_eval_examples
  data_config.num_epochs = num_epochs
  return pipeline_config
180
+
181
+
182
def _load_config_for_distribute_eval(pipeline_config_path, test_dir):
  """Load a pipeline config file and point its model_dir to test_dir.

  Args:
    pipeline_config_path: path of the pipeline config file to load.
    test_dir: directory assigned to model_dir.

  Returns:
    the loaded and modified pipeline config.
  """
  config = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  config.model_dir = test_dir
  logging.info('test_model_dir %s' % config.model_dir)
  return config
188
+
189
+
190
def test_datahub_train_eval(pipeline_config_path,
                            odps_oss_config,
                            test_dir,
                            process_pipeline_func=None,
                            total_steps=50,
                            post_check_func=None):
  """Run easy_rec.python.train_eval on datahub inputs in a child process.

  Args:
    pipeline_config_path: path of a pipeline config file, or an
      EasyRecConfig object which is then used as is.
    odps_oss_config: object providing dh_id, dh_key, dh_endpoint, dh_project
      and dh_topic, which are copied into the datahub train and eval inputs.
    test_dir: directory which receives pipeline.config and log_master.txt.
    process_pipeline_func: optional callable which takes the pipeline config
      and returns the config to use.
    total_steps: number of train steps, only applied when the config is
      loaded from a file.
    post_check_func: optional callable run on the pipeline config after a
      successful train; its result is returned.

  Returns:
    False if the train command exits with a non-zero code, otherwise the
    result of post_check_func if given, otherwise True.
  """
  gpus = get_available_gpus()
  set_gpu_id(gpus[0] if len(gpus) > 0 else None)

  config_given = isinstance(pipeline_config_path, EasyRecConfig)
  if not config_given:
    logging.info('testing pipeline config %s' % pipeline_config_path)
  # drop TF_CONFIG from the environment if it is present
  os.environ.pop('TF_CONFIG', None)

  if config_given:
    pipeline_config = pipeline_config_path
  else:
    pipeline_config = _load_config_for_test(pipeline_config_path, test_dir,
                                            total_steps)

  train_config = pipeline_config.train_config
  train_config.train_distribute = 0
  train_config.num_gpus_per_worker = 1
  train_config.sync_replicas = False

  # the train and the eval input receive identical datahub settings
  for input_name in ('datahub_train_input', 'datahub_eval_input'):
    datahub_input = getattr(pipeline_config, input_name)
    datahub_input.akId = odps_oss_config.dh_id
    datahub_input.akSecret = odps_oss_config.dh_key
    datahub_input.region = odps_oss_config.dh_endpoint
    datahub_input.project = odps_oss_config.dh_project
    datahub_input.topic = odps_oss_config.dh_topic

  if process_pipeline_func is not None:
    assert callable(process_pipeline_func)
    pipeline_config = process_pipeline_func(pipeline_config)
  config_util.save_pipeline_config(pipeline_config, test_dir)
  test_pipeline_config_path = os.path.join(test_dir, 'pipeline.config')

  train_cmd = 'python -m easy_rec.python.train_eval --pipeline_config_path %s' % \
      test_pipeline_config_path
  log_path = '%s/log_%s.txt' % (test_dir, 'master')
  proc = run_cmd(train_cmd, log_path)
  proc_wait(proc, timeout=TEST_TIME_OUT)

  if proc.returncode != 0:
    logging.warning(
        'train %s failed[pid=%d][code=%d][args=%s]' %
        (test_pipeline_config_path, proc.pid, proc.returncode, proc.args))
    return False
  if post_check_func:
    return post_check_func(pipeline_config)
  return True
246
+
247
+
248
def _Load_config_for_test_eval(pipeline_config_path):
  """Load the pipeline config stored at pipeline_config_path, unmodified."""
  return config_util.get_configs_from_pipeline_file(pipeline_config_path)
252
+
253
+
254
def test_single_train_eval(pipeline_config_path,
                           test_dir,
                           process_pipeline_func=None,
                           hyperparam_str='',
                           total_steps=50,
                           post_check_func=None,
                           check_mode=False,
                           fine_tune_checkpoint=None,
                           extra_cmd_args=None,
                           timeout=-1):
  """Run easy_rec.python.train_eval in a child process.

  Args:
    pipeline_config_path: path of a pipeline config file, or an
      EasyRecConfig object which is then used as is.
    test_dir: directory which receives pipeline.config and log_master.txt.
    process_pipeline_func: optional callable which takes the pipeline config
      and returns the config to use.
    hyperparam_str: if not empty, passed single-quoted as --edit_config_json.
    total_steps: number of train steps, only applied when the config is
      loaded from a file.
    post_check_func: optional callable run on the pipeline config after a
      successful train; its result is returned.
    check_mode: if true, --check_mode is appended to the command.
    fine_tune_checkpoint: if set, passed as --fine_tune_checkpoint.
    extra_cmd_args: optional string appended verbatim to the command.
    timeout: seconds to wait for the command; TEST_TIME_OUT when negative.

  Returns:
    False if the train command exits with a non-zero code, otherwise the
    result of post_check_func if given, otherwise True.
  """
  gpus = get_available_gpus()
  set_gpu_id(gpus[0] if len(gpus) > 0 else None)

  config_given = isinstance(pipeline_config_path, EasyRecConfig)
  if not config_given:
    logging.info('testing pipeline config %s' % pipeline_config_path)
  # drop TF_CONFIG from the environment if it is present
  os.environ.pop('TF_CONFIG', None)

  if config_given:
    pipeline_config = pipeline_config_path
  else:
    pipeline_config = _load_config_for_test(pipeline_config_path, test_dir,
                                            total_steps)

  train_config = pipeline_config.train_config
  train_config.train_distribute = 0
  train_config.num_gpus_per_worker = 1
  train_config.sync_replicas = False
  if process_pipeline_func is not None:
    assert callable(process_pipeline_func)
    pipeline_config = process_pipeline_func(pipeline_config)
  config_util.save_pipeline_config(pipeline_config, test_dir)
  test_pipeline_config_path = os.path.join(test_dir, 'pipeline.config')

  # collect the pieces of the command line, they are concatenated as is
  cmd_parts = [
      'python -m easy_rec.python.train_eval --pipeline_config_path=' +
      test_pipeline_config_path
  ]
  if hyperparam_str:
    cmd_parts.append(' --edit_config_json=\'%s\'' % hyperparam_str)
  if fine_tune_checkpoint:
    cmd_parts.append(' --fine_tune_checkpoint %s' % fine_tune_checkpoint)
  if check_mode:
    cmd_parts.append(' --check_mode')
  if extra_cmd_args:
    cmd_parts.append(' ')
    cmd_parts.append(extra_cmd_args)
  train_cmd = ''.join(cmd_parts)

  log_path = '%s/log_%s.txt' % (test_dir, 'master')
  proc = run_cmd(train_cmd, log_path)
  wait_timeout = TEST_TIME_OUT if timeout < 0 else timeout
  proc_wait(proc, timeout=wait_timeout)

  if proc.returncode != 0:
    logging.error('train %s failed' % test_pipeline_config_path)
    return False
  if post_check_func:
    return post_check_func(pipeline_config)
  return True
307
+
308
+
309
def test_single_pre_check(pipeline_config_path, test_dir):
  """Run easy_rec.python.tools.pre_check in a child process.

  Args:
    pipeline_config_path: path of a pipeline config file, or an
      EasyRecConfig object which is then used as is.
    test_dir: directory which receives pipeline.config and log_master.txt.

  Returns:
    True if the command exits with code 0, otherwise False.
  """
  gpus = get_available_gpus()
  set_gpu_id(gpus[0] if len(gpus) > 0 else None)

  config_given = isinstance(pipeline_config_path, EasyRecConfig)
  if not config_given:
    logging.info('testing pipeline config %s' % pipeline_config_path)
  # drop TF_CONFIG from the environment if it is present
  os.environ.pop('TF_CONFIG', None)

  if config_given:
    pipeline_config = pipeline_config_path
  else:
    pipeline_config = _load_config_for_test(pipeline_config_path, test_dir)

  train_config = pipeline_config.train_config
  train_config.train_distribute = 0
  train_config.num_gpus_per_worker = 1
  train_config.sync_replicas = False

  config_util.save_pipeline_config(pipeline_config, test_dir)
  test_pipeline_config_path = os.path.join(test_dir, 'pipeline.config')
  train_cmd = 'python -m easy_rec.python.tools.pre_check --pipeline_config_path %s ' % (
      test_pipeline_config_path)

  log_path = '%s/log_%s.txt' % (test_dir, 'master')
  proc = run_cmd(train_cmd, log_path)
  proc_wait(proc, timeout=TEST_TIME_OUT)

  succeeded = proc.returncode == 0
  if not succeeded:
    logging.error('train %s failed' % test_pipeline_config_path)
  return succeeded
341
+
342
+
343
def test_single_predict(test_dir, input_path, output_path, saved_model_dir):
  """Run easy_rec.python.predict in a child process.

  Args:
    test_dir: directory which receives log_master.txt.
    input_path: value of the --input_path argument.
    output_path: value of the --output_path argument.
    saved_model_dir: value of the --saved_model_dir argument.

  Returns:
    True if the command exits with code 0, otherwise False.
  """
  gpus = get_available_gpus()
  set_gpu_id(gpus[0] if len(gpus) > 0 else None)

  cmd_args = (input_path, output_path, saved_model_dir)
  predict_cmd = 'python -m easy_rec.python.predict --input_path %s --output_path %s --saved_model_dir %s' % cmd_args

  log_path = '%s/log_%s.txt' % (test_dir, 'master')
  proc = run_cmd(predict_cmd, log_path)
  proc_wait(proc, timeout=TEST_TIME_OUT)

  succeeded = proc.returncode == 0
  if not succeeded:
    logging.error('predict failed')
  return succeeded
359
+
360
+
361
def test_feature_selection(pipeline_config):
  """Run easy_rec.python.tools.feature_selection in a child process.

  The tool is run on the pipeline.config found in pipeline_config.model_dir
  and writes to the feature_selection sub-directory of that model_dir.

  Args:
    pipeline_config: pipeline config whose model_dir is used.

  Returns:
    True if the command exits with code 0, otherwise False.
  """
  model_dir = pipeline_config.model_dir
  pipeline_config_path = os.path.join(model_dir, 'pipeline.config')
  output_dir = os.path.join(model_dir, 'feature_selection')
  log_path = os.path.join(model_dir, 'log_feature_selection.txt')

  cmd_template = ('python -m easy_rec.python.tools.feature_selection --config_path %s '
                  '--output_dir %s --topk 5 --visualize true')
  cmd = cmd_template % (pipeline_config_path, output_dir)

  proc = run_cmd(cmd, log_path)
  proc_wait(proc, timeout=TEST_TIME_OUT)

  succeeded = proc.returncode == 0
  if not succeeded:
    logging.error('feature selection %s failed' % pipeline_config_path)
  return succeeded
373
+
374
+
375
def yaml_replace(train_yaml_path,
                 pipline_config_path,
                 test_pipeline_config_path,
                 test_export_dir=None):
  """Rewrite the app.command entry of a yaml file in place.

  Args:
    train_yaml_path: path of the yaml file, which is read and overwritten.
    pipline_config_path: substring of the command to be replaced.
    test_pipeline_config_path: replacement for pipline_config_path.
    test_export_dir: if not None, replaces the '{EXPOERT_DIR}' placeholder
      of the command (the placeholder is spelled this way on purpose, it
      has to match the text inside the yaml file).
  """
  with open(train_yaml_path, 'r', encoding='utf-8') as _file:
    # yaml.load without an explicit Loader raises TypeError on PyYAML >= 6
    # and may construct arbitrary objects on older versions; safe_load
    # parses plain yaml documents identically.
    x = yaml.safe_load(_file)

  _command = x['app']['command'].replace(pipline_config_path,
                                         test_pipeline_config_path)
  if test_export_dir is not None:
    _command = _command.replace('{EXPOERT_DIR}', test_export_dir)
  x['app']['command'] = _command

  with open(train_yaml_path, 'w', encoding='utf-8') as _file:
    yaml.dump(x, _file)
394
+
395
+
396
def test_hdfs_train_eval(pipeline_config_path,
                         train_yaml_path,
                         test_dir,
                         process_pipeline_func=None,
                         hyperparam_str='',
                         total_steps=2000):
  """Submit a training job through el_submit using a test copy of the config.

  The pipeline config is loaded for testing, forced to
  train_distribute=0 / num_gpus_per_worker=1 / sync_replicas=False, saved
  into test_dir, and the yaml at train_yaml_path is rewritten in place to
  reference the saved copy before it is submitted.

  Args:
    pipeline_config_path: path of the original pipeline config.
    train_yaml_path: job yaml passed to `el_submit -yaml`; modified in place.
    test_dir: directory where the test pipeline.config is saved.
    process_pipeline_func: optional callable taking and returning the
      pipeline config, applied before saving.
    hyperparam_str: not used by this function.
    total_steps: forwarded to _load_config_for_test.

  Returns:
    True if el_submit exits with return code 0, False otherwise.
  """
  available_gpus = get_available_gpus()
  set_gpu_id(available_gpus[0] if len(available_gpus) > 0 else None)
  logging.info('testing pipeline config %s' % pipeline_config_path)
  logging.info('train_yaml_path %s' % train_yaml_path)
  # remove any TF_CONFIG left in the environment
  os.environ.pop('TF_CONFIG', None)

  pipeline_config = _load_config_for_test(pipeline_config_path, test_dir,
                                          total_steps)
  logging.info('model_dir in pipeline_config has been modified')
  train_config = pipeline_config.train_config
  train_config.train_distribute = 0
  train_config.num_gpus_per_worker = 1
  train_config.sync_replicas = False
  if process_pipeline_func is not None:
    assert callable(process_pipeline_func)
    pipeline_config = process_pipeline_func(pipeline_config)
  config_util.save_pipeline_config(pipeline_config, test_dir)

  test_pipeline_config_path = os.path.join(test_dir, 'pipeline.config')
  yaml_replace(train_yaml_path, pipeline_config_path,
               test_pipeline_config_path)
  logging.info('test_pipeline_config_path is %s' % test_pipeline_config_path)

  submit_argv = ('el_submit -yaml %s' % train_yaml_path).split()
  proc = subprocess.Popen(submit_argv, stderr=subprocess.STDOUT)
  proc_wait(proc, timeout=TEST_TIME_OUT)
  succeeded = proc.returncode == 0
  if not succeeded:
    logging.error('train %s failed' % test_pipeline_config_path)
    logging.error('train_yaml %s failed' % train_yaml_path)
  return succeeded
432
+
433
+
434
def test_hdfs_eval(pipeline_config_path,
                   eval_yaml_path,
                   test_dir,
                   process_pipeline_func=None,
                   hyperparam_str=''):
  """Submit an evaluation job through el_submit using a test copy of the config.

  Args:
    pipeline_config_path: path of the original pipeline config.
    eval_yaml_path: job yaml passed to `el_submit -yaml`; rewritten in place
      to reference the config saved under test_dir.
    test_dir: directory where the test pipeline.config is saved.
    process_pipeline_func: optional callable taking and returning the
      pipeline config, applied before saving.
    hyperparam_str: not used by this function.

  Returns:
    True if el_submit exits with return code 0, False otherwise.
  """
  available_gpus = get_available_gpus()
  set_gpu_id(available_gpus[0] if len(available_gpus) > 0 else None)
  logging.info('testing export pipeline config %s' % pipeline_config_path)
  logging.info('eval_yaml_path %s' % eval_yaml_path)
  # remove any TF_CONFIG left in the environment
  os.environ.pop('TF_CONFIG', None)

  pipeline_config = _Load_config_for_test_eval(pipeline_config_path)
  if process_pipeline_func is not None:
    assert callable(process_pipeline_func)
    pipeline_config = process_pipeline_func(pipeline_config)
  config_util.save_pipeline_config(pipeline_config, test_dir)

  test_pipeline_config_path = os.path.join(test_dir, 'pipeline.config')
  yaml_replace(eval_yaml_path, pipeline_config_path,
               test_pipeline_config_path)
  logging.info('test_pipeline_config_path is %s' % test_pipeline_config_path)

  submit_argv = ('el_submit -yaml %s' % eval_yaml_path).split()
  proc = subprocess.Popen(submit_argv, stderr=subprocess.STDOUT)
  proc_wait(proc, timeout=TEST_TIME_OUT)
  succeeded = proc.returncode == 0
  if not succeeded:
    logging.error('eval %s failed' % test_pipeline_config_path)
    logging.error('eval_yaml %s failed' % eval_yaml_path)
  return succeeded
464
+
465
+
466
def test_hdfs_export(pipeline_config_path,
                     export_yaml_path,
                     test_dir,
                     process_pipeline_func=None,
                     hyperparam_str=''):
  """Submit an export job through el_submit using a test copy of the config.

  The yaml is rewritten in place so that it references the config saved
  under test_dir and exports to <test_dir>/export_dir.

  Args:
    pipeline_config_path: path of the original pipeline config.
    export_yaml_path: job yaml passed to `el_submit -yaml`; modified in place.
    test_dir: directory for the test pipeline.config and the export dir.
    process_pipeline_func: optional callable taking and returning the
      pipeline config, applied before saving.
    hyperparam_str: not used by this function.

  Returns:
    True if el_submit exits with return code 0, False otherwise.
  """
  available_gpus = get_available_gpus()
  set_gpu_id(available_gpus[0] if len(available_gpus) > 0 else None)
  logging.info('testing export pipeline config %s' % pipeline_config_path)
  logging.info('export_yaml_path %s' % export_yaml_path)
  # remove any TF_CONFIG left in the environment
  os.environ.pop('TF_CONFIG', None)

  pipeline_config = _Load_config_for_test_eval(pipeline_config_path)
  if process_pipeline_func is not None:
    assert callable(process_pipeline_func)
    pipeline_config = process_pipeline_func(pipeline_config)
  config_util.save_pipeline_config(pipeline_config, test_dir)

  test_pipeline_config_path = os.path.join(test_dir, 'pipeline.config')
  test_export_path = os.path.join(test_dir, 'export_dir')
  yaml_replace(export_yaml_path, pipeline_config_path,
               test_pipeline_config_path, test_export_path)
  logging.info('test_pipeline_config_path is %s' % test_pipeline_config_path)

  submit_argv = ('el_submit -yaml %s' % export_yaml_path).split()
  proc = subprocess.Popen(submit_argv, stderr=subprocess.STDOUT)
  proc_wait(proc, timeout=TEST_TIME_OUT)
  succeeded = proc.returncode == 0
  if not succeeded:
    logging.error('export %s failed' % test_pipeline_config_path)
    logging.error('export_yaml %s failed' % export_yaml_path)
  return succeeded
498
+
499
+
500
def _ports_in_use(ports):
  """Check with netstat whether any of the given ports has a TCP listener.

  Only listeners bound to 0.0.0.0 or 127.0.0.1 are looked for.

  Args:
    ports: iterable of integer port numbers.

  Returns:
    True if `netstat -tlnp` lists a listener on at least one of the ports;
    False otherwise (including when ports is empty or the command fails).
  """
  # Dots are escaped so they only match a literal '.', and the port must be
  # followed by whitespace so that e.g. 1000 does not match 10000.
  addr_patterns = [
      r'0\.0\.0\.0:%d[[:space:]]|127\.0\.0\.1:%d[[:space:]]' % (port, port)
      for port in ports
  ]
  if not addr_patterns:
    return False
  ports_str = '|'.join(addr_patterns)
  # The pattern has to be quoted: left bare, every '|' is parsed by the shell
  # as a pipe, the pipeline ends in a non-existent command and the exit
  # status is never 0, i.e. busy ports would never be detected.
  stat, _ = getstatusoutput("netstat -tlnp | grep -E '%s'" % ports_str)
  return stat == 0
508
+
509
+
510
def get_ports_base(num_worker):
  """Pick num_worker distinct random ports that are not currently in use.

  Ports are drawn from [PORT_BASE, PORT_BASE + 5000), where PORT_BASE is read
  from the environment and defaults to 10000. Up to 10 draws are attempted.

  Args:
    num_worker: number of ports to return.

  Returns:
    numpy integer array of num_worker distinct port numbers.

  Raises:
    RuntimeError: if every attempt produced ports that are in use.
  """
  port_base = int(os.environ.get('PORT_BASE', 10000))
  num_try = 10
  candidates = np.arange(port_base, port_base + 5000)
  for _ in range(num_try):
    # replace=False: the same port must not be handed to two tasks, which
    # sampling with replacement (np.random.randint) could occasionally do.
    ports = np.random.choice(candidates, size=num_worker, replace=False)
    if not _ports_in_use(ports):
      return ports
    logging.info('ports %s in use, retry...' % ports)
  # Fail loudly instead of returning None, which callers would index into.
  raise RuntimeError(
      'failed to find %d free ports in [%d, %d) after %d tries' %
      (num_worker, port_base, port_base + 5000, num_try))
518
+
519
+
520
def _get_ports(num_worker):
  """Return num_worker ports, preferring the `ports` environment variable.

  Args:
    num_worker: number of ports needed.

  Returns:
    The first num_worker entries of the comma separated `ports` environment
    variable as a list of ints when it is set, otherwise the result of
    get_ports_base(num_worker).
  """
  # port queue to deal with port conflicts when multiple
  # test cases run in parallel
  if 'ports' not in os.environ:
    return get_ports_base(num_worker)

  ports = os.environ['ports']
  port_arr = list(map(int, ports.split(',')))
  assert len(port_arr) >= num_worker, 'not enough ports: %s, required: %d'\
      % (ports, num_worker)
  return port_arr[:num_worker]
531
+
532
+
533
def _ps_worker_train(pipeline_config_path,
                     test_dir,
                     num_worker,
                     num_evaluator=0,
                     fit_on_eval=False,
                     fit_on_eval_steps=None):
  """Start a local ps/worker train_eval cluster, one subprocess per task.

  Tasks are launched in this order: master (or chief when num_evaluator is
  non-zero), ps, num_worker - 1 workers, and one evaluator when
  num_evaluator > 0. TF_CONFIG is rewritten in os.environ before each launch.

  Args:
    pipeline_config_path: config passed to --pipeline_config_path.
    test_dir: directory receiving one log_<task>.txt per task.
    num_worker: number of master/chief plus worker tasks.
    num_evaluator: an evaluator task is started when this is > 0.
    fit_on_eval: append --fit_on_eval to the command.
    fit_on_eval_steps: with fit_on_eval, value for --fit_on_eval_steps.

  Returns:
    dict mapping task name ('master'/'chief', 'ps', 'worker_<i>',
    'evaluator') to the object returned by run_cmd.
  """
  gpus = get_available_gpus()
  # not enough gpus, run on cpu only
  if len(gpus) < num_worker:
    gpus = [None] * num_worker
  # one port per master/worker task plus the last one for the ps
  ports = _get_ports(num_worker + 1)
  chief_or_master = 'chief' if num_evaluator != 0 else 'master'
  tf_config = {
      'cluster': {
          chief_or_master: ['localhost:%d' % ports[0]],
          'worker': ['localhost:%d' % ports[i] for i in range(1, num_worker)],
          'ps': ['localhost:%d' % ports[-1]]
      }
  }

  train_cmd = 'python -m easy_rec.python.train_eval --pipeline_config_path %s' % pipeline_config_path
  if fit_on_eval:
    train_cmd += ' --fit_on_eval'
    if fit_on_eval_steps is not None:
      train_cmd += ' --fit_on_eval_steps ' + str(int(fit_on_eval_steps))

  # (task type, task index, position in gpus or None for '', task name)
  task_specs = [(chief_or_master, 0, 0, chief_or_master),
                ('ps', 0, None, 'ps')]
  task_specs.extend(('worker', idx, idx + 1, 'worker_%d' % idx)
                    for idx in range(num_worker - 1))
  if num_evaluator > 0:
    task_specs.append(('evaluator', 0, None, 'evaluator'))

  procs = {}
  for task_type, task_index, gpu_slot, task_name in task_specs:
    tf_config['task'] = {'type': task_type, 'index': task_index}
    os.environ['TF_CONFIG'] = json.dumps(tf_config)
    # ps and evaluator are launched with an empty gpu id
    set_gpu_id('' if gpu_slot is None else gpus[gpu_slot])
    procs[task_name] = run_cmd(train_cmd,
                               '%s/log_%s.txt' % (test_dir, task_name))

  return procs
582
+
583
+
584
def _ps_worker_distribute_eval(pipeline_config_path,
                               checkpoint_path,
                               test_dir,
                               num_worker,
                               num_evaluator=0):
  """Start a local ps/worker distributed evaluation, one subprocess per task.

  Tasks are launched in this order: master (or chief when num_evaluator is
  non-zero), ps, num_worker - 1 workers, and one evaluator when
  num_evaluator > 0. TF_CONFIG is rewritten in os.environ before each launch
  and constant.SORT_COL_BY_NAME is set to '1' in the environment.

  Args:
    pipeline_config_path: config passed to --pipeline_config_path.
    checkpoint_path: value passed to --checkpoint_path.
    test_dir: directory receiving one distribute_eval_log_<task>.txt per task.
    num_worker: number of master/chief plus worker tasks.
    num_evaluator: an evaluator task is started when this is > 0.

  Returns:
    dict mapping task name ('master'/'chief', 'ps', 'worker_<i>',
    'evaluator') to the object returned by run_cmd.
  """
  gpus = get_available_gpus()
  # not enough gpus, run on cpu only
  if len(gpus) < num_worker:
    gpus = [None] * num_worker
  # one port per master/worker task plus the last one for the ps
  ports = _get_ports(num_worker + 1)
  chief_or_master = 'chief' if num_evaluator != 0 else 'master'
  tf_config = {
      'cluster': {
          chief_or_master: ['localhost:%d' % ports[0]],
          'worker': ['localhost:%d' % ports[i] for i in range(1, num_worker)],
          'ps': ['localhost:%d' % ports[-1]]
      }
  }

  eval_cmd_template = (
      'python -m easy_rec.python.eval --pipeline_config_path {} --checkpoint_path {} '
      '--distribute_eval True --eval_result_path distribute_eval_result.txt')
  train_cmd = eval_cmd_template.format(pipeline_config_path, checkpoint_path)
  os.environ[constant.SORT_COL_BY_NAME] = '1'

  # (task type, task index, position in gpus or None for '', task name)
  task_specs = [(chief_or_master, 0, 0, chief_or_master),
                ('ps', 0, None, 'ps')]
  task_specs.extend(('worker', idx, idx + 1, 'worker_%d' % idx)
                    for idx in range(num_worker - 1))
  if num_evaluator > 0:
    task_specs.append(('evaluator', 0, None, 'evaluator'))

  procs = {}
  for task_type, task_index, gpu_slot, task_name in task_specs:
    tf_config['task'] = {'type': task_type, 'index': task_index}
    os.environ['TF_CONFIG'] = json.dumps(tf_config)
    # ps and evaluator are launched with an empty gpu id
    set_gpu_id('' if gpu_slot is None else gpus[gpu_slot])
    procs[task_name] = run_cmd(
        train_cmd, '%s/distribute_eval_log_%s.txt' % (test_dir, task_name))

  return procs
632
+
633
+
634
def _multi_worker_mirror_train(pipeline_config_path, test_dir, num_worker):
  """Start num_worker local train_eval worker subprocesses.

  The cluster spec contains only 'worker' entries; TF_CONFIG is rewritten in
  os.environ with the matching task index before each worker is launched.

  Args:
    pipeline_config_path: config passed to --pipeline_config_path.
    test_dir: directory receiving one log_worker_<i>.txt per worker.
    num_worker: number of worker processes to start.

  Returns:
    dict mapping 'worker_<i>' to the object returned by run_cmd.
  """
  gpus = get_available_gpus()
  # not enough gpus, run on cpu only
  if len(gpus) < num_worker:
    gpus = [None] * num_worker
  ports = _get_ports(num_worker)
  worker_addrs = ['localhost:%d' % ports[i] for i in range(num_worker)]
  tf_config = {'cluster': {'worker': worker_addrs}}
  train_cmd = 'python -m easy_rec.python.train_eval --pipeline_config_path %s' % pipeline_config_path

  procs = {}
  for worker_id in range(num_worker):
    tf_config['task'] = {'type': 'worker', 'index': worker_id}
    os.environ['TF_CONFIG'] = json.dumps(tf_config)
    set_gpu_id(gpus[worker_id])
    worker_name = 'worker_%d' % worker_id
    log_path = '%s/log_%s.txt' % (test_dir, worker_name)
    procs[worker_name] = run_cmd(train_cmd, log_path)
  return procs
655
+
656
+
657
def _multi_worker_hvd_train(pipeline_config_path, test_dir, num_worker):
  """Run train_eval under horovodrun and wait for it to finish.

  Args:
    pipeline_config_path: config passed to --pipeline_config_path.
    test_dir: directory receiving log_hvd.txt.
    num_worker: value for `horovodrun -np`; also the number of
      localhost:<port> entries passed to --hosts.

  Returns:
    True if horovodrun exits with return code 0, False otherwise.
  """
  available_gpus = get_available_gpus()
  # not enough gpus, run on cpu only
  visible_gpus = ','.join(available_gpus) if len(
      available_gpus) >= num_worker else ''
  set_gpu_id(visible_gpus)

  ports = _get_ports(num_worker)
  host_entries = ['localhost:%d' % ports[i] for i in range(num_worker)]
  train_cmd = 'horovodrun -np %d --hosts %s python -m easy_rec.python.train_eval --pipeline_config_path %s' % (
      num_worker, ','.join(host_entries), pipeline_config_path)

  proc = run_cmd(train_cmd, '%s/log_hvd.txt' % test_dir)
  proc_wait(proc, timeout=1200)
  return proc.returncode == 0
672
+
673
+
674
def test_distributed_train_eval(pipeline_config_path,
                                test_dir,
                                total_steps=50,
                                num_evaluator=0,
                                edit_config_json=None,
                                use_hvd=False,
                                fit_on_eval=False,
                                num_epoch=0):
  """Run a local multi-process train_eval and report whether it succeeded.

  The config is loaded for testing, optionally edited, saved to
  <test_dir>/pipeline.config, and then trained with horovod (use_hvd), a
  ps/worker cluster (NoStrategy) or multi-worker mirrored workers, each with
  2 workers.

  Args:
    pipeline_config_path: path of the original pipeline config.
    test_dir: directory for the saved config and the task logs.
    total_steps: forwarded to _load_config_for_test; half of it is used as
      fit_on_eval_steps for the ps/worker cluster.
    num_evaluator: forwarded to _ps_worker_train.
    edit_config_json: optional edits applied with config_util.edit_config.
    use_hvd: train through _multi_worker_hvd_train and return its result.
    fit_on_eval: forwarded to _ps_worker_train.
    num_epoch: forwarded to _load_config_for_test.

  Returns:
    With use_hvd, the result of _multi_worker_hvd_train; otherwise True if no
    task exited with a non-zero return code.

  Raises:
    NotImplementedError: if train_distribute is neither NoStrategy nor
      MultiWorkerMirroredStrategy (and use_hvd is False).
  """
  logging.info('testing pipeline config %s' % pipeline_config_path)
  pipeline_config = _load_config_for_test(pipeline_config_path, test_dir,
                                          total_steps, num_epoch)
  if edit_config_json is not None:
    config_util.edit_config(pipeline_config, edit_config_json)

  if use_hvd:
    pipeline_config.train_config.sync_replicas = False
    # keep EmbeddingParallel / Sok if configured, else switch to Horovod
    if pipeline_config.train_config.train_distribute not in [
        DistributionStrategy.EmbeddingParallelStrategy,
        DistributionStrategy.SokStrategy
    ]:
      pipeline_config.train_config.train_distribute =\
          DistributionStrategy.HorovodStrategy

  train_config = pipeline_config.train_config
  config_util.save_pipeline_config(pipeline_config, test_dir)
  test_pipeline_config_path = os.path.join(test_dir, 'pipeline.config')

  # name of the last task seen with a non-zero return code, if any
  task_failed = None
  procs = None
  try:
    if use_hvd:
      # horovodrun is waited for inside the helper; no polling needed
      return _multi_worker_hvd_train(test_pipeline_config_path, test_dir, 2)
    if train_config.train_distribute == DistributionStrategy.NoStrategy:
      num_worker = 2
      procs = _ps_worker_train(
          test_pipeline_config_path,
          test_dir,
          num_worker,
          num_evaluator,
          fit_on_eval,
          fit_on_eval_steps=int(total_steps // 2))
    elif train_config.train_distribute == DistributionStrategy.MultiWorkerMirroredStrategy:
      num_worker = 2
      procs = _multi_worker_mirror_train(test_pipeline_config_path, test_dir,
                                         num_worker)
    else:
      raise NotImplementedError

    # print proc info
    assert len(procs) > 0, 'processes are empty'
    for k, proc in procs.items():
      logging.info('%s pid: %d' % (k, proc.pid))
    # number of non-ps tasks that have exited or been terminated
    task_finish_cnt = 0
    task_has_finished = {k: False for k in procs.keys()}
    # NOTE(review): the nesting of this polling loop was reconstructed from a
    # view without indentation - confirm against the packaged file.
    while True:
      for k, proc in procs.items():
        if proc.poll() is None:
          # still running: stop it once some other task has failed
          if task_failed is not None:
            logging.error('task %s failed, %s quit' % (task_failed, k))
            proc.terminate()
            # the ps is never counted as a finished task
            if k != 'ps':
              task_has_finished[k] = True
              task_finish_cnt += 1
            logging.info('task_finish_cnt %d' % task_finish_cnt)
        else:
          if not task_has_finished[k]:
            # process quit by itself
            if k != 'ps':
              task_finish_cnt += 1
              task_has_finished[k] = True
            logging.info('task_finish_cnt %d' % task_finish_cnt)
            if proc.returncode != 0:
              logging.error('%s failed' % k)
              task_failed = k
            else:
              logging.info('%s run successfuly' % k)

      # stop polling once num_worker non-ps tasks are done
      if task_finish_cnt >= num_worker:
        break
      time.sleep(1)

  except Exception as e:
    logging.error('Exception: ' + str(e))
    raise e
  finally:
    # whatever happened, do not leave any task process running
    if procs is not None:
      for k, proc in procs.items():
        if proc.poll() is None:
          logging.info('terminate %s' % k)
          proc.terminate()
    if task_failed is not None:
      logging.error('train %s failed' % pipeline_config_path)

  return task_failed is None
768
+
769
+
770
def test_distribute_eval_test(cur_eval_path, test_dir):
  """Compare single-worker evaluation metrics with the distributed ones.

  Args:
    cur_eval_path: directory containing eval_result.txt (single worker).
    test_dir: directory containing distribute_eval_result.txt.

  Returns:
    False if the distributed result file does not exist or if any
    single-worker metric whose name contains neither 'loss' nor 'step'
    differs from the distributed value by more than 1e-5; True otherwise.
  """
  distribute_eval_path = os.path.join(test_dir, 'distribute_eval_result.txt')
  if not os.path.exists(distribute_eval_path):
    return False
  single_work_eval_path = os.path.join(cur_eval_path, 'eval_result.txt')
  single_data = read_data_from_json_path(single_work_eval_path)
  distribute_data = read_data_from_json_path(distribute_eval_path)

  difference_num = 0.00001
  for metric_name, single_value in single_data.items():
    # loss and step entries are not compared
    if 'loss' in metric_name or 'step' in metric_name:
      continue
    gap = abs(single_value - distribute_data[metric_name])
    if gap > difference_num:
      logging.error(
          'distribute_eval difference[%.8f] large than threshold[%.8f]' %
          (gap, difference_num))
      return False
  return True
793
+
794
+
795
def test_distributed_eval(pipeline_config_path,
                          checkpoint_path,
                          test_dir,
                          total_steps=50,
                          num_evaluator=0):
  """Run a local ps/worker distributed evaluation and check its metrics.

  The config is loaded with _load_config_for_distribute_eval, saved to
  <test_dir>/pipeline.config and evaluated with 2 workers; afterwards the
  distributed metrics are compared with the single-worker ones via
  test_distribute_eval_test.

  Args:
    pipeline_config_path: path of the original pipeline config.
    checkpoint_path: passed to --checkpoint_path and also used as the
      directory holding the single-worker eval_result.txt.
    test_dir: directory for the saved config, logs and distributed result.
    total_steps: not used by this function.
    num_evaluator: forwarded to _ps_worker_distribute_eval.

  Returns:
    True if no task exited with a non-zero return code and the metrics
    comparison succeeded, False otherwise.

  Raises:
    NotImplementedError: if train_distribute is not NoStrategy.
  """
  logging.info('testing pipeline config %s' % pipeline_config_path)
  pipeline_config = _load_config_for_distribute_eval(pipeline_config_path,
                                                     test_dir)
  train_config = pipeline_config.train_config
  config_util.save_pipeline_config(pipeline_config, test_dir)
  test_pipeline_config_path = os.path.join(test_dir, 'pipeline.config')

  # name of the last task seen with a non-zero return code, if any
  task_failed = None
  procs = None
  is_equal = False
  try:
    if train_config.train_distribute == DistributionStrategy.NoStrategy:
      num_worker = 2
      procs = _ps_worker_distribute_eval(test_pipeline_config_path,
                                         checkpoint_path, test_dir, num_worker,
                                         num_evaluator)
    else:
      raise NotImplementedError

    # print proc info
    assert len(procs) > 0, 'processes are empty'
    for k, proc in procs.items():
      logging.info('%s pid: %d' % (k, proc.pid))
    # number of non-ps tasks that have exited or been terminated
    task_finish_cnt = 0
    task_has_finished = {k: False for k in procs.keys()}
    # NOTE(review): the nesting of this polling loop was reconstructed from a
    # view without indentation - confirm against the packaged file.
    while True:
      for k, proc in procs.items():
        if proc.poll() is None:
          # still running: stop it once some other task has failed
          if task_failed is not None:
            logging.error('task %s failed, %s quit' % (task_failed, k))
            proc.terminate()
            # the ps is never counted as a finished task
            if k != 'ps':
              task_has_finished[k] = True
              task_finish_cnt += 1
            logging.info('task_finish_cnt %d' % task_finish_cnt)
        else:
          if not task_has_finished[k]:
            # process quit by itself
            if k != 'ps':
              task_finish_cnt += 1
              task_has_finished[k] = True
            logging.info('task_finish_cnt %d' % task_finish_cnt)
            if proc.returncode != 0:
              logging.error('%s failed' % k)
              task_failed = k
            else:
              logging.info('%s run successfuly' % k)
      # stop polling once num_worker non-ps tasks are done
      if task_finish_cnt >= num_worker:
        break
      time.sleep(1)

    # compare the distributed metrics with the single-worker result
    is_equal = test_distribute_eval_test(checkpoint_path, test_dir)

  except Exception as e:
    logging.error('Exception: ' + str(e))
    raise e
  finally:
    # whatever happened, do not leave any task process running
    if procs is not None:
      for k, proc in procs.items():
        if proc.poll() is None:
          logging.info('terminate %s' % k)
          proc.terminate()
    if task_failed is not None:
      logging.error('eval %s failed[%s]' % (pipeline_config_path, task_failed))

  eval_success = (task_failed is None) and is_equal
  return eval_success